In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [None]:
# Standardizing data

def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is expressed as its deviation from the mean of `x`,
    divided by the standard deviation of `x` (as computed by np.std).
    """
    centered = x - np.mean(x)
    spread = np.std(x)
    return centered / spread

In [None]:
# Functions needed to create classifications based on the distance to similar data. 

def distance(point1, point2):
    """Euclidean distance between two arrays of numbers.

    Computed as the square root of the summed squared element-wise
    differences of `point1` and `point2`.
    """
    delta = point1 - point2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def all_distances(training, point):
    """The distance from point (an array of numbers) to each row of the training table.

    The 'Class' column is dropped first, so only the remaining attribute
    columns take part in the distance. Returns whatever `Table.apply`
    produces for the per-row distance function (one value per row).
    """
    features = training.drop('Class')
    return features.apply(lambda row: distance(point, np.array(row)))

def table_with_distances(training, point):
    """A copy of the training table with an added 'Distance' column.

    'Distance' holds the distance from each training row to the array `point`,
    as computed by `all_distances`.
    """
    row_distances = all_distances(training, point)
    return training.with_column('Distance', row_distances)

def closest(training, point, k):
    """A table containing the k rows of the training table closest to the array point.

    Rows are ordered by ascending 'Distance'; the first k are kept.
    """
    by_distance = table_with_distances(training, point).sort('Distance')
    return by_distance.take(np.arange(k))

def majority(topkclasses):
    """1 if the "Class" column holds strictly more 1s than 0s, and 0 otherwise.

    A tie between the two counts therefore yields 0.
    """
    def count_of(label):
        # Number of rows whose 'Class' equals the given label.
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    ones = count_of(1)
    zeros = count_of(0)
    return 1 if ones > zeros else 0

def classify(training, p, k):
    """Classify an example with attributes p by k-nearest neighbors.

    Finds the k rows of `training` closest to `p` and returns the majority
    value of their 'Class' column (see `majority`).
    """
    neighbor_classes = closest(training, p, k).select('Class')
    return majority(neighbor_classes)

def show_closest(point):
    """Plot the CKD data and connect a new point to its nearest neighbor.

    point = array([x,y])
    gives the (Hemoglobin, Glucose) coordinates of a new point,
    shown in red.

    Reads the notebook-level table `ckd`, which must contain the columns
    'Hemoglobin', 'Glucose', 'Class' and 'Color'.
    """
    # Select the attributes explicitly, in the same order as `point`, so the
    # distance calculation does not depend on the column order of `ckd`.
    HemoGl = ckd.select('Hemoglobin', 'Glucose', 'Class')
    t = closest(HemoGl, point, 1)
    # Read the neighbor's coordinates by column label rather than by position.
    x_closest = t.column('Hemoglobin').item(0)
    y_closest = t.column('Glucose').item(0)
    ckd.scatter('Hemoglobin', 'Glucose', group='Color')
    plt.scatter(point.item(0), point.item(1), color='red', s=30)
    # Black segment joining the new point to its nearest neighbor.
    plt.plot(make_array(point.item(0), x_closest), make_array(point.item(1), y_closest), color='k', lw=2);

In [None]:
# Functions to plot data in a particular way. 

def plot_all_points(test_grid):
    """Scatter the (unclassified) grid points in red together with the CKD patients.

    `test_grid` must have 'Hemoglobin' and 'Glucose' columns. The patient
    coordinates and colors are read from the notebook-level table `ckd`.
    """
    # Grid points, drawn semi-transparent in red.
    test_grid.scatter('Hemoglobin', 'Glucose', color='red', alpha=0.4, s=30)

    # Patient points, colored by their 'Color' column with a black edge.
    plt.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Color'), edgecolor='k')

    plt.xlim(-2, 2)
    plt.ylim(-2, 2);
    
def classify_grid(training, test, k):
    """Classify every row of `test` with the k-nearest-neighbor classifier.

    Parameters
    ----------
    training : table passed through to `classify` (must include 'Class').
    test : table whose rows are the attribute values to classify.
    k : number of neighbors to consult.

    Returns
    -------
    A float array with one predicted class per row of `test`, in row order
    (an empty float array when `test` has no rows).
    """
    # Build the predictions in a list and convert once, instead of growing an
    # array with np.append on every iteration. Each row is passed as a flat
    # 1-D array of its attribute values.
    predictions = [
        classify(training, np.array(test.row(i)), k)
        for i in range(test.num_rows)
    ]
    return np.array(predictions, dtype=float)

def plot_all_points_classified(test_grid):
    """Scatter the grid points colored by their predicted class, plus the CKD patients.

    Each grid point is classified with its single nearest neighbor (k = 1)
    among the patients in the notebook-level table `ckd`, using every column
    except 'White Blood Cell Count' and 'Color'. Colors come from the
    notebook-level `color_table`.
    """
    training = ckd.drop('White Blood Cell Count', 'Color')
    predicted = classify_grid(training, test_grid, 1)
    colored_grid = test_grid.with_column('Class', predicted).join('Class', color_table)
    colored_grid.scatter('Hemoglobin', 'Glucose', group='Color', alpha=0.4, s=30)

    # Patient points, colored by their 'Color' column with a black edge.
    plt.scatter(
        ckd.column('Hemoglobin'),
        ckd.column('Glucose'),
        c=ckd.column('Color'),
        edgecolor='k',
    )

    plt.xlim(-2, 2)
    plt.ylim(-2, 2)

# Lecture 35 Classification

### Making Predictions
Based on incomplete information.

One way of making predictions: <br>
To predict an outcome for an individual, find others who are like that individual and whose outcomes you know. 
Use those outcomes as the basis of your prediction.

***Two Types of Prediction*** <br>
***Regression = Numeric; Classification = Categorical***


## Classification Examples
Predicting Categorical Data. (non numeric)

### Classifying Patients ###

Chronic Kidney Disease (CKD) Predictions <br>
Class of 1: Patient has CKD<br>
Class of 0: Patient does not have CKD

In [None]:
# Load in patient records from a particular hospital. 
# Mixed data both numeric and categorical. 
# We know the class which ultimately is what we want to predict. 
# Use this data as a model to make predictions about someone new. 

# Read the records and shorten the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
# Preview the first 3 rows.
ckd.show(3)

In [None]:
# Generate a small table that shows how many patients have CKD and how many don't.
# Do not reassign. 



In [None]:
# Generate a scatter comparing the White Blood Cell Count and Glucose grouped by Class.



What are some observations you can make about this comparison?

Could you predict if a new person has CKD? How would you do that?


In [None]:
# Generate a scatter comparing the Hemoglobin and Glucose grouped by Class.



What are some observations you can make about this comparison?

Could you predict if a new person has CKD? How would you do that?

In [None]:
# We want a way to predict the class of someone
# without having to plot and eyeball this graph every time.
#
# One way to do this is to put some thresholds into code

max_glucose_for_0 = ckd.where('Class',are.equal_to(0)).column('Glucose').max()
min_hemoglobin_for_0 = ckd.where('Class',are.equal_to(0)).column('Hemoglobin').min()
min_hemoglobin_for_0, max_glucose_for_0

# What is this code doing?

# Define the values that are generated by this code.

In [None]:
# Function to classify CKD based on hemoglobin and glucose measures.

def classify_manually(hemoglobin, glucose):
    """Classify a patient with fixed thresholds: 1 (CKD) or 0 (no CKD).

    Returns 1 when hemoglobin is below `min_hemoglobin_for_0` or glucose is
    above `max_glucose_for_0` (both notebook-level values); otherwise 0.
    """
    if hemoglobin < min_hemoglobin_for_0:
        return 1
    if glucose > max_glucose_for_0:
        return 1
    return 0

In [None]:
# Let's try our classifier! Test the function for someone who has 100 glucose rating and 15 hemoglobin rating.



In [None]:
# Test the function for someone who has 15 hemoglobin rating and 300 glucose rating



In [None]:
# Test the function for someone who has 15 hemoglobin rating and 300 glucose rating



Look at the coordinate plane for these values. What do you notice about the coordinates in the previous 3 problems?

What do you notice around the coordinates with 12.5 hemoglobin and 110 glucose rating?

What might happen?

## Classifying Banknotes ##

Banknotes are paper currency. <br>

Predict whether a banknote is fraudulent or not. <br>
Class of 0: Not fraudulent banknote<br>
Class of 1: Fraudulent banknote

In [None]:
# Import banknotes data. Shows various measures by a fraud expert 
# and the resulting classification. 

banknotes = Table.read_table('banknote.csv')
# Leave the table as the last expression so the cell displays it.
banknotes

In [None]:
# Generate a small table that shows how many banknotes are fraudulent and how many are not.
# Do not reassign. 



In [None]:
# Generate a scatter comparing the WaveletVar and WaveletCurt grouped by Class.



What are some observations you can make about this comparison?

Could you predict if a new banknote is fraudulent? How would you do that?

In [None]:
# Generate a scatter comparing the WaveletSkew and Entropy grouped by Class.



What are some observations you can make about this comparison?

Could you predict if a new banknote is fraudulent? How would you do that?

### Using More Dimensions to View the Data

Some data in two dimensions leaves out interesting information. 

Increasing the dimensions may reveal additional information that helps with classification. 

In [None]:
# Produces a 3-Dimensional graph.
# What variables are being used for each dimension?
# Identify them as x, y, z.

# Map each class to a plotting color.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach the colors only once, so re-running this cell does not join again.
if 'Color' not in banknotes.labels:
    banknotes = banknotes.join('Class', color_table)

# 3-D axes created through add_subplot(projection='3d').
ax = plt.figure(figsize=(8,8)).add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'), 
           banknotes.column('WaveletVar'), 
           banknotes.column('WaveletCurt'), 
           c=banknotes.column('Color'));

How does this visualization help classify fraudulent banknotes?

Identify the variable for each axis.

See if you can visually identify border values.


## Nearest Neighbor Classifier

Creating a model to classify information based on the "nearest neighbors" in the data. 

Split the sample data into a Training Set and a Test Set. <br>
Use the training set to train the classifier. These are the neighbors. <br>
Use the test set to make sure the classifier is properly assigning the class based on known information.<br>

This avoids bias in the classifier. 

In [None]:
# Convert CKD features into standard units
# NOTE: this rebinds `ckd` to a new table holding only these four columns;
# the values loaded from the CSV are no longer reachable under that name.
ckd = Table().with_columns(
    'Hemoglobin', standard_units(ckd.column('Hemoglobin')),
    'Glucose', standard_units(ckd.column('Glucose')),
    'White Blood Cell Count', standard_units(ckd.column('White Blood Cell Count')),
    'Class', ckd.column('Class')
)

In [None]:
# Recall Class of 0: No CKD, Class of 1: CKD

# Map each class to a plotting color.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
# Attach the colors only once: joining again on a re-run would add a second
# color column, which the helpers that drop 'Color' would then leave in the
# attributes used for distances.
if 'Color' not in ckd.labels:
    ckd = ckd.join('Class', color_table)
ckd

In [None]:
# Generate a scatter that compares Hemoglobin and Glucose grouped by color. 



In [None]:
# New Patient
# In this example, Alice's Hemoglobin is 0 and her Glucose is 1.5
# (both in standard units, like the ckd table she is compared against).

alice = make_array(0, 1.5)
show_closest(alice)

## Decision Boundary

In [None]:
# New Patient
# In this example, Alice's Hemoglobin is 0 and her Glucose is 0.95.

alice = make_array(0, 0.95)
show_closest(alice)

In [None]:
# Create a grid of all points
# Coordinates along one axis; the same values are used for x and y.
grid_axis = np.arange(-2, 2.1, 0.1)
# Pair every x with every y without growing arrays inside a loop: each x is
# repeated for a full sweep of y values (x varies slowest), matching the order
# a nested "for x ... for y ..." loop would produce.
x_array = np.repeat(grid_axis, len(grid_axis))
y_array = np.tile(grid_axis, len(grid_axis))

test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)

In [None]:
# Show all the points of the plane before classification.
# Grid points are drawn in red; patients keep their class color.

plot_all_points(test_grid)

In [None]:
# Show all points on the plane classified.
# Each grid point is classified from its single nearest patient (k = 1).

plot_all_points_classified(test_grid)

This is a simplified classifier. 

Are you able to classify a new patient based on their Hemoglobin and Glucose using this grid?
Why or Why not?