In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Load the CKD data, shorten the glucose column label to 'Glucose',
# and keep only the four columns used later in this notebook.
ckd = (
    Table.read_table('ckd.csv')
    .relabeled('Blood Glucose Random', 'Glucose')
    .select('Glucose', 'Hemoglobin', 'White Blood Cell Count', 'Class')
)

In [None]:
# Load the breast cancer data without its 'ID' column.
patients = Table.read_table('breast-cancer.csv').drop('ID')

def randomize_column(a):
    """Return `a` with a small random offset added to every entry.

    One offset per entry is drawn from a normal distribution with
    mean 0.0 and standard deviation 0.09.
    """
    jitter = np.random.normal(0.0, 0.09, size=len(a))
    return a + jitter

# Table holding jittered copies of the two plotted attributes together with the
# unchanged 'Class' column. Label/value pairs are passed as separate arguments,
# matching the other with_columns calls in this notebook.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jittered)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

# Google Science Fair

In 2012, Brittany Wenger, then a 17-year-old high school student,
 won the Google Science Fair by building a breast cancer classifier with 99% accuracy. 
 
After imaging, technicians went through the images and measured certain attributes that helped determine whether the patient had breast cancer. 

Class of 0: Does not have cancer <br>
Class of 1: Does have cancer

In [None]:
# Load the patient measurements used to determine whether each patient had breast cancer,
# dropping the 'ID' column, and preview the first 5 rows.

patients = Table.read_table('breast-cancer.csv').drop('ID')
patients.show(5)

In [None]:
# Generate a small table that shows how many patients have cancer and how many do not.
# Do not reassign. 

patients.group('Class')

In [None]:
# Generate a scatter plot comparing Bland Chromatin and Single Epithelial Cell Size, grouped by Class.

patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', group='Class')

Does our graph look like it represents all 680+ patients?

Why do you think that is?

In [None]:
# Scatter the jittered columns (the original values plus random 'noise') to reveal hidden values.

jittered.scatter(0, 1, group='Class')

## Distance ##
Use the distance formula from Pythagoras to determine the distance between two points. 


In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    pt1, pt2 : array-like of numbers
        Coordinates of the two points, given as arrays (or plain sequences)
        of the same shape.

    Returns
    -------
    float
        Square root of the sum of the squared coordinate differences.
    """
    # Convert to float first so that plain sequences are accepted and so that
    # unsigned or narrow integer inputs cannot wrap around when subtracted or squared.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
def row_distance(row1, row2):
    """Compute the distance between two table rows of numeric values.

    Each row is converted to a NumPy array and the pair is passed to `distance`.
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

In [None]:
# Keep every column except 'Class' and preview the first 3 rows.
attributes = patients.drop('Class')
attributes.show(3)

In [None]:
# Distance between the attribute rows at index 0 and index 1.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Distance between the attribute rows at index 0 and index 2.
row_distance(attributes.row(0), attributes.row(2))

In [None]:
# Distance from the row at index 2 to itself (should be 0).
row_distance(attributes.row(2), attributes.row(2))

# The Classifier

Use the distance formula to classify an incoming patient based on how close it is to other, similar points. 

You can specify how many points to consider as neighbors. We are calling that value $k$.

The assigned class will be based on the classification of the majority of the $k$ neighbors. 

We can test our classifier by taking a known record and running it through to see if it is correctly classified. 

In [None]:
def distances(training, example):
    """
    Compute distance between example and every row in training.
    Return training augmented with a 'Distance_to_ex' column.

    training: table with a 'Class' column; every other column is passed
        to row_distance as an attribute.
    example: row of attribute values, expected to line up with the
        attribute columns of training.
    """
    attributes_only = training.drop('Class')

    # Collect all the distances first and build the array once; growing an
    # array with np.append inside the loop copies the whole array on each pass.
    # Iterating over .rows visits the same rows as calling .row(i) for each
    # i in np.arange(attributes_only.num_rows).
    example_distances = np.array(
        [row_distance(row, example) for row in attributes_only.rows],
        dtype=float,
    )

    return training.with_column('Distance_to_ex', example_distances)

In [None]:
# Use the attribute row at index 21 as the example to classify.
example = attributes.row(21)
example

In [None]:
# Distances from every other patient to the example, sorted by distance;
# row 21 is excluded so the example is not compared with itself.
distances(patients.exclude(21), example).sort('Distance_to_ex')

In [None]:
def closest(training, example, k):
    """
    Return the first k rows of training after sorting
    by 'Distance_to_ex', i.e. the k nearest neighbors of example
    """
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance_to_ex')
    return nearest_first.take(np.arange(k))

In [None]:
# The 5 patients nearest to the example (row 21 is left out of the training data).
closest(patients.exclude(21), example, 5)

In [None]:
# Count how many of those 5 nearest neighbors fall in each class, largest count first.
closest(patients.exclude(21), example, 5).group('Class').sort('count', descending=True)

In [None]:
def majority_class(topk):
    """
    Return the 'Class' value with the largest count in topk
    """
    class_counts = topk.group('Class')
    largest_first = class_counts.sort('count', descending=True)
    return largest_first.column(0).item(0)

In [None]:
def classify(training, example, k):
    """
    Predict the class of example: find its k nearest
    neighbors in training and return their majority class
    """
    nearest_neighbors = closest(training, example, k)
    return majority_class(nearest_neighbors)

In [None]:
# Predicted class for the example, based on its 5 nearest neighbors.
classify(patients.exclude(21), example, 5)

In [None]:
# Actual record at index 21, to compare its 'Class' with the prediction.
patients.take(21)

In [None]:
# Repeat with the row at index 10, again leaving that row out of the training data.
new_example = attributes.row(10)
classify(patients.exclude(10), new_example, 5)

In [None]:
# Actual record at index 10, to compare its 'Class' with the prediction.
patients.take(10)

In [None]:
# Repeat with the row at index 15, this time using k = 15 neighbors.
another_example = attributes.row(15)
classify(patients.exclude(15), another_example, 15)

In [None]:
# Actual record at index 15, to compare its 'Class' with the prediction.
patients.take(15)

How many of the above were correctly classified?

Would you want to use this to classify new patients? Why or why not?


## Review of the Steps ##

- `distance(pt1, pt2)`: Returns the distance between the arrays `pt1` and `pt2`
- `row_distance(row1, row2)`: Returns the distance between the rows `row1` and `row2`
- `distances(training, example)`: Returns a table that is `training` with an additional column `'Distance_to_ex'` that contains the distance between `example` and each row of `training`
- `closest(training, example, k)`: Returns a table of the rows corresponding to the k smallest distances 
- `majority_class(topk)`: Returns the majority class in the `'Class'` column
- `classify(training, example, k)`: Returns the predicted class of `example` based on a `k` nearest neighbors classifier using the historical sample `training`

## Accuracy of a Classifier ##

Create a function that will return the proportion of correctly classified examples of a test set. 

Check the classifier with varying values of $k$.

In [None]:
# Number of rows (patients) available to split into training and test sets.
patients.num_rows

In [None]:
# Randomly permute the rows, then use the first 342 for training
# and every remaining row for testing.
shuffled = patients.sample(with_replacement=False)
training_set = shuffled.take(np.arange(342))
# End the test range at the actual row count instead of a hard-coded 683, so no
# row is silently left out and no index past the end of the table is requested.
test_set  = shuffled.take(np.arange(342, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples
    in the test set.

    training: labeled table passed to classify as the training data
    test: labeled table; each row's attributes are classified and the
        prediction is compared with that row's 'Class' value
    k: number of nearest neighbors used by classify
    """
    test_attributes = test.drop('Class')
    # Fetch the label column once instead of on every pass through the loop.
    actual_classes = test.column('Class')
    num_correct = 0
    for i in np.arange(test.num_rows):
        predicted = classify(training, test_attributes.row(i), k)
        # A correct prediction adds 1 (True), an incorrect one adds 0 (False).
        num_correct = num_correct + (predicted == actual_classes.item(i))
    return num_correct / test.num_rows

In [None]:
# Accuracy on test_set with k = 5 neighbors.
evaluate_accuracy(training_set, test_set, 5)

In [None]:
# Accuracy on test_set with k = 3 neighbors.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Accuracy on test_set with k = 11 neighbors.
evaluate_accuracy(training_set, test_set, 11)

In [None]:
# Accuracy on test_set with k = 7 neighbors.
evaluate_accuracy(training_set, test_set, 7)

In [None]:
# Accuracy on test_set with k = 9 neighbors.
evaluate_accuracy(training_set, test_set, 9)


In [None]:
# Accuracy on test_set with k = 1 (only the single nearest neighbor is used).
evaluate_accuracy(training_set, test_set, 1)

Which value of $k$ gave you the most accurate classifier?

If you rerun the cells above, does the same value of $k$ remain the best one?



# Standardize if Necessary

With the breast cancer data the numbers used to measure attributes of the cells are very similar in range. 

With the CKD data some of the measures are very different. <br>
For example, Glucose ranges from 70-140 mg/dL but Hemoglobin ranges from 8 to 11 grams. 

Standardizing puts measurements with very different ranges on a comparable scale. 

In [None]:
# Columns with very different value ranges may skew the classification,
# so standardization expresses each value relative to its column's mean.
ckd.show(3)

# Before Standardizing. 

In [None]:
# Randomly permute the CKD rows, then use the first 79 for training
# and every remaining row for testing.
shuffled = ckd.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(79))
# End the test range at the actual row count instead of a hard-coded 158, so no
# row is silently left out and no index past the end of the table is requested.
test_set  = shuffled.take(np.arange(79, shuffled.num_rows))

In [None]:
# Check the accuracy before standardization (k = 3).

evaluate_accuracy(training_set, test_set, 3)

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of `x`.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as `x`. When every value in `x` is
        identical (standard deviation of 0) an array of zeros is returned
        instead of dividing by zero.
    """
    values = np.asarray(x, dtype=float)
    centered = values - np.average(values)
    spread = np.std(values)
    if spread == 0:
        # A constant column has no spread: every value sits exactly at the
        # mean, so report 0 rather than the NaN that 0 / 0 would produce.
        return np.zeros_like(centered)
    return centered / spread

In [None]:
# Rebuild the CKD table: keep 'Class' and add each of the three attributes
# converted to standard units (the '_su' suffix marks the converted columns).
ckd_new = ckd.select('Class').with_columns(
    'Glucose_su', standard_units(ckd.column('Glucose')),
    'Hemoglobin_su', standard_units(ckd.column('Hemoglobin')),
    'WBC_su', standard_units(ckd.column('White Blood Cell Count'))
)

In [None]:
# Display the table of standardized CKD values.
ckd_new
# After Standardizing

In [None]:
# Randomly permute the standardized CKD rows, then use the first 79 for
# training and every remaining row for testing.
shuffled = ckd_new.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(79))
# End the test range at the actual row count instead of a hard-coded 158, so no
# row is silently left out and no index past the end of the table is requested.
test_set  = shuffled.take(np.arange(79, shuffled.num_rows))

In [None]:
# Check the accuracy after standardization (k = 3).

evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Will standardizing improve the breast cancer classifier?
# Look at one row (index 15) of the unstandardized values first.

patients.take(15)


In [None]:
# Rebuild the patients table: keep 'Class' and add each of the nine attribute
# columns converted to standard units, under shorter labels. The last line
# displays the new table.
patients_new = patients.select('Class').with_columns(
    'Clump_Thickness', standard_units(patients.column('Clump Thickness')),
    'Uniformity1', standard_units(patients.column('Uniformity of Cell Size')),
    'Uniformity2', standard_units(patients.column('Uniformity of Cell Shape')),
    'Marginal', standard_units(patients.column('Marginal Adhesion')),
    'Epithelial', standard_units(patients.column('Single Epithelial Cell Size')),
    'Nuclei', standard_units(patients.column('Bare Nuclei')),
    'Chromatin', standard_units(patients.column('Bland Chromatin')),
    'Nucleoli', standard_units(patients.column('Normal Nucleoli')),
    'Mitoses', standard_units(patients.column('Mitoses'))
)
patients_new

In [None]:
# Randomly permute the standardized patient rows, then use the first 342 for
# training and every remaining row for testing.
shuffled = patients_new.sample(with_replacement=False)
training_set = shuffled.take(np.arange(342))
# End the test range at the actual row count instead of a hard-coded 683, so no
# row is silently left out and no index past the end of the table is requested.
test_set  = shuffled.take(np.arange(342, shuffled.num_rows))

In [None]:
# Accuracy on the standardized breast cancer data with k = 5 neighbors.
evaluate_accuracy(training_set, test_set, 5)

Was the classifier built on standardized data better than your best classifier without standardization?