In [1]:
import pandas as pd
import math
from collections import Counter
# import time as t
from tqdm import tqdm 

In [2]:
# Load the dataset from the CSV file; later cells read a 'Status' label column from it.
data = pd.read_csv('Breast_Cancer.csv')

In [3]:
def compute_distances(point1, point2):
    """
        Mixed-type distance between two records.

        Continuous variables (Age, Tumor Size, Regional Node Examined, Regional Node
        Positive, Survival Months) contribute their Euclidean distance.

        Categorical variables (Race, Marital Status, T Stage, N Stage, 6th Stage,
        differentiate, Grade, A Stage, Estrogen Status, Progesterone Status) contribute
        their Hamming distance, i.e. the number of fields whose values differ.

        point1 and point2 only need to support lookup by column name
        (e.g. a DataFrame row or a dict). The two partial distances are added
        together without any scaling and the sum is returned.
    """
    continuous_columns = (
        'Age',
        'Tumor Size',
        'Regional Node Examined',
        'Regional Node Positive',
        'Survival Months',
    )
    categorical_columns = (
        'Race',
        'Marital Status',
        'T Stage',
        'N Stage',
        '6th Stage',
        'differentiate',
        'Grade',
        'A Stage',
        'Estrogen Status',
        'Progesterone Status',
    )

    # Accumulate squared differences left to right, in the listed column order.
    squared_total = 0
    for column in continuous_columns:
        squared_total = squared_total + (point1[column] - point2[column]) ** 2
    euclidean_distance = math.sqrt(squared_total)

    # One unit of distance for every categorical field that does not match.
    hamming_distance = 0
    for column in categorical_columns:
        hamming_distance = hamming_distance + (0 if point1[column] == point2[column] else 1)

    return euclidean_distance + hamming_distance

In [4]:
def split_dataset(data):
    """
        Split data, in row order, into train, validation and test sets: 70-15-15%.

        The training set is the first floor(70%) of the rows, the validation set is the
        next ceil(15%) of the rows, and the test set is every remaining row, so no row
        is ever dropped. No shuffling is performed.

        Parameters:
            data: DataFrame containing a 'Status' column, which is used as the label.

        Returns:
            train_X, train_Y, val_X, val_Y, test_X, test_Y where each *_X is the split
            without the 'Status' column and each *_Y is the split's 'Status' column.
    """
    total_rows = data.shape[0]

    train_boundary = math.floor(0.70 * total_rows)
    val_boundary = train_boundary + math.ceil(0.15 * total_rows)

    train_data = data.iloc[:train_boundary]
    val_data = data.iloc[train_boundary:val_boundary]
    # Open-ended slice: the test set takes whatever is left after train + validation.
    test_data = data.iloc[val_boundary:]

    train_Y = train_data['Status']
    train_X = train_data.drop(['Status'], axis=1)

    val_Y = val_data['Status']
    val_X = val_data.drop(['Status'], axis=1)

    test_Y = test_data['Status']
    test_X = test_data.drop(['Status'], axis=1)

    return train_X, train_Y, val_X, val_Y, test_X, test_Y

In [13]:
# Split once at notebook level so the true validation labels (val_Y) are available
# to the accuracy cells further down.
train_X, train_Y, val_X, val_Y, test_X, test_Y = split_dataset(data=data)

In [10]:
def parameter_tuning_knn(data, k):
    """
        Predict 'Status' for every validation point with a k-nearest-neighbour vote.

        The data is split with split_dataset. For every point in the validation set the
        distance to every training point is computed with compute_distances, the k
        closest training points are selected, and the most frequently occurring
        'Status' value among them (looked up in train_Y) is reported as the prediction.

        Parameters:
            data: DataFrame accepted by split_dataset.
            k: number of neighbours that vote; a positive integer.

        Returns:
            dict mapping each validation point's position within the validation split
            (0-based) to its predicted 'Status' value.

        Raises:
            ValueError: if k is smaller than 1 (no neighbour could vote).
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    train_X, train_Y, val_X, _, _, _ = split_dataset(data=data)
    n_train = train_X.shape[0]

    val_pred = {}

    for val_index in tqdm(range(val_X.shape[0])):
        # Fetch the validation row once instead of once per training point.
        val_point = val_X.iloc[val_index]

        distances = [
            compute_distances(val_point, train_X.iloc[train_index])
            for train_index in range(n_train)
        ]

        # sorted() is stable, so equally distant training points keep their
        # training-set order, and only the k closest positions are kept.
        nearest_points = sorted(range(n_train), key=distances.__getitem__)[:k]

        # Labels are collected nearest-first; Counter breaks count ties in favour of
        # the label that was encountered first.
        output = [train_Y.iloc[point_number] for point_number in nearest_points]
        pred_status, _ = Counter(output).most_common()[0]
        val_pred[val_index] = pred_status

    return val_pred

In [11]:
# Predict the validation split using k = 3 neighbours.
val_pred = parameter_tuning_knn(data, 3)

100%|██████████| 604/604 [01:55<00:00,  5.22it/s]


In [14]:
# Display the true validation labels together with their row labels.
val_Y

2816    Alive
2817    Alive
2818    Alive
2819     Dead
2820    Alive
        ...  
3415    Alive
3416    Alive
3417     Dead
3418     Dead
3419    Alive
Name: Status, Length: 604, dtype: object

In [19]:
# Re-key the predictions from position-within-validation-split to the row label of
# the same point in val_Y. The labels are read from val_Y's index rather than from a
# hard-coded offset, so this stays correct if the split sizes change.
val_row_labels = val_Y.index.tolist()
validation_predictions = {
    val_row_labels[position]: predicted
    for position, predicted in val_pred.items()
}



In [20]:
# Display every re-keyed prediction (one entry per validation point).
validation_predictions

{2816: 'Alive',
 2817: 'Alive',
 2818: 'Alive',
 2819: 'Dead',
 2820: 'Dead',
 2821: 'Alive',
 2822: 'Alive',
 2823: 'Alive',
 2824: 'Alive',
 2825: 'Alive',
 2826: 'Alive',
 2827: 'Alive',
 2828: 'Alive',
 2829: 'Alive',
 2830: 'Alive',
 2831: 'Alive',
 2832: 'Alive',
 2833: 'Alive',
 2834: 'Alive',
 2835: 'Alive',
 2836: 'Alive',
 2837: 'Alive',
 2838: 'Alive',
 2839: 'Alive',
 2840: 'Alive',
 2841: 'Alive',
 2842: 'Alive',
 2843: 'Alive',
 2844: 'Alive',
 2845: 'Alive',
 2846: 'Alive',
 2847: 'Alive',
 2848: 'Alive',
 2849: 'Alive',
 2850: 'Alive',
 2851: 'Alive',
 2852: 'Dead',
 2853: 'Alive',
 2854: 'Alive',
 2855: 'Alive',
 2856: 'Alive',
 2857: 'Alive',
 2858: 'Alive',
 2859: 'Alive',
 2860: 'Alive',
 2861: 'Alive',
 2862: 'Alive',
 2863: 'Alive',
 2864: 'Alive',
 2865: 'Alive',
 2866: 'Alive',
 2867: 'Alive',
 2868: 'Alive',
 2869: 'Alive',
 2870: 'Alive',
 2871: 'Alive',
 2872: 'Dead',
 2873: 'Alive',
 2874: 'Alive',
 2875: 'Alive',
 2876: 'Alive',
 2877: 'Alive',
 2878: 'Aliv

In [21]:
# nocp counts the validation points whose prediction matches the true label.
nocp = 0

In [25]:
# Count the validation points whose predicted status equals the true label.
# The count is rebuilt from scratch, so re-running this cell cannot double-count.
nocp = sum(
    1
    for point, predicted in validation_predictions.items()
    if predicted == val_Y[point]
)
        

In [29]:
# Validation accuracy for k = 3: correct predictions over the number of predictions
# actually made, rather than a hard-coded validation-set size.
accuracy_3 = nocp / len(validation_predictions)

In [33]:
# Maps each tried value of k to the validation accuracy obtained with it.
list_of_accuracies = {}

In [31]:
# Number of validation predictions produced by the last run.
len(val_pred)

604

In [37]:
# Evaluate the validation accuracy for each candidate number of neighbours.
for k in [1, 3, 5, 7]:
    print("Running for k= ", k)
    val_pred = parameter_tuning_knn(data, k)

    # Re-key predictions by the row label of the matching point in val_Y; labels come
    # from val_Y's index instead of a hard-coded offset into the dataset.
    val_row_labels = val_Y.index.tolist()
    validation_predictions = {
        val_row_labels[position]: predicted
        for position, predicted in val_pred.items()
    }

    # Fresh count for every k.
    nocp = sum(
        1
        for point, predicted in validation_predictions.items()
        if predicted == val_Y[point]
    )
    list_of_accuracies[k] = nocp / len(val_pred)

Running for k=  1


100%|██████████| 604/604 [01:56<00:00,  5.19it/s]


Running for k=  3


100%|██████████| 604/604 [01:57<00:00,  5.16it/s]


Running for k=  5


100%|██████████| 604/604 [01:56<00:00,  5.19it/s]


Running for k=  7


100%|██████████| 604/604 [01:57<00:00,  5.16it/s]


In [38]:
list_of_accuracies

{1: 0.8576158940397351,
 3: 0.8973509933774835,
 5: 0.9089403973509934,
 7: 0.8940397350993378}