In [1]:
from csv import reader
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import random

In [2]:
# Read the CSV file
dataset = pd.read_csv("./breast+cancer/breast-cancer.data.csv", header=None)
dataset = dataset.mask(dataset == '')
dataset = dataset.fillna(0)
dataset = dataset.values.tolist()

In [10]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = {value: index for index, value in enumerate(unique)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

for i in range(len(dataset[0])):
    str_column_to_int(dataset, i)


print("First 5 rows are Mapping:")
for i in range(5):
    print(dataset[i])

First 5 rows are Mapping:
[1, 1, 2, 9, 1, 2, 2, 0, 3, 1]
[1, 2, 2, 1, 1, 2, 1, 1, 2, 1]
[1, 2, 2, 1, 1, 2, 1, 0, 3, 1]
[1, 4, 0, 5, 1, 2, 1, 1, 5, 1]
[1, 2, 2, 8, 1, 2, 1, 1, 4, 1]


In [16]:
random.seed(20)
# Extract features and target variable
X = np.array([row[:-1] for row in dataset])
y = np.array([row[-1] for row in dataset])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Large k-value 
large_k = int(sqrt(len(dataset)))
if large_k % 2 == 1: large_k += 1

knn = KNeighborsClassifier(n_neighbors=3)
knn_large_k = KNeighborsClassifier(n_neighbors=large_k)
n_splits = 10

kf = KFold(n_splits=n_splits, shuffle=True)
cv_scores = cross_val_score(knn, X_scaled, y, cv=kf)
cv_scores_large_k = cross_val_score(knn_large_k, X_scaled, y, cv=kf)

print("Accuracy Scores for 10 Folds (k=3):", ", ".join(map(str, cv_scores)), "\n")
print("Accuracy Scores for 10 Folds(k=sqrt(len(dataset))):", ",".join(map(str, cv_scores_large_k*100)), "\n")

average_accuracy = cv_scores.mean()
average_accuracy_large_k = cv_scores_large_k.mean()

print("\nAverage Accuracy (Percentage)(k=3): {:.4f}".format(average_accuracy * 100))
print("Average Accuracy (Percentage)(k=sqrt(dataset_size)): {:.4f}".format(average_accuracy_large_k * 100))

Accuracy Scores for 10 Folds (k=3): 0.896551724137931, 0.6896551724137931, 0.8275862068965517, 0.6551724137931034, 0.7931034482758621, 0.6896551724137931, 0.7142857142857143, 0.7857142857142857, 0.8214285714285714, 0.7857142857142857 

Accuracy Scores for 10 Folds(k=sqrt(len(dataset))): 86.20689655172413,79.3103448275862,86.20689655172413,72.41379310344827,75.86206896551724,72.41379310344827,89.28571428571429,53.57142857142857,85.71428571428571,78.57142857142857 


Average Accuracy (Percentage)(k=3): 76.5887
Average Accuracy (Percentage)(k=sqrt(dataset_size)): 77.9557
