In [29]:
from csv import reader
import pandas as pd
import numpy as np
from numpy import array
from math import sqrt
from random import randrange
import copy
import random

In [30]:
# Read the CSV file
dataset = pd.read_csv("./breast+cancer/breast-cancer.data.csv", header=None)
dataset = dataset.mask(dataset == '')
dataset = dataset.fillna(0)
dataset = dataset.values.tolist()

In [31]:
# Mapping for breast cancer dataset classes
# 'no-recurrence-events' and 'recurrence-events' -> (0, 1)

# Mapping for age attribute
# '10-19': 0, '20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6, '80-89': 7, '90-99': 8

# Mapping for menopause attribute
# 'lt40': 0, 'ge40': 1, 'premeno': 2

# Mapping for tumor-size attribute
# '0-4': 0, '5-9': 1, '10-14': 2, '15-19': 3, '20-24': 4, '25-29': 5, '30-34': 6, '35-39': 7, '40-44': 8, '45-49': 9, '50-54': 10, '55-59': 11

# Mapping for inv-nodes attribute
# '0-2': 0, '3-5': 1, '6-8': 2, '9-11': 3, '12-14': 4, '15-17': 5, '18-20': 6, '21-23': 7, '24-26': 8, '27-29': 9, '30-32': 10, '33-35': 11, '36-39': 12

# Mapping for node-caps attribute
# 'yes': 0, 'no': 1

# Mapping for deg-malig attribute
# '1': 0, '2': 1, '3': 2

# Mapping for breast attribute
# 'left': 0, 'right': 1

# Mapping for breast-quad attribute
# 'left-up': 0, 'left-low': 1, 'right-up': 2, 'right-low': 3, 'central': 4

# Mapping for irradiat attribute
# 'yes': 0, 'no': 1


def str_column_to_int(dataset, column):
    unique_values = set(row[column] for row in dataset)
    lookup = {value: index for index, value in enumerate(unique_values)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

for column in range(len(dataset[0])):
    str_column_to_int(dataset, column)
    
print(dataset)

[[0, 3, 1, 5, 3, 2, 2, 1, 4, 1], [0, 2, 1, 0, 3, 2, 1, 0, 3, 1], [0, 2, 1, 0, 3, 2, 1, 1, 4, 1], [0, 1, 0, 9, 3, 2, 1, 0, 2, 1], [0, 2, 1, 10, 3, 2, 1, 0, 5, 1], [0, 1, 0, 9, 3, 2, 1, 1, 4, 1], [0, 4, 1, 1, 3, 2, 1, 1, 4, 1], [0, 1, 0, 0, 3, 2, 0, 1, 4, 1], [0, 2, 1, 2, 3, 2, 1, 1, 4, 1], [0, 2, 1, 0, 3, 2, 1, 0, 2, 1], [0, 2, 1, 10, 3, 2, 2, 1, 0, 1], [0, 4, 0, 1, 3, 2, 1, 1, 4, 1], [0, 1, 2, 3, 3, 2, 0, 1, 3, 1], [0, 4, 0, 1, 3, 2, 2, 1, 3, 1], [0, 2, 1, 5, 3, 2, 2, 1, 2, 1], [0, 1, 2, 5, 3, 2, 0, 1, 4, 1], [0, 2, 1, 9, 3, 2, 1, 1, 4, 1], [0, 4, 1, 5, 3, 2, 2, 1, 4, 1], [0, 1, 0, 5, 3, 2, 2, 1, 4, 1], [0, 4, 0, 5, 3, 2, 0, 0, 3, 1], [0, 4, 0, 7, 3, 2, 1, 1, 4, 1], [0, 1, 0, 9, 3, 2, 1, 1, 4, 1], [0, 3, 1, 1, 3, 2, 1, 0, 4, 1], [0, 4, 1, 7, 3, 2, 1, 1, 2, 1], [0, 4, 1, 4, 3, 2, 1, 0, 2, 1], [0, 2, 1, 1, 3, 2, 1, 1, 2, 1], [0, 4, 1, 0, 3, 2, 0, 1, 4, 1], [0, 1, 0, 1, 3, 2, 2, 0, 2, 1], [0, 2, 1, 7, 3, 2, 1, 0, 4, 1], [0, 1, 0, 5, 3, 2, 1, 1, 4, 1], [0, 4, 0, 7, 3, 2, 2, 0, 2, 1], [0, 4

## Full Scratch KNN model: For Breast_Cancer dataset

In [34]:
# PLEASE REFER: Details regarding Hayes Roth Dataset and how it is used here is in 'Doc file'

def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row2)-2):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

def get_neighbors(X_train, test_row, k):
    distance = list()
    data = []
    for i in X_train:
        dist = euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
        
    distance = np.array(distance)
    data = np.array(data)
    # Find the index of min distance and sorting accordingly
    index_dist = distance.argsort()
    # We arrange our data according to index
    data = data[index_dist]
    # slicing k neighbors from data[]
    neighbors = data[:k]
    return neighbors

def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    classes = []
    for i in neighbors:
        classes.append(i[-1])
    prediction = max(set(classes), key=classes.count)
    return prediction

# Mapping for breast cancer dataset classes
class_mapping = {0: 'no-recurrence-events', 1: 'recurrence-events'}

# Example new observation row (converted to integers)
new_observation_row = [1, 2, 1, 2, 1, 1, 2, 1, 0]

# Large k-value(optional) -> replace it with 
large_k = int(sqrt(len(dataset)))
if large_k % 2 == 1: 
    large_k += 1

prediction = predict_classification(dataset, new_observation_row, 3)

print('************************ FINAL PREDICTION OUTPUT ************************')
print('New Observation:', new_observation_row)
predicted_class = class_mapping[prediction]
print('Predicted class:', predicted_class)

************************ FINAL PREDICTION OUTPUT ************************
New Observation: [1, 2, 1, 2, 1, 1, 2, 1, 0]
Predicted class: no-recurrence-events


## Measing 10 Fold Cross Validation Average accuracy score

In [35]:
n_folds = 10
def cross_validation_split(dataset, n_folds):
    dataset_split = []
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def evaluate_algorithm(dataset, n_folds, k):
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # remove the class label for testing
        predictions = []
        for row in test_set:
            output = predict_classification(train_set, row, k)
            predictions.append(output)
        actual = [row[-1] for row in fold]
        correct = sum(1 for i in range(len(actual)) if actual[i] == predictions[i])
        accuracy = (correct / float(len(actual))) * 100.0
        scores.append(accuracy)
    return scores

random.seed(20)
k = 3
scores = evaluate_algorithm(dataset, n_folds, k)

print('************************ CROSS-VALIDATION RESULTS ************************')
print('Accuracy Scores:', scores)
print('Average Accuracy:', np.mean(scores))


************************ CROSS-VALIDATION RESULTS ************************
Accuracy Scores: [78.57142857142857, 78.57142857142857, 75.0, 64.28571428571429, 82.14285714285714, 82.14285714285714, 57.14285714285714, 78.57142857142857, 75.0, 67.85714285714286]
Average Accuracy: 73.92857142857143
