In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the breast-cancer CSV from disk into a DataFrame (relative path)
csv_path = 'dataset/ov4-breast-cancer.csv'
data = pd.read_csv(csv_path)


In [None]:
# Preview the top five rows of the table
print(data.head(5))

# Per-column count of null entries.
# Note: any '?' placeholders are ordinary strings at this point, so they are
# not counted here.
print(data.isna().sum())


In [5]:
# The file marks missing entries with a literal '?'; turn those into real NaNs
data.replace(to_replace='?', value=np.nan, inplace=True)

# Every column other than the identifier and the label is cast to a numeric
# dtype, one column at a time
cols = data.columns.drop(['id', 'classes'])
data[cols] = data[cols].apply(lambda column: pd.to_numeric(column))

# Re-count nulls now that the placeholders have become NaN
print(data.isna().sum())


id                         0
clump_thickness            0
unif_cell_size             0
unif_cell_shape            0
marg_adhesion              0
single_epith_cell_size     0
bare_nuclei               16
bland_chrom                0
norm_nucleoli              0
mitoses                    0
classes                    0
dtype: int64


In [6]:
# Simplest handling strategy: discard every row that contains at least one NaN
data.dropna(axis=0, how='any', inplace=True)

# Sanity check: every column should now report zero nulls
print(data.isna().sum())


id                        0
clump_thickness           0
unif_cell_size            0
unif_cell_shape           0
marg_adhesion             0
single_epith_cell_size    0
bare_nuclei               0
bland_chrom               0
norm_nucleoli             0
mitoses                   0
classes                   0
dtype: int64


In [7]:
# Feature matrix: all columns except the row identifier and the label
X = data.drop(columns=['id', 'classes'])

# Target vector. astype(int) only casts the dtype; it assumes the labels are
# already encoded as 0 (benign) / 1 (malignant) — TODO confirm against the data.
y = data['classes'].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; consider stratify=y to keep the
# class ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Standardise the features. The scaler's statistics are learned from the
# training split only, so nothing about the test rows leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transformation to both partitions
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
# Rule-of-thumb starting point: k = floor(sqrt(number of training samples))
n = len(y_train)
initial_k = int(np.sqrt(n))

# An even k is bumped to the next odd number so a two-class vote cannot tie
initial_k = initial_k if initial_k % 2 else initial_k + 1

print(f'Initial k value based on sqrt(n): {initial_k}')


Initial k value based on sqrt(n): 23


In [10]:
# Baseline model: k-NN using the sqrt(n)-derived neighbour count, fitted on
# the standardised training split (fit returns the estimator itself)
knn = KNeighborsClassifier(n_neighbors=initial_k).fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = knn.predict(X_test_scaled)

# Report the confusion matrix followed by per-class precision/recall/F1
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('\nClassification Report:', classification_report(y_test, y_pred), sep='\n')


Confusion Matrix:
[[78  1]
 [ 5 53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        79
           1       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137



In [12]:
# Candidate neighbour counts: initial_k plus two steps of 2 on either side
k_values = [initial_k + offset for offset in (-4, -2, 0, 2, 4)]

# Maps each evaluated k to its accuracy and confusion matrix
performance = {}

# NOTE(review): every candidate is scored on the held-out test split, so the
# test set doubles as a validation set here; cross-validation on the training
# data would keep the final test estimate unbiased.
for k in k_values:
    # n_neighbors must be positive; non-positive candidates are skipped
    if k <= 0:
        continue

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    performance[k] = {'accuracy': acc, 'confusion_matrix': cm}

    print(f'\nFor k = {k}:')
    print('Accuracy:', acc)
    print('Confusion Matrix:')
    print(cm)



For k = 19:
Accuracy: 0.9562043795620438
Confusion Matrix:
[[78  1]
 [ 5 53]]

For k = 21:
Accuracy: 0.9562043795620438
Confusion Matrix:
[[78  1]
 [ 5 53]]

For k = 23:
Accuracy: 0.9562043795620438
Confusion Matrix:
[[78  1]
 [ 5 53]]

For k = 25:
Accuracy: 0.948905109489051
Confusion Matrix:
[[78  1]
 [ 6 52]]

For k = 27:
Accuracy: 0.9562043795620438
Confusion Matrix:
[[78  1]
 [ 5 53]]


Since the accuracy is so similar across the tested k values, the only one I would avoid is k = 25: it produces one more false negative (6 instead of 5), which is detrimental when dealing with something as serious as cancer.