In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the raw dataset from a GitHub raw-content URL into a DataFrame.
# NOTE(review): the URL follows the `main` branch instead of a pinned commit, so the
# file behind it can change between runs — confirm that is acceptable for reproducibility.
url = 'https://raw.githubusercontent.com/TahsinArafat/KNN-Implementation/refs/heads/main/data.csv'
df = pd.read_csv(url)

In [None]:
# Remove the two columns that are not used as features.
# `errors='ignore'` keeps the cell re-runnable: on a second execution the columns are
# already gone from `df`, and a plain drop would raise KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode the diagnosis labels as integers so the column can serve as the target.
# LabelEncoder numbers the sorted unique values starting from 0; re-encoding an
# already-encoded column yields the same integers, so this line is re-runnable too.
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

In [None]:
# Separate the target column from the feature matrix.
target_col = 'diagnosis'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# 70 / 15 / 15 split: first hold out 30 % of the rows, then cut that hold-out in half
# to obtain the validation and test sets.
# Both splits are stratified on the label so that the small validation / test sets keep
# the same class proportions as the full data instead of depending on shuffle luck.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [None]:
# Learn the standardisation statistics from the training split only, then apply the
# same fitted transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Sweep k = 1..20 and record the validation error rate (1 - accuracy) for each k.
k_range = range(1, 21)
errors = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    val_accuracy = knn.score(X_val, y_val)
    errors.append(1 - val_accuracy)

In [None]:
# Validation error rate as a function of the number of neighbours.
fig, ax = plt.subplots()
ax.plot(k_range, errors)
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')
plt.show()

In [None]:
# Hyper-parameter grid searched by GridSearchCV.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default p=2 it is
# the same distance as 'euclidean', so including it only re-evaluates every euclidean
# candidate a second time without adding a distinct model.
params = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

In [None]:
# Exhaustive 5-fold cross-validated search over `params` on the training split.
# NOTE(review): X_train was standardised as a whole before this search, so each CV
# fold's held-out part contributed to the scaling statistics — consider wrapping the
# scaler and classifier in a Pipeline if that matters.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# Keep the grid search's best estimator as the final model; the evaluation cells
# below predict with it on the test split.
model = grid.best_estimator_

In [None]:
# Hard class predictions and the probability of class 1 for the test split.
# Column 1 of predict_proba is used as the positive-class score for the ROC curve
# (presumably the malignant class after label encoding — verify against the data).
class_probs = model.predict_proba(X_test)
prob = class_probs[:, 1]
pred = model.predict(X_test)

In [None]:
# Confusion matrix of the tuned model on the test split.
# sklearn's confusion_matrix puts true labels on the rows and predicted labels on the
# columns, so the axes are labelled explicitly to make the heatmap readable on its own.
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix (test set)')
plt.show()

In [None]:
# ROC curve of the tuned model on the test split, with the chance diagonal for reference.
# The thresholds are bound to a real name rather than `_`, because assigning to `_`
# shadows IPython's "last output" variable for the rest of the session.
fpr, tpr, thresholds = roc_curve(y_test, prob)
plt.plot(fpr, tpr, label=f'AUC={auc(fpr,tpr):.2f}')
plt.plot([0, 1], [0, 1], '--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (test set)')
plt.legend()
plt.show()

In [None]:
# Summary of test-set scores, shown as a bar chart.
# The label-based scorers share the (y_true, y_pred) signature, so they are applied in
# one pass; AUC is appended last because it is computed from the ROC points instead.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
}
metrics = {label: score_fn(y_test, pred) for label, score_fn in scorers.items()}
metrics['AUC'] = auc(fpr, tpr)

pd.Series(metrics).plot.bar()
plt.show()

In [None]:
# Project the (already standardised) training features onto their first two principal
# components and fit a KNN on that 2-D view, purely to visualise a decision boundary.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_train)

# Carry over every tuned hyper-parameter, not just n_neighbors; otherwise the plotted
# boundary would silently fall back to the default weights / metric and no longer
# correspond to the configuration the grid search selected.
knn_2d = KNeighborsClassifier(
    n_neighbors=model.n_neighbors,
    weights=model.weights,
    metric=model.metric,
)
knn_2d.fit(X_2d, y_train)

In [None]:
# Decision regions of the 2-D KNN: classify every node of a 300 x 300 grid spanning the
# projected training data (padded by 1 on each side), then overlay the training points.
pad = 1
pc1, pc2 = X_2d[:, 0], X_2d[:, 1]
x_min, x_max = pc1.min() - pad, pc1.max() + pad
y_min, y_max = pc2.min() - pad, pc2.max() + pad

xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 300),
    np.linspace(y_min, y_max, 300),
)
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = knn_2d.predict(grid_points).reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(pc1, pc2, c=y_train, s=10)
plt.show()