In [29]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px


In [3]:
data = pd.read_csv("https://raw.githubusercontent.com/Bash-UK/SPPU_BE_LP3_Lab/main/ML/lab5_diabetesKNN/diabetes.csv")

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   SkinThickness  768 non-null    int64  
 4   Insulin        768 non-null    int64  
 5   BMI            768 non-null    float64
 6   Pedigree       768 non-null    float64
 7   Age            768 non-null    int64  
 8   Outcome        768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
x = data.drop("Outcome", axis=1)
y = data["Outcome"]

In [9]:
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(x)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(scaled_values, y, test_size=0.2)

In [15]:
k_values = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49]
accuracy_values = {value:None for value in k_values}

In [16]:
for i in tqdm(range(len(k_values))):
    model = KNeighborsClassifier(n_neighbors=k_values[i])
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test,y_pred)
    accuracy_values[k_values[i]]=accuracy

  0%|          | 0/25 [00:00<?, ?it/s]

In [17]:
accuracy_scores ={k:v for k,v in sorted(accuracy_values.items(),key = lambda item:item[1],reverse=True)}

In [18]:
accuracy_scores

{5: 0.7272727272727273,
 7: 0.7272727272727273,
 9: 0.7142857142857143,
 11: 0.7077922077922078,
 3: 0.7012987012987013,
 1: 0.6948051948051948,
 13: 0.6948051948051948,
 17: 0.6948051948051948,
 35: 0.6883116883116883,
 29: 0.6818181818181818,
 15: 0.6753246753246753,
 33: 0.6753246753246753,
 43: 0.6753246753246753,
 19: 0.6688311688311688,
 27: 0.6688311688311688,
 31: 0.6688311688311688,
 37: 0.6688311688311688,
 39: 0.6688311688311688,
 41: 0.6688311688311688,
 49: 0.6688311688311688,
 21: 0.6623376623376623,
 25: 0.6623376623376623,
 45: 0.6623376623376623,
 23: 0.6558441558441559,
 47: 0.6558441558441559}

In [19]:
optimal_k = next(iter(accuracy_scores))
optimal_k

5

In [20]:
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)

In [22]:
knn_model.fit(x_train,y_train)

In [23]:
y_pred = knn_model.predict(x_test)

In [24]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.91      0.79        89
           1       0.79      0.48      0.60        65

    accuracy                           0.73       154
   macro avg       0.75      0.69      0.70       154
weighted avg       0.74      0.73      0.71       154



In [28]:
cm = metrics.confusion_matrix(y_test, y_pred)
cm

array([[81,  8],
       [34, 31]], dtype=int64)

In [31]:
confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Confusion Matrix:")
print(confusion)
print("Accuracy: %.2f" % accuracy)
print("Error Rate: %.2f" % error_rate)
print("Precision: %.2f" % precision)
print("Recall: %.2f" % recall)

Confusion Matrix:
[[81  8]
 [34 31]]
Accuracy: 0.73
Error Rate: 0.27
Precision: 0.79
Recall: 0.48
