# KNN Classifier for Diabetes

This notebook builds a K-nearest-neighbors (KNN) classifier to predict whether an individual has diabetes based on their medical history.

In [25]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import f_classif #To obtain the best feature
from sklearn.utils import resample #To balance the dataset

In [26]:
import warnings
# Suppress every Python warning for the rest of the session (presumably to keep cell output uncluttered).
# NOTE(review): a blanket "ignore" also hides deprecation and data-quality warnings raised by the
# libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [27]:
#Dataset obtained from data.world
# Keep the source location in one named constant so it is easy to spot and swap.
DATA_URL = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'X4i8vXLw81g4wEH473zIFA/Diabetes-Classification.xlsx'
)
# Read the spreadsheet straight from the URL into a DataFrame
df = pd.read_excel(DATA_URL)

In [28]:
# Preview the first rows to confirm the spreadsheet loaded with the expected columns
df.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Gender,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Unnamed: 16,Unnamed: 17
0,1,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes,6.0,6.0
1,2,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,No diabetes,,
2,3,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes,,
3,4,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes,,
4,5,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes,,


In [29]:
# Inspect column dtypes and non-null counts (reveals the nearly empty "Unnamed" columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Patient number   390 non-null    int64  
 1   Cholesterol      390 non-null    int64  
 2   Glucose          390 non-null    int64  
 3   HDL Chol         390 non-null    int64  
 4   Chol/HDL ratio   390 non-null    float64
 5   Age              390 non-null    int64  
 6   Gender           390 non-null    object 
 7   Height           390 non-null    int64  
 8   Weight           390 non-null    int64  
 9   BMI              390 non-null    float64
 10  Systolic BP      390 non-null    int64  
 11  Diastolic BP     390 non-null    int64  
 12  waist            390 non-null    int64  
 13  hip              390 non-null    int64  
 14  Waist/hip ratio  390 non-null    float64
 15  Diabetes         390 non-null    object 
 16  Unnamed: 16      1 non-null      float64
 17  Unnamed: 17     

In [30]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Unnamed: 16,Unnamed: 17
count,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,1.0,1.0
mean,195.5,207.230769,107.338462,50.266667,4.524615,46.774359,65.951282,177.407692,28.775641,137.133333,83.289744,37.869231,42.992308,0.881385,6.0,6.0
std,112.727548,44.666005,53.798188,17.279069,1.736634,16.435911,3.918867,40.407824,6.600915,22.859528,13.498192,5.760947,5.664342,0.073212,,
min,1.0,78.0,48.0,12.0,1.5,19.0,52.0,99.0,15.2,90.0,48.0,26.0,30.0,0.68,6.0,6.0
25%,98.25,179.0,81.0,38.0,3.2,34.0,63.0,150.25,24.1,122.0,75.0,33.0,39.0,0.83,6.0,6.0
50%,195.5,203.0,90.0,46.0,4.2,44.5,66.0,173.0,27.8,136.0,82.0,37.0,42.0,0.88,6.0,6.0
75%,292.75,229.0,107.75,59.0,5.4,60.0,69.0,200.0,32.275,148.0,90.0,41.0,46.0,0.93,6.0,6.0
max,390.0,443.0,385.0,120.0,19.3,92.0,76.0,325.0,55.8,250.0,124.0,56.0,64.0,1.14,6.0,6.0


In [31]:
#Check NA values
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
Patient number,0
Cholesterol,0
Glucose,0
HDL Chol,0
Chol/HDL ratio,0
Age,0
Gender,0
Height,0
Weight,0
BMI,0


In [32]:
#Check duplicates
# Count rows that are exact copies of an earlier row (all columns equal)
df.duplicated().sum()

0

In [33]:
#Drop excess columns
# The two "Unnamed" columns hold a single non-null value each, so they carry no information.
# Rebind instead of mutating in place, and ignore already-missing columns, so that
# re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns = ["Unnamed: 16", "Unnamed: 17"], errors = "ignore")

In [34]:
#Obtain distribution of the classes to predict in the dataset
# Absolute number of rows per class
frequency_table = df['Diabetes'].value_counts()
# Share of each class: divide every count by the total number of rows
props = frequency_table / len(df)
print(props)

Diabetes
No diabetes    0.846154
Diabetes       0.153846
Name: count, dtype: float64


In [35]:
#Feature Selection
# Nine numeric predictors kept for modelling; the target column 'Diabetes' is added in front of them.
feature_columns = ['Cholesterol','Glucose','BMI','Waist/hip ratio','HDL Chol',"Chol/HDL ratio",'Systolic BP','Diastolic BP','Age']
df_reduced = df[['Diabetes'] + feature_columns]

# Select the predictors by name rather than by position
numerical_columns = df_reduced[feature_columns]

#Scaling values
# NOTE(review): the scaler is fitted on every row before the train/test split, so statistics of
# the test rows influence the scaled training features. Fitting on the training split only
# (e.g. inside a Pipeline) would avoid this leakage.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(numerical_columns)

#Converting standardized array to dataframe
# Carry over the original row index so the later column-wise concat with the target
# pairs each label with its own row even if rows were filtered beforehand.
df_standardized = pd.DataFrame(scaled_values, columns = numerical_columns.columns, index = numerical_columns.index)

In [36]:
# Sanity check of the scaling: every column should show a mean of ~0 and a std of ~1
# (pandas reports the sample std, which is why the value is slightly above 1)
df_standardized.describe()

Unnamed: 0,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Age
count,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0
mean,7.287618000000001e-17,-1.457524e-16,2.2773810000000003e-17,-6.741046e-16,4.3270230000000006e-17,-6.376666000000001e-17,2.915047e-16,-3.006142e-16,-1.457524e-16
std,1.001285,1.001285,1.001285,1.001285,1.001285,1.001285,1.001285,1.001285,1.001285
min,-2.896986,-1.104399,-2.059272,-2.754229,-2.21747,-1.743891,-2.064517,-2.617764,-1.692029
25%,-0.6328534,-0.4902078,-0.7092421,-0.7027598,-0.7108267,-0.7637287,-0.6628646,-0.6149262,-0.7782208
50%,-0.09484179,-0.3227011,-0.1479938,-0.01893664,-0.2472441,-0.1871623,-0.04964184,-0.0956721,-0.1385552
75%,0.4880041,0.007659498,0.5308134,0.6648866,0.5060777,0.5047173,0.4759777,0.4977612,0.8057131
max,5.285274,5.167799,4.099291,3.536944,4.040895,8.51899,4.943744,3.019853,2.75517


In [37]:
#Adding the target column in the dataset
# Column-wise concat aligns rows on the index, so both inputs must share the same row index
# for each label to end up next to its own scaled features.
df_standard = pd.concat([df_reduced['Diabetes'], df_standardized], axis = 1)

In [38]:
# Confirm that the target and the nine scaled features are all present
df_standard.columns

Index(['Diabetes', 'Cholesterol', 'Glucose', 'BMI', 'Waist/hip ratio',
       'HDL Chol', 'Chol/HDL ratio', 'Systolic BP', 'Diastolic BP', 'Age'],
      dtype='object')

In [39]:
#Splitting the dataset
# Target labels, and every remaining column (the nine scaled features) as predictors
y = df_standard['Diabetes']
x = df_standard.drop(columns = 'Diabetes')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [40]:
#Converting Diabetes labels into numerical values
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only...
ytrain_encode = label_encoder.fit_transform(ytrain)
# ...and reuse that same mapping for the test labels. Re-fitting on the test split could
# produce a different mapping whenever the two splits do not contain the same set of classes.
# NOTE: LabelEncoder numbers classes in sorted order, so 'Diabetes' -> 0 and 'No diabetes' -> 1
# here, which is the opposite of the 0/1 coding applied to df_standard further down.
ytest_encode = label_encoder.transform(ytest)

In [41]:
#Creating a KNN classifier with n_neighbors as 6
# fit() returns the fitted estimator, so construction and training can be chained
knn = KNeighborsClassifier(n_neighbors = 6).fit(xtrain, ytrain_encode)

#Accuracy
# Predict the held-out rows and compare against their encoded labels
ypred = knn.predict(xtest)
accuracy = accuracy_score(ytest_encode, ypred)
print('Accuracy: {:.2%}'.format(accuracy))

Accuracy: 91.03%


In [42]:
#Hyperparameter tuning using Grid Search CV
knn = KNeighborsClassifier()
param_grid = {'n_neighbors':range(1,12)} #Candidate values 1..11 for the number of neighbors

#Perform grid search with 10-fold cross validation on the training split
grid_search = GridSearchCV(knn, param_grid, cv = 10)
grid_search.fit(xtrain, ytrain_encode)

#Best parameters and best score
print('Best parameter: ',grid_search.best_params_)
# Fixed: the label previously contained a stray comma before the value
print(f"Best accuracy_score: {grid_search.best_score_:.3f}")

#Full results
# One line per candidate: mean and spread of the cross-validated score
results = grid_search.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} (std: {std_score:.3f}) with: {params}")

Best parameter:  {'n_neighbors': 3}
Best accuracy_score: ,0.920
Mean accuracy: 0.872 (std: 0.047) with: {'n_neighbors': 1}
Mean accuracy: 0.830 (std: 0.060) with: {'n_neighbors': 2}
Mean accuracy: 0.920 (std: 0.044) with: {'n_neighbors': 3}
Mean accuracy: 0.901 (std: 0.044) with: {'n_neighbors': 4}
Mean accuracy: 0.913 (std: 0.046) with: {'n_neighbors': 5}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 6}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 7}
Mean accuracy: 0.910 (std: 0.034) with: {'n_neighbors': 8}
Mean accuracy: 0.917 (std: 0.038) with: {'n_neighbors': 9}
Mean accuracy: 0.917 (std: 0.038) with: {'n_neighbors': 10}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 11}


In [43]:
#Anova for feature selection
# One-way ANOVA F-test of each feature in x against the class labels in y
fs_score, fs_p_value = f_classif(x, y)

# Combine scores with feature names, strongest feature first
fs_scores = (
    pd.DataFrame({'Feature': x.columns, 'F-Score': fs_score, 'P-Value': fs_p_value})
    .sort_values(by='F-Score', ascending=False)
)

print(fs_scores)

           Feature     F-Score       P-Value
1          Glucose  350.809177  3.205119e-56
8              Age   38.936985  1.146990e-09
5   Chol/HDL ratio   31.242678  4.298115e-08
0      Cholesterol   16.893380  4.827353e-05
6      Systolic BP   15.931795  7.853024e-05
3  Waist/hip ratio   12.348083  4.935038e-04
2              BMI    8.365055  4.040512e-03
4         HDL Chol    5.973355  1.496812e-02
7     Diastolic BP    0.947292  3.310160e-01


In [44]:
# Converting Diabetes column into binary (0 for No Diabetes and 1 for Diabetes)
# Derive the flag from the untouched string labels in df_reduced instead of from the column
# being overwritten: the cell can then be re-run safely. Reading df_standard['Diabetes'] here
# would compare already-converted integers with 'Diabetes' on a second run and set every row to 0.
df_standard['Diabetes'] = df_reduced['Diabetes'].eq('Diabetes').astype(int)
df_standard

Unnamed: 0,Diabetes,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Age
0,0,-0.319013,-0.564655,-0.951944,-0.565995,-0.073401,-0.360132,-0.838071,-0.985822,-1.692029
1,0,-1.372619,-0.527432,-0.360358,-0.702760,-0.536983,-0.533102,-1.276087,-1.875972,-1.692029
2,0,0.218998,-0.601879,0.079539,0.117828,0.216339,-0.302476,-1.188484,-0.837464,-1.631108
3,0,0.420753,-0.192418,-1.391841,-1.249818,1.143504,-0.763729,-0.662865,-1.430897,-1.631108
4,0,-0.969111,-0.304089,-1.300828,-0.839524,0.969660,-1.224982,-0.662865,0.201045,-1.631108
...,...,...,...,...,...,...,...,...,...,...
385,0,0.443170,-0.043523,-0.542385,-0.018937,-0.363140,0.389404,0.563581,0.497761,2.206885
386,1,0.420753,3.194941,1.323387,-0.429231,0.100443,-0.129506,0.300771,0.349403,2.267806
387,0,2.102039,-0.322701,-1.073295,-1.660112,3.924999,-1.109668,3.542092,0.497761,2.572409
388,1,0.555256,1.426814,-0.724411,0.528122,3.693208,-1.455608,1.439613,-0.095672,2.694250


In [45]:
# Number of rows for positive diabetes
positive_rows = df_standard[df_standard['Diabetes'] == 1]
positive_diabetes = len(positive_rows)
print('Number of rows for positive diabetes: ', positive_diabetes)

# Sample negative cases to match positive cases
# Draw without replacement; the fixed seed keeps the chosen subset reproducible
negative_diabetes = df_standard.loc[df_standard['Diabetes'] == 0]
negative_diabetes_downsampled = resample(
    negative_diabetes,
    replace=False,
    n_samples=positive_diabetes,
    random_state=42,
)

# Put positive and negative diabetes case into one dataframe
balanced = pd.concat([negative_diabetes_downsampled, positive_rows])
# Show a few random rows of the balanced frame
balanced.sample(5)

Number of rows for positive diabetes:  60


Unnamed: 0,Diabetes,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Age
182,0,1.048433,-0.434372,-0.345189,-0.839524,0.100443,0.216434,-0.53146,-0.985822,-0.229936
213,1,-0.767356,2.1899,0.837983,0.117828,-1.116461,0.620031,0.563581,1.091194,0.013746
58,0,-0.632853,-0.564655,-0.451371,1.211945,-0.015453,-0.533102,-0.662865,-1.13418,-1.082823
114,0,0.375918,-0.41576,0.231228,0.254593,-1.174409,1.715507,0.563581,1.165374,-0.65638
43,0,0.689758,-0.601879,0.671125,-0.429231,-0.826722,1.196597,-0.312452,-0.24403,-1.204665


In [46]:
# Verify that both classes now contain the same number of rows
balanced['Diabetes'].value_counts()

Unnamed: 0_level_0,count
Diabetes,Unnamed: 1_level_1
0,60
1,60


In [47]:
#Splitting the dataset
# Binary target, and Glucose as the only predictor (double brackets keep x two-dimensional)
y = balanced['Diabetes']
x = balanced.loc[:, ['Glucose']]

# Hold out 20% of the balanced rows for testing, with a fixed seed for reproducibility
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [48]:
#Creating the classifier and predicting
# NOTE(review): n_neighbors = 9 is hard-coded. The earlier grid search (run on all nine features
# of the unbalanced data) reported n_neighbors = 3 as best; confirm which value is intended
# for this single-feature, balanced setup.
knn = KNeighborsClassifier(n_neighbors = 9)
knn.fit(xtrain, ytrain)
ypred = knn.predict(xtest)
accuracy = accuracy_score(ytest, ypred)
# Fixed: removed the stray space flag from the format spec, which printed an extra blank
# before the number and differed from the accuracy report of the first classifier.
print(f'Accuracy: {accuracy:.2%}')

Accuracy:  95.83%


Conclusion: By training on the feature with the highest F-score (Glucose) and choosing the number of neighbors with the help of cross-validation, we obtain a KNN classifier that reaches 95.83% accuracy on the held-out test split of the balanced dataset!