# KNN for PIMA DIABETES DATASET

## Libraries

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Importing Dataset

In [23]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "diabetes.csv"

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "mragpavank/diabetes",
  file_path,
)

df.head()

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'diabetes' dataset.


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
# df.columns
print((df["Glucose"] == 0).sum())
print((df["BloodPressure"] == 0).sum())
print((df["SkinThickness"] == 0).sum())
print((df["Insulin"] == 0).sum())
print((df["BMI"] == 0).sum())
print((df["DiabetesPedigreeFunction"] == 0).sum())
print((df["Age"] == 0).sum())

5
35
227
374
11
0
0


## Cleaning and Scaling the Dataset -

Some attributes such as glucose, BP, BMI, etc cannot practically have 0 values, hence are **imputed** ( replaced with statistically reasonable values ).


In [25]:
#we take the median of non-zero values for the columns with lesser zero values.

Glucose_median = df["Glucose"].replace(0, pd.NA).median()   # replaces with NA temporarily (only until it counts)
df["Glucose"] = df["Glucose"].replace(0, Glucose_median)

BloodPressure_median = df["BloodPressure"].replace(0, pd.NA).median()
df["BloodPressure"] = df["BloodPressure"].replace(0, BloodPressure_median)

bmi_median = df["BMI"].replace(0, pd.NA).median()
df["BMI"] = df["BMI"].replace(0, bmi_median)

In [26]:
# Columns with many zero values are applied with KNN Imputer - where rows with similar values in other columns are found and averaged.

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

knn_pipeline = Pipeline([
    ("scaler", StandardScaler()), # scaled before imputing so that all other columns have equal importance while comparing
    ("imputer", KNNImputer(n_neighbors=5))
])

df["SkinThickness"] = knn_pipeline.fit_transform(df[["SkinThickness"]])
df["Insulin"] = knn_pipeline.fit_transform(df[["Insulin"]])

## Train-Val-Test Split

In [27]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Outcome"]) # weights
y = df["Outcome"] # values


X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
335,0,165,76,1.409094,1.521248,47.9,0.259,26
467,0,97,64,0.969998,0.175399,36.8,0.600,25
51,1,101,50,-0.347291,-0.380306,24.2,0.526,26
131,9,122,56,-1.288212,-0.692891,33.3,1.114,33
649,0,107,60,0.279989,-0.692891,26.4,0.133,23
...,...,...,...,...,...,...,...,...
703,2,129,72,-1.288212,-0.692891,38.5,0.304,41
412,1,143,84,0.154533,1.998807,42.4,1.076,22
38,2,90,68,1.346366,-0.692891,38.2,0.503,27
386,5,116,74,0.530902,-0.692891,32.3,0.660,35


## Model Pipeline

In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

knn_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(
        n_neighbors=9,
        weights="uniform",
        metric="euclidean"
    )
)

knn_pipeline.fit(X_train, y_train)
y_val_pred = knn_pipeline.predict(X_val)

## Evaluation Metrics

In [37]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_val_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_val_pred))
print("\nClassification Report:\n", classification_report(y_test, y_val_pred))

Accuracy: 0.5714285714285714

Confusion Matrix:
 [[78 21]
 [45 10]]

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.79      0.70        99
           1       0.32      0.18      0.23        55

    accuracy                           0.57       154
   macro avg       0.48      0.48      0.47       154
weighted avg       0.52      0.57      0.53       154



## Hyperparameter Tuning

In [31]:
from sklearn.model_selection import GridSearchCV

# pipeline = knn_pipeline([
#     ("scaler", StandardScaler()),
#     ("knn", KNeighborsClassifier())
# ])

param_grid = {
    "kneighborsclassifier__n_neighbors": [3,5,7,9,11,15,21],
    "kneighborsclassifier__weights": ["uniform", "distance"],
    "kneighborsclassifier__metric": ["euclidean", "manhattan", "minkowski"]
}

grid_knn = GridSearchCV(
    knn_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_knn.fit(X_train, y_train)

print("Best parameters:", grid_knn.best_params_)
print("Best CV accuracy:", grid_knn.best_score_)

Best parameters: {'kneighborsclassifier__metric': 'euclidean', 'kneighborsclassifier__n_neighbors': 9, 'kneighborsclassifier__weights': 'uniform'}
Best CV accuracy: 0.7608695652173912
