In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [3]:
# Read the raw data from the CSV file (path is relative to the working directory).
csv_path = "diabetes.csv"
dataset = pd.read_csv(csv_path)


In [12]:
# Step 1: Data cleaning - replace invalid zeros with the column mean.
# In this dataset some features (Glucose, BloodPressure, ...) contain zeros, which are
# not realistic values for medical measurements (e.g., you can't have 0 blood pressure).
# Those zeros are treated as missing values and replaced with the mean of the column.
#
# zero_not_accepted: list of columns where 0 is invalid.
#
# For each of those columns:
#   1. replace(0, np.nan) converts the zeros to missing values;
#   2. mean() averages the remaining readings (NaNs are skipped by default);
#   3. fillna(mean) fills the missing values with that average.
#
# The mean is deliberately kept as a float: wrapping it in int() truncates it toward
# zero, so the imputed value would no longer be the column mean, and int() raises
# ValueError when the mean is NaN (a column with no non-zero readings).
#
# NOTE(review): the means are computed on the full dataset, i.e. before the train/test
# split done later in the notebook, so test rows influence the imputed values.
# Consider imputing after the split with statistics from the training rows only.

zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0, np.nan)
    mean = dataset[column].mean()
    dataset[column] = dataset[column].fillna(mean)

In [13]:
# Step 2: Separate the predictors from the label, selecting columns by position.
n_features = 8
X = dataset.iloc[:, :n_features]   # columns 0-7: features
y = dataset.iloc[:, n_features]    # column 8: target column (Outcome)


In [7]:
# Step 3: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
test_fraction = 0.2
seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=seed
)

In [14]:
# Step 4: Feature scaling
# Standardization puts every numeric feature on the same scale so that no feature
# dominates the others merely because its raw values are larger.
#
# This matters for distance-based algorithms such as KNN: without scaling, features
# with large values (e.g. Glucose, Age) would influence the distance far more than
# features with small values (e.g. DiabetesPedigreeFunction).

# fit(): learn the per-feature mean and standard deviation from X_train only.
# The statistics come from the training data alone to prevent data leakage -
# the test data must not be looked at during training.
scaler = StandardScaler().fit(X_train)

# transform(): standardize both sets with the statistics learned from the training
# data, so train and test end up on the same scale. Both variables are overwritten
# with their scaled versions.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)



In [15]:
# Step 5: Train the K-nearest-neighbours classifier on the scaled training data.
neighbor_count = 5  # number of neighbours that vote on each prediction
knn = KNeighborsClassifier(n_neighbors=neighbor_count)
# Left as the last expression so the cell displays the fitted estimator.
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
# Step 6: Predict a class label for every row of the scaled test set.
y_pred = knn.predict(X=X_test)

In [17]:
# Step 7: Evaluation - compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)            # fraction of correct predictions
report = classification_report(y_test, y_pred)       # per-class precision / recall / f1
matrix = confusion_matrix(y_test, y_pred)            # rows: true class, columns: predicted class

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

Accuracy: 0.7597402597402597

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.81      0.82       107
           1       0.60      0.64      0.62        47

    accuracy                           0.76       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.76      0.76      0.76       154


Confusion Matrix:
 [[87 20]
 [17 30]]
