## K Nearest Neighbours Classifier


In [None]:
#Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

Let's load the dataset. We will read it from the URL here as well, and we will also name the columns.

In [None]:
# Location of the raw CSV and the titles to give its columns.
# header=None below means the first line of the file is read as data,
# so the column titles have to be supplied through `names`.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
df = pd.read_csv(url, names=column_names, header=None)

# Preview the first few rows to get a visual feel for the dataset
print("First few rows of the dataset: ")
print(df.head())

# Count the null entries in every column
missing_values = df.isna().sum()
print("\nMissing values in each column:")
print(missing_values)


Exploratory Data Analysis so that we can better understand our dataset

In [None]:
# Outcome distribution: one bar per value of the target column
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Outcome', ax=ax)
ax.set_title('Outcome Count')
plt.show()

# One histogram per column of the frame
df.hist(figsize=(20, 15), bins=30)
plt.show()

# Pairwise correlation between all columns, annotated to two decimals
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

Data Balancing and Scaling

In [None]:
# Separate the features from the target variable
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Hold out the test set BEFORE any resampling. If SMOTE ran on the full data
# first, synthetic points interpolated from rows that later land in the test
# set would leak into training, and the test set itself would contain
# synthetic samples, so the reported metrics would be optimistic.
# stratify=y keeps the class ratio of the original data in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance using SMOTE, on the training portion only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
y_train = y_resampled

# Scale the features: the scaler is fitted on the (resampled) training data
# and the same fitted transform is then applied to the untouched test data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_resampled)
X_test = scaler.transform(X_test_raw)

Model Building and Training

We will build the K Nearest Neighbours (KNN) model

In [None]:
# Baseline model: no arguments are passed, so the library defaults are used.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn

Model Evaluation

In [None]:
# Predict diabetes on the test data.
# y_pred holds the hard class labels; y_pred_proba holds column 1 of
# predict_proba, i.e. the probability of the second class in knn.classes_
# (presumably Outcome == 1 -- confirm against the data).
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)[:, 1]

# Print Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# Print Classification Report
cr = classification_report(y_test, y_pred)
print("\nClassification Report: \n", cr)

# ROC Curve, built from the predicted probabilities rather than the hard labels
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, marker='*')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# AUC Score
auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC Score: {auc}")

Hyperparameter tuning with Cross-Validation

In [None]:
# Define the parameter grid.
# Keys must match KNeighborsClassifier's constructor argument names exactly
# (scikit-learn spells it `n_neighbors`, without the "u").
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
}

# Initialize GridSearchCV: 5-fold cross-validation, candidates ranked by ROC AUC
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='roc_auc',
)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the optimal model from Grid search
best_knn = grid_search.best_estimator_

# Print best parameters
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)

# Evaluate the best model on the held-out test data.
# y_pred_proba_best is column 1 of predict_proba, i.e. the probability of the
# second class in best_knn.classes_ (presumably Outcome == 1 -- confirm).
y_pred_best = best_knn.predict(X_test)
y_pred_proba_best = best_knn.predict_proba(X_test)[:, 1]

# Confusion Matrix for the best model
best_cm = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix for the best model:\n", best_cm)

# Classification Report for the best model
best_cr = classification_report(y_test, y_pred_best)
print("Classification Report for the best model:\n", best_cr)

# ROC Curve for the best model
fpr_best, tpr_best, thresholds_best = roc_curve(y_test, y_pred_proba_best)
plt.figure(figsize=(10, 6))
plt.plot(fpr_best, tpr_best, marker='*')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for the best Model')
plt.show()

# AUC Score for the best model
best_auc = roc_auc_score(y_test, y_pred_proba_best)
print(f"AUC Score for the best model: {best_auc}")

      