In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import load_diabetes

In [2]:
# Pima Indians Diabetes data. Reference page for the dataset:
# https://www.kaggle.com/uciml/pima-indians-diabetes-database
# The CSV itself is fetched from the URL below. Column labels are supplied
# here, so the file is parsed as header-less (every line is treated as data).
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = "preg plas pres skin test mass pedi age class".split()
dataset = pd.read_csv(url, header=None, names=names)

In [3]:
# Features are every column except the last; the last column is the label.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# KNN is distance-based, so features are standardised first. The scaler is
# fitted on the training rows only and then applied to both splits.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so re-running this cell would re-fit `scaler` on already-scaled data.
# Re-run from the split cell instead.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# KNN classifier; n_neighbors is the value to experiment with.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=5)

In [5]:
# Fit on the scaled training split, then label the scaled test split.
# `fit` returns the estimator itself, so the two steps can be chained.
y_pred = knn.fit(X_train, y_train).predict(X_test)


In [6]:
# Report held-out performance: overall accuracy first, then the confusion
# matrix and the per-class precision/recall table.
print("Accuracy:", accuracy_score(y_test, y_pred))
for heading, body in (
    ("\nConfusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(body)

Accuracy: 0.6948051948051948

Confusion Matrix:
[[79 20]
 [27 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.58      0.51      0.54        55

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.66       154
weighted avg       0.69      0.69      0.69       154



In [7]:
# to make new predictions (example)
def predict_diabetes(new_data):
    """
    Predict diabetes for one or more new data points.

    Parameters
    ----------
    new_data : array-like
        Either a 2D array-like with one row per sample, or a single sample
        given as a flat (1D) sequence of feature values. The features must be
        the same ones, in the same order, as the training data.

    Returns
    -------
    prediction
        Output of ``knn.predict`` for the scaled rows (one label per row).
    proba
        Output of ``knn.predict_proba`` for the scaled rows.

    Raises
    ------
    ValueError
        If ``new_data`` is neither 1D nor 2D.
    """
    samples = np.asarray(new_data)

    if samples.ndim == 1:
        # A single flat sample: promote it to a one-row 2D array, which is
        # the layout the scaler and the classifier expect.
        samples = samples.reshape(1, -1)
    elif samples.ndim != 2:
        raise ValueError(
            "new_data must be a 1D sample or a 2D array of samples, "
            f"got an array with {samples.ndim} dimension(s)"
        )

    # Scale the new data using the same scaler that was fitted on the
    # training data, so the distances are computed in the same feature space.
    new_data_scaled = scaler.transform(samples)
    prediction = knn.predict(new_data_scaled)
    proba = knn.predict_proba(new_data_scaled)
    return prediction, proba