In [1]:
!pip install pandas numpy scikit-learn



In [21]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# --- Load -------------------------------------------------------------------
# Read the diabetes records from the CSV file in the working directory.
diabetes = pd.read_csv('diabetes.csv')

# 'Outcome' is the label (0 = no diabetes, 1 = diabetes); every remaining
# column is used as a feature.
y = diabetes['Outcome']
X = diabetes.drop(columns='Outcome')

# --- Split ------------------------------------------------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Scale ------------------------------------------------------------------
# KNN is distance-based, so the features are standardized first. The scaler
# learns its statistics from the training rows only and then applies the same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Fit and predict --------------------------------------------------------
# 5-nearest-neighbours classifier; fit() returns the fitted estimator itself.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# --- Report -----------------------------------------------------------------
# Accuracy is computed once and reused for the error rate (1 - accuracy).
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", round(accuracy, 4))
print("Precision:", round(precision_score(y_test, y_pred), 4))
print("Recall:", round(recall_score(y_test, y_pred), 4))
print("Error Rate:", round(1 - accuracy, 4))


Confusion Matrix:
 [[79 20]
 [27 28]]
Accuracy: 0.6948
Precision: 0.5833
Recall: 0.5091
Error Rate: 0.3052
