In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [4]:
df = pd.read_csv('./dataset/processed_dataset.csv')

In [5]:
# Create a copy of the DataFrame to avoid modifying the original
df_encoded = df.copy()

df_encoded.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


### Prepare dataset for model building

In [6]:
x = df_encoded.drop(columns=['bmi', 'HbA1c_level', 'diabetes'])
y = df_encoded['diabetes']

# Verify the remaining features in X
print("Features for model building:")
print(x.columns)

Features for model building:
Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'blood_glucose_level'],
      dtype='object')


### Building model

In [7]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier with chosen k value
k = 5  # You can experiment with different k values
knn_model = KNeighborsClassifier(n_neighbors=k)

# Train the model
knn_model.fit(x_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(x_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9399375975039002
Precision: 0.8201357466063348
Recall: 0.421266705403835
F1 Score: 0.5566218809980806
