In [1]:
# Import necessary libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the diabetes CSV (path is relative to the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='../../Input/diabetes_clean_with_distribution.csv')

In [3]:
# Data preprocessing
# The 'Outcome' column is the target; every remaining column is used as a feature
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [4]:
# KNN is distance-based, so the features are standardized first.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Candidate neighbour counts for the grid search: k = 1 through 20
param_grid = dict(n_neighbors=range(1, 21))  # widen or narrow the range as needed

In [6]:
# Baseline KNN classifier with a hand-picked neighbour count
k = 3  # baseline number of neighbours (k); adjust as needed
model = KNeighborsClassifier(n_neighbors=k)

# Search the candidate k values in param_grid using 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Fit the baseline classifier itself on the training data
model.fit(X_train, y_train)

In [7]:
# Neighbour count selected by the grid search
best_k = grid_search.best_params_['n_neighbors']

# Build a fresh KNN classifier with that k and fit it on the training data
best_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

In [8]:
# Predict test-set labels with the baseline classifier (`model`)
y_pred = model.predict(X=X_test)

In [9]:
# Score the predictions currently held in y_pred against the true test labels
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)  # percentage, 2 decimals
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the accuracy, then each labelled section with its heading on its own line
print(f"Accuracy: {accuracy}%")
print("Confusion Matrix:", confusion, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

Accuracy: 72.08%
Confusion Matrix:
[[78 19]
 [24 33]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.80      0.78        97
           1       0.63      0.58      0.61        57

    accuracy                           0.72       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.72      0.72      0.72       154



In [10]:
# Predict test-set labels with the tuned classifier (`best_model`); this overwrites y_pred
y_pred = best_model.predict(X=X_test)

In [11]:
# Score the predictions currently held in y_pred against the true test labels
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)  # percentage, 2 decimals
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the selected k and accuracy, then each labelled section with its heading on its own line
print(f"Best k: {best_k}")
print(f"Accuracy with best k: {accuracy}%")
print("Confusion Matrix:", confusion, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

Best k: 15
Accuracy with best k: 80.52%
Confusion Matrix:
[[87 10]
 [20 37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        97
           1       0.79      0.65      0.71        57

    accuracy                           0.81       154
   macro avg       0.80      0.77      0.78       154
weighted avg       0.80      0.81      0.80       154



In [12]:
# Predict the condition for an individual person
# Source values: 1,115,70,30,96,34.6,0.529,32,1
# NOTE(review): the trailing 1 is presumably the recorded Outcome for this row — confirm.
individual = pd.DataFrame({
    'Pregnancies': [1],
    'Glucose': [115],
    'BloodPressure': [70],
    'SkinThickness': [30],
    'Insulin': [96],
    'BMI': [34.6],
    'DiabetesPedigreeFunction': [0.529],
    'Age': [32]
})

# best_model was fitted on StandardScaler-transformed features, so the raw
# measurements must pass through the same fitted scaler before prediction;
# feeding unscaled values makes the neighbour distances meaningless.
# NOTE(review): assumes these column names and their order match the training
# features (every CSV column except 'Outcome') — confirm against the dataset.
individual_scaled = scaler.transform(individual)

# predict returns one label per input row; there is a single row here
individual_report = best_model.predict(individual_scaled)
if individual_report[0] == 0:
    print('congrats... You\'re allowed to eat sweets!')
else:
    print('Time to take your diet into consideration, Champ!')

Time to take your diet into consideration, Champ!




### Conclusion: The fine-tuned model (k = 15) reaches 80.52% test accuracy, compared with 72.08% for the base model (k = 3)