In [1]:
# Import the libraries used throughout the notebook
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the cleaned diabetes dataset into a DataFrame.
# The path is relative to the directory the notebook is run from.
csv_path = '../../Input/diabetes_clean_with_distribution.csv'
data = pd.read_csv(csv_path)

In [3]:
# Data Preprocessing
# The 'Outcome' column is the target; every other column is used as a feature.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [4]:
# Hyperparameter search space: three candidate values for each of four
# parameters, i.e. 3 ** 4 = 81 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    gamma=[0, 0.1, 0.5],
)

In [5]:
# Baseline model: an XGBoost classifier created with the library's default
# hyperparameters (no arguments are passed to the constructor).
model = XGBClassifier()

# Fit the baseline on the training split. This fitted model is the reference
# point that the tuned model is compared against later in the notebook.
model.fit(X_train, y_train)

In [6]:
# Tune the hyperparameters using GridSearchCV
# Exhaustive search over every combination in `param_grid`, each scored with
# 5-fold cross-validation on the training split only (the test split is never
# seen during tuning). `model` is used as the estimator template.
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so the tuned model is already available; reuse it
# instead of constructing and training a second, identical classifier.
best_model = grid_search.best_estimator_

# Display the tuned estimator as the cell output.
best_model

In [7]:
# Make predictions
# Predict the held-out test split with the baseline (untuned) `model`;
# the tuned `best_model` is evaluated separately further down.
y_pred = model.predict(X_test)

In [8]:
# Evaluate the Model
# Share of test rows whose predicted label matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100: .2f}%')

Accuracy:  74.03%


In [9]:
# Evaluate the tuned model on the held-out test split.
# Note: this overwrites `y_pred` and `accuracy` from the baseline evaluation,
# so from here on both names refer to the tuned model's results.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage with two decimals.
print(f'Accuracy: {accuracy * 100: .2f}%')

Accuracy:  82.47%


In [10]:
# Generate a classification report (per-class precision, recall, F1 and
# support) for the predictions currently held in `y_pred`.
report = classification_report(y_test, y_pred)

# The report is a multi-line string: print it so the table is rendered with
# real line breaks instead of showing the escaped repr with literal '\n'.
print(report)

'              precision    recall  f1-score   support\n\n           0       0.84      0.89      0.86        97\n           1       0.79      0.72      0.75        57\n\n    accuracy                           0.82       154\n   macro avg       0.82      0.80      0.81       154\nweighted avg       0.82      0.82      0.82       154\n'

In [11]:
# Predict the condition for an individual person
# Feature values taken from the sample record: 1,115,70,30,96,34.6,0.529,32,1
# (the trailing 1 is presumably that record's Outcome label, so it is not
# passed to the model -- verify against the dataset).
individual = pd.DataFrame({
    'Pregnancies': [1],
    'Glucose': [115],
    'BloodPressure': [70],
    'SkinThickness': [30],
    'Insulin': [96],
    'BMI': [34.6],
    'DiabetesPedigreeFunction': [0.529],
    'Age': [32]
})

# TODO: no missing-value handling or scaling is applied to this record. If the
# training data was preprocessed upstream, the same steps must be applied here
# (no preprocessing helper is defined in this notebook).

# predict() returns one label per input row.
individual_report = best_model.predict(individual)

# Read the single row's label explicitly rather than relying on the truth
# value of an array comparison, which raises once more than one row is passed.
if individual_report[0] == 0:
    print('congrats... You\'re allowed to eat sweets!')
else:
    print('Time to take your diet into consideration, Champ!')

Time to take your diet into consideration, Champ!


### Conclusion: The fine-tuned model outperforms the baseline, reaching 82.47% test accuracy compared with 74.03% for the default XGBoost model.