# DIABETES PREDICTION (Classification Problem)

### Analyzing Diagnostic Factors Using BOOSTING ALGORITHMS

In [None]:
#Libraries

import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib
from pickle import dump
import math
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

**STEP 1: PROBLEM STATEMENT & DATA COLLECTION**

***1.1 PROBLEM STATEMENT***

**Goal** -  predict based on diagnostic measures whether or not a patient has diabetes.

**6.2 SAVING THE CSV FILES**

In [None]:
#dfs_train = {
    #'X_train_with_outliers_sel': X_train_with_outliers_sel,
    #'X_train_without_outliers_sel': X_train_without_outliers_sel,
    #'X_train_with_outliers_norm_sel': X_train_with_outliers_norm_sel,
    #'X_train_without_outliers_norm_sel': X_train_without_outliers_norm_sel,
    #'X_train_with_outliers_minmax_sel': X_train_with_outliers_minmax_sel,
    #'X_train_without_outliers_minmax_sel': X_train_without_outliers_minmax_sel 
#}

#dfs_test = {
#    'X_test_with_outliers_sel': X_test_with_outliers_sel,
#    'X_test_without_outliers_sel': X_test_without_outliers_sel,
#    'X_test_with_outliers_norm_sel': X_test_with_outliers_norm_sel,
#    'X_test_without_outliers_norm_sel': X_test_without_outliers_norm_sel,
#    'X_test_with_outliers_minmax_sel': X_test_with_outliers_minmax_sel,
#    'X_test_without_outliers_minmax_sel': X_test_without_outliers_minmax_sel    
#}

#for name, df in dfs_train.items():
#    df.to_csv(f"../data/processed/{name}.csv", index=False)

#for name, df in dfs_test.items(): 
#    df.to_csv(f'../data/processed/{name}.csv', index=False)

In [None]:
# Load the processed datasets

# WITH outliers
X_train_with_outliers = pd.read_csv("../data/processed/X_train_with_outliers.csv")
X_test_with_outliers = pd.read_csv("../data/processed/X_test_with_outliers.csv")

# WITHOUT outliers
X_train_without_outliers = pd.read_csv("../data/processed/X_train_without_outliers.csv")
X_test_without_outliers = pd.read_csv("../data/processed/X_test_without_outliers.csv")

# TARGET VARIABLE
y_train = pd.read_csv("../data/processed/y_train.csv")
y_test = pd.read_csv("../data/processed/y_test.csv")


## MACHINE LEARNING

 ## **BOOSTING FOR CLASSIFICATION**

In [13]:
# Train with the dataset WITH outliers
model_with_outliers = XGBClassifier(random_state=42)
model_with_outliers.fit(X_train_with_outliers, y_train)
y_pred_with_outliers = model_with_outliers.predict(X_test_with_outliers)
accuracy_with_outliers = accuracy_score(y_test, y_pred_with_outliers)
print(f"Accuracy WITH outliers: {accuracy_with_outliers}")

# Train with the dataset WITHOUT outliers
model_without_outliers = XGBClassifier(random_state=42)
model_without_outliers.fit(X_train_without_outliers, y_train)
y_pred_without_outliers = model_without_outliers.predict(X_test_without_outliers)
accuracy_without_outliers = accuracy_score(y_test, y_pred_without_outliers)
print(f"Accuracy WITHOUT outliers: {accuracy_without_outliers}")



Accuracy WITH outliers: 0.7662337662337663
Accuracy WITHOUT outliers: 0.7727272727272727


In [None]:
# Model evaluation WITH outliers
print("\nMetrics for the model WITH outliers:")
print(f"Accuracy: {accuracy_with_outliers}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_with_outliers))
print("Classification Report:")
print(classification_report(y_test, y_pred_with_outliers))

# Model evaluation WITHOUT outliers
print("\nMetrics for the model WITHOUT outliers:")
print(f"Accuracy: {accuracy_without_outliers}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_without_outliers))
print("Classification Report:")
print(classification_report(y_test, y_pred_without_outliers))



Metrics for the model WITH outliers:
Accuracy: 0.7662337662337663
Confusion Matrix:
[[77 19]
 [17 41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81        96
           1       0.68      0.71      0.69        58

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154


Metrics for the model WITHOUT outliers:
Accuracy: 0.7727272727272727
Confusion Matrix:
[[76 20]
 [15 43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.81        96
           1       0.68      0.74      0.71        58

    accuracy                           0.77       154
   macro avg       0.76      0.77      0.76       154
weighted avg       0.78      0.77      0.77       154



#### DECISION:

Based on metrics:

The model WITHOUT outliers is preferable as it performs slightly better in terms of recall and F1-Score for class 1, which is important when we want to capture more positive examples (class 1) with fewer errors.

#### **OPTIMIZING BOOSTING ALGORITHM MODEL**

In [None]:
# Base model
model = XGBClassifier(random_state=42)

# Hyperparameters
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 1.0],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.8, 1.0]
}

# Optimization with GridSearch
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train_without_outliers, y_train)


print(f"Best Hyperparameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150, 'subsample': 1.0}


In [26]:
# Train the final model with the best hyperparameters

# Training the final model
best_model.fit(X_train_without_outliers, y_train)

# Predicting the final model
y_pred_final = best_model.predict(X_test_without_outliers)


In [23]:
# Accuracy
accuracy_final = accuracy_score(y_test, y_pred_final)
print(f"Final Model Accuracy: {accuracy_final}")

# Matriz de Confusão
print("\nFinal Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))

# Relatório de Classificação
print("\nFinal Model Classification Report:")
print(classification_report(y_test, y_pred_final))


Final Model Accuracy: 0.7662337662337663

Final Model Confusion Matrix:
[[89  7]
 [29 29]]

Final Model Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.93      0.83        96
           1       0.81      0.50      0.62        58

    accuracy                           0.77       154
   macro avg       0.78      0.71      0.72       154
weighted avg       0.77      0.77      0.75       154



#### **Saving Optimized Boosting Algorithms model**

In [27]:
# Saving the final model
best_model.save_model("xgb_final_model_without_outliers.json")
print("Final model saved successfully!")


Final model saved successfully!
