In [2]:
# Ensure that XGBoost is installed on your PC:
# !pip install pandas scikit-learn imblearn xgboost

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 6.3 MB/s eta 0:00:24
    --------------------------------------- 2.1/150.0 MB 5.6 MB/s eta 0:00:27
    --------------------------------------- 2.6/150.0 MB 5.8 MB/s eta 0:00:26
    --------------------------------------- 3.7/150.0 MB 4.8 MB/s eta 0:00:31
   - -------------------------------------- 4.7/150.0 MB 4.8 MB/s eta 0:00:31
   - -------------------------------------- 5.2/150.0 MB 4.4 MB/s eta 0:00:34
   - -------------------------------------- 6.3/150.0 MB 4.4 MB/s eta 0:00:33
   -- ------------------------------------- 7.6/150.0 MB 4.7 MB/s eta 0:00:30
   -- -------

In [2]:
from pandas import DataFrame
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler


def drop_empty_items(data_frame: DataFrame) -> DataFrame:
    data_frame = data_frame.drop(['Insulin'], axis=1)
    return data_frame[(data_frame['Glucose'] != 0) & (data_frame['BloodPressure'] != 0) & (data_frame['BMI'] != 0)]


def scale_features(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test


def print_score(y_test, y_pred) -> None:
    # Calculate other evaluation metrics for test set
    print(f"SIMPLE XGBoost")
    print("===========================================================================")
    print("\nClassification Report:\n",
          classification_report(y_test, y_pred, target_names=['non-diabetic', 'diabetic']))
    print("XGBoost Model Evaluation:")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print(f"Precision (Weighted): {precision:.6f}")
    print(f"Recall (Weighted): {recall:.6f}")
    print(f"F1-Score (Weighted): {f1:.6f}")

    print("Class distribution before SMOTE:", y_train.value_counts().to_dict())
    print("Class distribution after SMOTE:", pd.Series(y_train_smote).value_counts().to_dict())

    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"ROC-AUC: {roc_auc:.6f}")

    # Print confusion matrix with class labels
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_names = ['non-diabetic', 'diabetic']
    print("\nConfusion Matrix with Class Labels:")
    print(pd.DataFrame(conf_matrix, index=class_names, columns=class_names))

    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp)
    print("Specificity: ", specificity)
    sensitivity = tp / (tp + fn)
    print("Sensitivity: ", sensitivity)

In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE

features = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'SkinThickness', 'DiabetesPedigreeFunction', 'Age']

df = pd.read_csv('data/diabetes.csv')
df = drop_empty_items(df)

# Feature/Output Separation
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale Features
X_train, X_test = scale_features(X_train, X_test)

# SMOTE - Artificial Data Creation
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'colsample_bytree': [0.8, 1]
}

# Initialize Random Forest and Grid Search
## Max Iteration Has not converged in 200, so changing it to 2000
## ERROR:
##  C:\Users\shoun\miniconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:690: ConvergenceWarning: Stochastic
##  Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.

grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit Grid Search on SMOTE-balanced training data
grid_search.fit(X_train_smote, y_train_smote)

# Print best parameters and score
print("\nBest Hyperparameters:")
print(grid_search.best_params_)
print(f"Best Cross-Validation F1-Weighted Score: {grid_search.best_score_:.4f}")

# Train best model on full SMOTE training data
best_rf = grid_search.best_estimator_
best_rf.fit(X_train_smote, y_train_smote)

# Evaluate best model on test set
y_pred = best_rf.predict(X_test)
print_score(y_test, y_pred)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Hyperparameters:
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best Cross-Validation F1-Weighted Score: 0.7996
SIMPLE XGBoost

Classification Report:
               precision    recall  f1-score   support

non-diabetic       0.89      0.80      0.85       102
    diabetic       0.62      0.77      0.69        43

    accuracy                           0.79       145
   macro avg       0.76      0.79      0.77       145
weighted avg       0.81      0.79      0.80       145

XGBoost Model Evaluation:
Accuracy:  0.7931034482758621
Precision (Weighted): 0.811632
Recall (Weighted): 0.793103
F1-Score (Weighted): 0.798547
Class distribution before SMOTE: {0: 373, 1: 206}
Class distribution after SMOTE: {0: 373, 1: 373}
ROC-AUC: 0.785682

Confusion Matrix with Class Labels:
              non-diabetic  diabetic
non-diabetic            82        20
diabetic                10        33
Specif