The **preprocessing** steps (removing outliers and one-hot encoding categorical variables) have already been accomplished in the Random Forest Model Training so we'll just load in the cleaned dataset

In [1]:
# Making sure XGBoost is installed in the environment
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Load the preprocessed and cleaned dataset
data_filepath = '../data/processed/cleaned_diabetes_one_hot_encoding.csv'
df = pd.read_csv(data_filepath)
print("\nThese are the number of people with and without diabetes")
print(df['diabetes'].value_counts())


These are the number of people with and without diabetes
0    82605
1     6632
Name: diabetes, dtype: int64


In [3]:
print("This is the Data distribution of all variables in the dataset")
print(df.describe()) # Data distribution of variables

This is the Data distribution of all variables in the dataset
                age  hypertension  heart_disease           bmi   HbA1c_level  \
count  89237.000000  89237.000000   89237.000000  89237.000000  89237.000000   
mean      41.665855      0.069086       0.037484     26.357026      5.511976   
std       22.777219      0.253601       0.189947      4.901841      1.060805   
min        0.080000      0.000000       0.000000     14.710000      3.500000   
25%       23.000000      0.000000       0.000000     23.420000      4.800000   
50%       42.000000      0.000000       0.000000     27.320000      5.800000   
75%       60.000000      0.000000       0.000000     28.350000      6.200000   
max       80.000000      1.000000       1.000000     38.500000      9.000000   

       blood_glucose_level      diabetes  gender_Female   gender_Male  \
count         89237.000000  89237.000000   89237.000000  89237.000000   
mean            137.445096      0.074319       0.580062      0.419759  

In [4]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0


## Model Building - Initial

In [5]:
# Separating the features (X) and the target variable (y)
X = df.drop('diabetes', axis=1)
y = df['diabetes']

In [6]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Converting the dataset into XGBoost's DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [7]:
# Specifying the XGBoost model parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',  # Evaluation metric
    'max_depth': 6,  # Depth of the trees
    'eta': 0.3,  # Learning rate
    'seed': 42  # Random seed for reproducibility
}
num_rounds = 100  # Number of boosting rounds

# Training the XGBoost model
bst = xgb.train(params, dtrain, num_rounds)

In [8]:
# Making predictions on the test set
y_pred_prob = bst.predict(dtest)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)  # Thresholding to get binary predictions

In [9]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)  # Use probabilities to compute ROC AUC
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Displaying the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.972658000896459
ROC AUC: 0.9749866407052354
Confusion Matrix:
[[16467    38]
 [  450   893]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     16505
           1       0.96      0.66      0.79      1343

    accuracy                           0.97     17848
   macro avg       0.97      0.83      0.89     17848
weighted avg       0.97      0.97      0.97     17848



### Extensive Hyperparameter Tuning

In [10]:
'''from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# larger grid of hyperparameters to search
param_dist = {
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.2)
}

# Randomized Search on hyper parameters
random_search = RandomizedSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42), 
                                   param_distributions=param_dist, n_iter=100, scoring='roc_auc', 
                                   n_jobs=-1, cv=5, verbose=3, random_state=42)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters and highest ROC AUC
print("Best Parameters:", random_search.best_params_)
print("Best ROC AUC Score:", random_search.best_score_)'''

''' 
Best parameters identfied to avoid tuning again:
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'colsample_bytree': 0.9400154311159197, 'learning_rate': 0.09989013482764068, 'max_depth': 3, 'min_child_weight': 7, 'n_estimators': 161, 'subsample': 0.7300733288106989}
Best ROC AUC Score: 0.9792405387270247
'''

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'colsample_bytree': 0.9400154311159197, 'learning_rate': 0.09989013482764068, 'max_depth': 3, 'min_child_weight': 7, 'n_estimators': 161, 'subsample': 0.7300733288106989}
Best ROC AUC Score: 0.9792405387270247


' \nBest parameters identfied to avoid tuning again:\n\n'

In [11]:
# Use the best parameters identified from the tuning process directly
optimized_model = xgb.XGBClassifier(
    colsample_bytree=0.9400154311159197,
    learning_rate=0.09989013482764068,
    max_depth=3,
    min_child_weight=7,
    n_estimators=161,
    subsample=0.7300733288106989,
    objective='binary:logistic',
    seed=42
)

optimized_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = optimized_model.predict(X_test)
y_pred_prob = optimized_model.predict_proba(X_test)[:, 1]

# Compute and print the evaluation metrics
accuracy_optimized = accuracy_score(y_test, y_pred)
roc_auc_optimized = roc_auc_score(y_test, y_pred_prob)
conf_matrix_optimized = confusion_matrix(y_test, y_pred)
class_report_optimized = classification_report(y_test, y_pred)

print(f"Optimized Model Accuracy: {accuracy_optimized}")
print(f"Optimized Model ROC AUC: {roc_auc_optimized}")
print("Optimized Model Confusion Matrix:")
print(conf_matrix_optimized)
print("Optimized Model Classification Report:")
print(class_report_optimized)


Optimized Model Accuracy: 0.9731622590766472
Optimized Model ROC AUC: 0.9776822069081258
Optimized Model Confusion Matrix:
[[16488    17]
 [  462   881]]
Optimized Model Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     16505
           1       0.98      0.66      0.79      1343

    accuracy                           0.97     17848
   macro avg       0.98      0.83      0.89     17848
weighted avg       0.97      0.97      0.97     17848

[CV 1/5] END colsample_bytree=0.749816047538945, learning_rate=0.20014286128198325, max_depth=5, min_child_weight=8, n_estimators=288, subsample=0.8387400631785948;, score=0.974 total time=  15.9s
[CV 1/5] END colsample_bytree=0.7123738038749523, learning_rate=0.1185392166316497, max_depth=3, min_child_weight=1, n_estimators=444, subsample=0.9947547746402069;, score=0.977 total time=  11.7s
[CV 2/5] END colsample_bytree=0.7901480892728447, learning_rate=0.12265511439527674, m

[CV 5/5] END colsample_bytree=0.996884623716487, learning_rate=0.13349630192554332, max_depth=4, min_child_weight=6, n_estimators=352, subsample=0.7727780074568463;, score=0.977 total time=  17.9s
[CV 2/5] END colsample_bytree=0.7123738038749523, learning_rate=0.1185392166316497, max_depth=3, min_child_weight=1, n_estimators=444, subsample=0.9947547746402069;, score=0.979 total time=  11.7s
[CV 5/5] END colsample_bytree=0.9022204554172195, learning_rate=0.0557596330983245, max_depth=9, min_child_weight=3, n_estimators=185, subsample=0.9521871356061031;, score=0.978 total time=  17.0s
[CV 5/5] END colsample_bytree=0.695824756266789, learning_rate=0.03897897441824462, max_depth=6, min_child_weight=6, n_estimators=346, subsample=0.8688542189623514;, score=0.979 total time=  18.5s
[CV 2/5] END colsample_bytree=0.7221455441377573, learning_rate=0.10712275071724532, max_depth=5, min_child_weight=9, n_estimators=260, subsample=0.6072301454462083;, score=0.979 total time=  12.9s
[CV 3/5] END c

[CV 2/5] END colsample_bytree=0.8832290311184181, learning_rate=0.014116898859160489, max_depth=4, min_child_weight=8, n_estimators=393, subsample=0.6003115063364057;, score=0.977 total time=  19.7s
[CV 1/5] END colsample_bytree=0.9160702162124823, learning_rate=0.1311919949562023, max_depth=4, min_child_weight=7, n_estimators=140, subsample=0.9659838702175123;, score=0.977 total time=   5.7s
[CV 4/5] END colsample_bytree=0.885297914889198, learning_rate=0.1621570097233795, max_depth=4, min_child_weight=7, n_estimators=354, subsample=0.7024273291045295;, score=0.978 total time=  15.6s
[CV 4/5] END colsample_bytree=0.881207583558071, learning_rate=0.08272592047585879, max_depth=3, min_child_weight=4, n_estimators=367, subsample=0.7007129183301457;, score=0.980 total time=  12.3s
[CV 5/5] END colsample_bytree=0.6905983100791752, learning_rate=0.13903455808189, max_depth=6, min_child_weight=9, n_estimators=227, subsample=0.8071005402109921;, score=0.977 total time=  12.4s
[CV 2/5] END col

[CV 5/5] END colsample_bytree=0.6053059844639466, learning_rate=0.19844035113697056, max_depth=8, min_child_weight=2, n_estimators=364, subsample=0.6063865008880857;, score=0.970 total time=  29.5s
[CV 4/5] END colsample_bytree=0.9022204554172195, learning_rate=0.0557596330983245, max_depth=9, min_child_weight=3, n_estimators=185, subsample=0.9521871356061031;, score=0.979 total time=  17.0s
[CV 4/5] END colsample_bytree=0.695824756266789, learning_rate=0.03897897441824462, max_depth=6, min_child_weight=6, n_estimators=346, subsample=0.8688542189623514;, score=0.980 total time=  18.9s
[CV 4/5] END colsample_bytree=0.7221455441377573, learning_rate=0.10712275071724532, max_depth=5, min_child_weight=9, n_estimators=260, subsample=0.6072301454462083;, score=0.979 total time=  13.1s
[CV 2/5] END colsample_bytree=0.941203782186944, learning_rate=0.06888977841391714, max_depth=7, min_child_weight=7, n_estimators=260, subsample=0.726768802062511;, score=0.979 total time=  20.7s
[CV 5/5] END c

[CV 5/5] END colsample_bytree=0.6923575302488596, learning_rate=0.05820509320520235, max_depth=6, min_child_weight=7, n_estimators=363, subsample=0.6137554084460873;, score=0.978 total time=  23.3s
[CV 2/5] END colsample_bytree=0.8918424713352255, learning_rate=0.13751149427104264, max_depth=5, min_child_weight=3, n_estimators=326, subsample=0.6478376983753207;, score=0.977 total time=  19.2s
[CV 5/5] END colsample_bytree=0.7988994023569542, learning_rate=0.07017566196335394, max_depth=3, min_child_weight=8, n_estimators=180, subsample=0.706712405710114;, score=0.978 total time=   5.7s
[CV 5/5] END colsample_bytree=0.9933692563579372, learning_rate=0.08976488848891061, max_depth=9, min_child_weight=1, n_estimators=300, subsample=0.7283120259886944;, score=0.974 total time=  32.5s
[CV 2/5] END colsample_bytree=0.6677970986744369, learning_rate=0.12136025249167003, max_depth=9, min_child_weight=6, n_estimators=157, subsample=0.9720067339243328;, score=0.978 total time=  12.3s
[CV 3/5] EN

[CV 1/5] END colsample_bytree=0.8832290311184181, learning_rate=0.014116898859160489, max_depth=4, min_child_weight=8, n_estimators=393, subsample=0.6003115063364057;, score=0.975 total time=  19.4s
[CV 3/5] END colsample_bytree=0.908897907718663, learning_rate=0.04974313630683449, max_depth=9, min_child_weight=3, n_estimators=180, subsample=0.88453678109946;, score=0.979 total time=  16.9s
[CV 3/5] END colsample_bytree=0.7219125032632117, learning_rate=0.04293117062858835, max_depth=5, min_child_weight=9, n_estimators=417, subsample=0.6888431241882921;, score=0.979 total time=  20.5s
[CV 1/5] END colsample_bytree=0.9985014799031697, learning_rate=0.20308387025775876, max_depth=7, min_child_weight=5, n_estimators=259, subsample=0.8532405829093072;, score=0.970 total time=  19.4s
[CV 4/5] END colsample_bytree=0.858188918362867, learning_rate=0.045422135881409795, max_depth=6, min_child_weight=5, n_estimators=105, subsample=0.7480634801021777;, score=0.978 total time=   6.7s
[CV 3/5] END

[CV 4/5] END colsample_bytree=0.6053059844639466, learning_rate=0.19844035113697056, max_depth=8, min_child_weight=2, n_estimators=364, subsample=0.6063865008880857;, score=0.971 total time=  29.1s
[CV 1/5] END colsample_bytree=0.9022204554172195, learning_rate=0.0557596330983245, max_depth=9, min_child_weight=3, n_estimators=185, subsample=0.9521871356061031;, score=0.977 total time=  17.1s
[CV 2/5] END colsample_bytree=0.695824756266789, learning_rate=0.03897897441824462, max_depth=6, min_child_weight=6, n_estimators=346, subsample=0.8688542189623514;, score=0.980 total time=  18.8s
[CV 1/5] END colsample_bytree=0.7221455441377573, learning_rate=0.10712275071724532, max_depth=5, min_child_weight=9, n_estimators=260, subsample=0.6072301454462083;, score=0.976 total time=  13.1s
[CV 4/5] END colsample_bytree=0.606182646611547, learning_rate=0.1956637125175451, max_depth=3, min_child_weight=5, n_estimators=194, subsample=0.9854479908357011;, score=0.979 total time=   4.8s
[CV 2/5] END c

[CV 4/5] END colsample_bytree=0.8832290311184181, learning_rate=0.014116898859160489, max_depth=4, min_child_weight=8, n_estimators=393, subsample=0.6003115063364057;, score=0.978 total time=  20.0s
[CV 5/5] END colsample_bytree=0.9160702162124823, learning_rate=0.1311919949562023, max_depth=4, min_child_weight=7, n_estimators=140, subsample=0.9659838702175123;, score=0.979 total time=   6.1s
[CV 2/5] END colsample_bytree=0.6161734358153725, learning_rate=0.15213257793715748, max_depth=4, min_child_weight=3, n_estimators=162, subsample=0.9583054382694077;, score=0.980 total time=   5.5s
[CV 1/5] END colsample_bytree=0.9627313766183017, learning_rate=0.06442644987692707, max_depth=5, min_child_weight=9, n_estimators=215, subsample=0.7410275425336676;, score=0.977 total time=  12.5s
[CV 4/5] END colsample_bytree=0.9906459823330611, learning_rate=0.09220740266364626, max_depth=9, min_child_weight=5, n_estimators=323, subsample=0.9633063543866615;, score=0.977 total time=  30.1s
[CV 4/5] E

[CV 4/5] END colsample_bytree=0.7164916560792167, learning_rate=0.1323705789444759, max_depth=4, min_child_weight=3, n_estimators=463, subsample=0.8056937753654446;, score=0.977 total time=  19.9s
[CV 3/5] END colsample_bytree=0.9160702162124823, learning_rate=0.1311919949562023, max_depth=4, min_child_weight=7, n_estimators=140, subsample=0.9659838702175123;, score=0.980 total time=   5.7s
[CV 5/5] END colsample_bytree=0.885297914889198, learning_rate=0.1621570097233795, max_depth=4, min_child_weight=7, n_estimators=354, subsample=0.7024273291045295;, score=0.978 total time=  15.6s
[CV 1/5] END colsample_bytree=0.7988994023569542, learning_rate=0.07017566196335394, max_depth=3, min_child_weight=8, n_estimators=180, subsample=0.706712405710114;, score=0.977 total time=   5.8s
[CV 3/5] END colsample_bytree=0.9046478461314871, learning_rate=0.05752750879847994, max_depth=9, min_child_weight=6, n_estimators=379, subsample=0.7881202537784153;, score=0.975 total time=  36.8s
[CV 4/5] END co

[CV 1/5] END colsample_bytree=0.8270801311279966, learning_rate=0.016262658491111717, max_depth=4, min_child_weight=2, n_estimators=301, subsample=0.9579309401710595;, score=0.974 total time=  13.4s
[CV 4/5] END colsample_bytree=0.8391599915244341, learning_rate=0.19437484700462337, max_depth=8, min_child_weight=7, n_estimators=351, subsample=0.6180909155642152;, score=0.974 total time=  29.4s
[CV 1/5] END colsample_bytree=0.9906459823330611, learning_rate=0.09220740266364626, max_depth=9, min_child_weight=5, n_estimators=323, subsample=0.9633063543866615;, score=0.974 total time=  29.5s
[CV 2/5] END colsample_bytree=0.9915571433100037, learning_rate=0.10734843059189102, max_depth=5, min_child_weight=2, n_estimators=376, subsample=0.8010548372420768;, score=0.978 total time=  20.6s
[CV 1/5] END colsample_bytree=0.9927363553242124, learning_rate=0.17778670041387268, max_depth=4, min_child_weight=6, n_estimators=404, subsample=0.6149392754996857;, score=0.974 total time=  19.2s
[CV 3/5] 

[CV 1/5] END colsample_bytree=0.9637281608315128, learning_rate=0.061755996320003385, max_depth=6, min_child_weight=2, n_estimators=489, subsample=0.6831766651472755;, score=0.975 total time=  37.9s
[CV 3/5] END colsample_bytree=0.6479461469334731, learning_rate=0.07752303428072559, max_depth=9, min_child_weight=6, n_estimators=259, subsample=0.8075162486973464;, score=0.976 total time=  21.3s
[CV 2/5] END colsample_bytree=0.7356119164194803, learning_rate=0.07984191492253218, max_depth=5, min_child_weight=4, n_estimators=251, subsample=0.761803250848876;, score=0.979 total time=  12.3s
[CV 5/5] END colsample_bytree=0.637469907131237, learning_rate=0.0835431606118867, max_depth=9, min_child_weight=6, n_estimators=337, subsample=0.8359483390242175;, score=0.976 total time=  25.6s
[CV 3/5] END colsample_bytree=0.7645415620226714, learning_rate=0.1499024421534388, max_depth=6, min_child_weight=2, n_estimators=223, subsample=0.8858380416719809;, score=0.977 total time=  12.8s
[CV 5/5] END 

### Feature Engineering before balancing the Dataset

In [12]:
data_filepath = '../data/processed/cleaned_diabetes_with_feature_engineering.csv'
df_enhanced = pd.read_csv(data_filepath)

X_enhanced = df_enhanced.drop('diabetes', axis=1)
Y_enhanced = df_enhanced['diabetes']

# Splitting the data into training and testing sets
X_train_enhanced, X_test_enhanced, Y_train_enhanced, Y_test_enhanced = train_test_split(X_enhanced, Y_enhanced, test_size=0.2, random_state=42)

# Training the XGBoost Model with Enhanced Features
dtrain_enhanced = xgb.DMatrix(X_train_enhanced, label=Y_train_enhanced)
dtest_enhanced = xgb.DMatrix(X_test_enhanced, label=Y_test_enhanced)

# Reusing the optimized hyperparameters
params_optimized = {
    'colsample_bytree': 0.9400154311159197,
    'learning_rate': 0.09989013482764068,
    'max_depth': 3,
    'min_child_weight': 7,
    'n_estimators': 161,
    'subsample': 0.7300733288106989,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': 42
}

# Train the model
bst_enhanced = xgb.train(params_optimized, dtrain_enhanced, num_rounds)

# Evaluating the enhanced model
# Making predictions on the test set
y_pred_prob_enhanced = bst_enhanced.predict(dtest_enhanced)
y_pred_enhanced = np.where(y_pred_prob_enhanced > 0.5, 1, 0)

# Evaluation
accuracy_enhanced = accuracy_score(Y_test_enhanced, y_pred_enhanced)
roc_auc_enhanced = roc_auc_score(Y_test_enhanced, y_pred_prob_enhanced)
conf_matrix_enhanced = confusion_matrix(Y_test_enhanced, y_pred_enhanced)
class_report_enhanced = classification_report(Y_test_enhanced, y_pred_enhanced)

# Display the evaluation metrics
print(f"Enhanced Model with Feature Engineering Accuracy: {accuracy_enhanced}")
print(f"Enhanced Model with Feature Engineering ROC AUC: {roc_auc_enhanced}")
print("Enhanced Model with Feature Engineering Confusion Matrix:\n", conf_matrix_enhanced)
print("Enhanced Model with Feature Engineering Classification Report:\n", class_report_enhanced)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Enhanced Model with Feature Engineering Accuracy: 0.973722545943523
Enhanced Model with Feature Engineering ROC AUC: 0.9765010174267461
Enhanced Model with Feature Engineering Confusion Matrix:
 [[16502     3]
 [  466   877]]
Enhanced Model with Feature Engineering Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     16505
           1       1.00      0.65      0.79      1343

    accuracy                           0.97     17848
   macro avg       0.98      0.83      0.89     17848
weighted avg       0.97      0.97      0.97     17848



### Addressing Class Imbalance

In [13]:
from xgboost import XGBClassifier
# Calculate the scale_pos_weight value
# This is usually total_negative_examples / total_positive_examples
scale_pos_weight_value = Y_train_enhanced.value_counts()[0] / Y_train_enhanced.value_counts()[1]

# Initialize the XGBoost model with optimized parameters and scale_pos_weight
xgb_model = XGBClassifier(
    colsample_bytree=0.9400154311159197,
    learning_rate=0.09989013482764068,
    max_depth=3,
    min_child_weight=7,
    n_estimators=161,
    subsample=0.7300733288106989,
    objective='binary:logistic',
    eval_metric='logloss',
    seed=42,
    scale_pos_weight=scale_pos_weight_value
)

# Train the model
xgb_model.fit(X_train_enhanced, Y_train_enhanced)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test_enhanced)
y_pred_prob = xgb_model.predict_proba(X_test_enhanced)[:, 1]

# Compute and print the evaluation metrics
accuracy_optimized = accuracy_score(Y_test_enhanced, y_pred)
roc_auc_optimized = roc_auc_score(Y_test_enhanced, y_pred_prob)
conf_matrix_optimized = confusion_matrix(Y_test_enhanced, y_pred)
class_report_optimized = classification_report(Y_test_enhanced, y_pred)

print(f"Optimized Model with Class Balancing Accuracy: {accuracy_optimized}")
print(f"Optimized Model with Class Balancing ROC AUC: {roc_auc_optimized}")
print("Optimized Model with Class Balancing Confusion Matrix:\n", conf_matrix_optimized)
print("Optimized Model with Class Balancing Classification Report:\n", class_report_optimized)

Optimized Model with Class Balancing Accuracy: 0.8998767368892873
Optimized Model with Class Balancing ROC AUC: 0.9778053898692222
Optimized Model with Class Balancing Confusion Matrix:
 [[14827  1678]
 [  109  1234]]
Optimized Model with Class Balancing Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.90      0.94     16505
           1       0.42      0.92      0.58      1343

    accuracy                           0.90     17848
   macro avg       0.71      0.91      0.76     17848
weighted avg       0.95      0.90      0.92     17848



#### Threshold Tuning

In [14]:
from sklearn.metrics import precision_recall_curve

# Calculate precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(Y_test_enhanced, y_pred_prob)

# Find threshold that maximizes F1 score
f1_scores = 2*recall*precision / (recall + precision)
best_threshold = thresholds[np.argmax(f1_scores)]

# Apply threshold to positive probabilities to create new predictions
y_pred_optimized = (y_pred_prob >= best_threshold).astype(int)

# Evaluate these new predictions
print("Optimized Model with Best Threshold Accuracy:", accuracy_score(Y_test_enhanced, y_pred_optimized))
print("Optimized Model with Best Threshold Confusion Matrix:\n", confusion_matrix(Y_test_enhanced, y_pred_optimized))
print("Optimized Model with Best Threshold Classification Report:\n", classification_report(Y_test_enhanced, y_pred_optimized))

Optimized Model with Best Threshold Accuracy: 0.9738906320035858
Optimized Model with Best Threshold Confusion Matrix:
 [[16504     1]
 [  465   878]]
Optimized Model with Best Threshold Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     16505
           1       1.00      0.65      0.79      1343

    accuracy                           0.97     17848
   macro avg       0.99      0.83      0.89     17848
weighted avg       0.97      0.97      0.97     17848



#### Cross Validation

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

# Assuming X and y are features and labels
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for train_index, test_index in kfold.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    
    model = xgb.XGBClassifier(
        colsample_bytree=0.9400154311159197,
        learning_rate=0.09989013482764068,
        max_depth=3,
        min_child_weight=7,
        n_estimators=161,
        subsample=0.7300733288106989,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=42
    )
    
    model.fit(X_train_fold, y_train_fold)
    y_pred_prob = model.predict_proba(X_test_fold)[:, 1]
    roc_auc = roc_auc_score(y_test_fold, y_pred_prob)
    results.append(roc_auc)

print(f"Mean ROC AUC: {np.mean(results)}")

# Retrain the model on the entire training data using best hyperparameters
final_model = xgb.XGBClassifier(
    colsample_bytree=0.9400154311159197,
    learning_rate=0.09989013482764068,
    max_depth=3,
    min_child_weight=7,
    n_estimators=161,
    subsample=0.7300733288106989,
    objective='binary:logistic',
    eval_metric='logloss',
    seed=42
)

final_model.fit(X_train, y_train)

# Evaluate on a separate test set
y_pred_prob_final = final_model.predict_proba(X_test)[:, 1]
final_roc_auc = roc_auc_score(y_test, y_pred_prob_final)

print(f"Final Model ROC AUC on Test Set: {final_roc_auc}")

Mean ROC AUC: 0.9789041548210553
Final Model ROC AUC on Test Set: 0.9776822069081258


#### Baysian Optimization

In [18]:
!pip install hyperopt

Defaulting to user installation because normal site-packages is not writeable
Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting future
  Downloading future-1.0.0-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.3/491.3 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: py4j, future, hyperopt
Successfully installed future-1.0.0 hyperopt-0.2.7 py4j-0.10.9.7


In [19]:
'''

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

space = {
    'max_depth': hp.choice('max_depth', range(3, 11)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
    'n_estimators': hp.choice('n_estimators', range(100, 500)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}

def objective(space):
    model = xgb.XGBClassifier(
        max_depth=int(space['max_depth']),
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        n_estimators=int(space['n_estimators']),
        learning_rate=space['learning_rate'],
        objective='binary:logistic',
        random_state=42
    )
    
    auc = cross_val_score(model, X, y, scoring='roc_auc', cv=StratifiedKFold(n_splits=5)).mean()
    return {'loss': -auc, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best parameters:", best) '''

'''
100%|██████| 100/100 [31:16<00:00, 18.76s/trial, best loss: -0.9790003751509359]
Best parameters: {'colsample_bytree': 0.8647783037718892, 'learning_rate': 0.04840129264965742, 'max_depth': 1, 'min_child_weight': 1.0, 'n_estimators': 329, 'subsample': 0.9160065601766973}

'''

100%|██████| 100/100 [31:16<00:00, 18.76s/trial, best loss: -0.9790003751509359]
Best parameters: {'colsample_bytree': 0.8647783037718892, 'learning_rate': 0.04840129264965742, 'max_depth': 1, 'min_child_weight': 1.0, 'n_estimators': 329, 'subsample': 0.9160065601766973}


In [None]:
best_params = {
    'colsample_bytree': 0.8647783037718892,
    'learning_rate': 0.04840129264965742,
    'max_depth': 1,
    'min_child_weight': 1.0,
    'n_estimators': 329,
    'subsample': 0.9160065601766973
}

# Initialize the model with Bayesian Optimization's best parameters
optimized_model_bo = xgb.XGBClassifier(
    colsample_bytree=best_params['colsample_bytree'],
    learning_rate=best_params['learning_rate'],
    max_depth=3 + best_params['max_depth']
    min_child_weight=best_params['min_child_weight'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    objective='binary:logistic',
    seed=42
)

# Train the model on the entire training dataset
optimized_model_bo.fit(X_train, y_train)

# Making predictions on the test set
y_pred_bo = optimized_model_bo.predict(X_test)
y_pred_prob_bo = optimized_model_bo.predict_proba(X_test)[:, 1]

# Compute and print the evaluation metrics
accuracy_optimized_bo = accuracy_score(y_test, y_pred_bo)
roc_auc_optimized_bo = roc_auc_score(y_test, y_pred_prob_bo)
conf_matrix_optimized_bo = confusion_matrix(y_test, y_pred_bo)
class_report_optimized_bo = classification_report(y_test, y_pred_bo)

# Display the evaluation metrics
print(f"Optimized Model with Bayesian Optimization Accuracy: {accuracy_optimized_bo}")
print(f"Optimized Model with Bayesian Optimization ROC AUC: {roc_auc_optimized_bo}")
print("Optimized Model with Bayesian Optimization Confusion Matrix:")
print(conf_matrix_optimized_bo)
print("Optimized Model with Bayesian Optimization Classification Report:")
print(class_report_optimized_bo)
