In [1]:
import os
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump

In [2]:
# Directory holding the prepared train/test CSV files
DATA_DIR = os.path.join('..', 'datasets', 'prepared_data')

# Datasets Paths
X_train_resampled_path = os.path.join(DATA_DIR, 'X_train_resampled.csv')  # SMOTE applied
X_train_reduced_path = os.path.join(DATA_DIR, 'X_train_reduced.csv')      # PCA applied (It was resampled before)
X_test_path = os.path.join(DATA_DIR, 'X_test_transformed.csv')
X_test_reduced_path = os.path.join(DATA_DIR, 'X_test_reduced.csv')        # PCA applied
y_train_path = os.path.join(DATA_DIR, 'y_train_resampled.csv')            # SMOTE applied
y_test_path = os.path.join(DATA_DIR, 'y_test.csv')

# Load Datasets
X_train_resampled = pd.read_csv(X_train_resampled_path)  # SMOTE applied (not the PCA-reduced set)
X_train_reduced = pd.read_csv(X_train_reduced_path)      # PCA applied (It was resampled before)
X_test = pd.read_csv(X_test_path)
X_test_reduced = pd.read_csv(X_test_reduced_path)        # PCA applied

# Ensure y are 1D arrays (estimators and metrics below expect flat label vectors)
y_train = pd.read_csv(y_train_path).values.ravel()
y_test = pd.read_csv(y_test_path).values.ravel()

# Sanity checks: the same label vector is paired with both the full and the
# PCA-reduced feature matrices, so every pairing must have matching row counts.
assert len(X_train_resampled) == len(y_train), "X_train_resampled / y_train row count mismatch"
assert len(X_train_reduced) == len(y_train), "X_train_reduced / y_train row count mismatch"
assert len(X_test) == len(y_test), "X_test / y_test row count mismatch"
assert len(X_test_reduced) == len(y_test), "X_test_reduced / y_test row count mismatch"

In [3]:
# Hyperparameter grid to explore: 4 * 3 * 3 * 2 * 2 = 144 candidate combinations
param_grid = dict(
    n_estimators=[50, 75, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Base estimator; the seed is fixed so repeated runs are comparable
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Exhaustive 5-fold search that ranks candidates by recall, using all CPU
# cores and printing progress.
# NOTE(review): the training data was SMOTE-resampled before this search (see
# the comments in the loading cell), so validation folds presumably contain
# synthetic samples and the CV recall is likely optimistic compared with the
# untouched test set -- confirm, and consider resampling inside each fold.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_resampled, y_train)

# Report the winning configuration and its mean cross-validated recall
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Keep the estimator refit with the best parameters for evaluation and saving
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.8}
Best score: 0.8677241379310345


In [4]:
# Predict test-set class labels with the model tuned on the SMOTE-resampled features
y_pred = best_model.predict(X_test)

In [5]:
# Per-class precision / recall / F1 of the current predictions against the test labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      1552
           1       0.55      0.74      0.63       561

    accuracy                           0.77      2113
   macro avg       0.72      0.76      0.73      2113
weighted avg       0.80      0.77      0.78      2113



In [6]:
# Save trained model
model_dir = os.path.join('..', 'trained_models')
# Create the target directory first: dump() does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)
dump(best_model, os.path.join(model_dir, 'xgboosting.joblib'))

['..\\trained_models\\xgboosting.joblib']

### With Reduced Dimensionality

In [7]:
# Hyperparameter grid to explore: 4 * 3 * 3 * 2 * 2 = 144 candidate combinations
param_grid = dict(
    n_estimators=[50, 75, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Fresh base estimator with the same fixed seed as the full-feature run
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Same exhaustive 5-fold, recall-ranked search, this time on the PCA-reduced
# training features.
# NOTE(review): this rebinds `xgb`, `param_grid` and `grid_search` from the
# earlier full-feature search; only `best_model` survives from that run.
# NOTE(review): the reduced training set was also resampled before PCA (per
# the loading cell's comments), so the CV recall is presumably optimistic
# here as well -- confirm.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_reduced, y_train)

# Report the winning configuration and its mean cross-validated recall
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Keep the estimator refit with the best parameters on the reduced features
best_model_w_reduced = grid_search.best_estimator_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'subsample': 1.0}
Best score: 0.8555793484473233


In [8]:
# Predict test-set class labels with the model tuned on the PCA-reduced features.
# NOTE: this rebinds `y_pred`, replacing the full-feature model's predictions.
y_pred = best_model_w_reduced.predict(X_test_reduced)

In [9]:
# Per-class precision / recall / F1 of the current `y_pred` against the test labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.76      0.82      1552
           1       0.52      0.71      0.60       561

    accuracy                           0.75      2113
   macro avg       0.70      0.74      0.71      2113
weighted avg       0.78      0.75      0.76      2113

