In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Read the pre-split feature tables and label columns from disk.
# Labels are flattened to 1-D NumPy arrays via `.values.ravel()`.

X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/X_train.csv", "../Data/X_test.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ("../Data/y_train.csv", "../Data/y_test.csv")
)


# Restore the previously saved best model
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code --
# only load artifacts from a trusted source.

best_model = joblib.load("../models/best_model.pkl")


In [2]:
# Candidate values for each Gradient Boosting hyperparameter to be tuned

param_grid = dict(
    # Number of boosting stages (trees) to fit
    n_estimators=[50, 100, 200],
    # Shrinkage applied to the contribution of each tree
    learning_rate=[0.01, 0.1, 0.2],
    # Maximum depth of each individual tree
    max_depth=[3, 5, 7],
    # Fraction (not percent) of training rows used to fit each tree
    subsample=[0.8, 1.0],
)

In [3]:
# Perform exhaustive search
#
# Every combination in `param_grid` is scored with 3-fold cross-validation on
# accuracy, with the fits run in parallel on all available cores (n_jobs=-1).
#
# The estimator is seeded explicitly: the grid includes subsample=0.8, which
# makes each tree train on a random subset of rows. Without a fixed
# random_state the CV scores (and possibly the selected parameters) would
# change from run to run. The seed is the same one the final model uses.

grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.8}


In [9]:
# Randomized search for best parameters
#
# Samples n_iter=10 combinations from `param_grid` and scores each with 3-fold
# cross-validation on accuracy. The search's own random_state fixes which
# candidates are drawn.
#
# The estimator is seeded as well: with subsample=0.8 in the grid, tree fitting
# is stochastic, so seeding only the search would still leave the CV scores
# (and possibly the winning candidate) non-reproducible.

random_search = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_iter=10,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Hyperparameters (Randomized Search): ", random_search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Hyperparameters (Randomized Search):  {'subsample': 0.8, 'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.01}


In [10]:
# Fit the final model with the hyperparameters picked by the randomized search

best_params = random_search.best_params_


# Build and train the tuned Gradient Boosting classifier (fixed seed)

tuned_gbm = GradientBoostingClassifier(random_state=42, **best_params).fit(X_train, y_train)


# Predict labels for the test set

y_pred_tuned = tuned_gbm.predict(X_test)


# Report overall accuracy and the per-class precision / recall / F1 table
# NOTE(review): the recorded accuracy (~0.99997) is unusually high -- worth
# confirming that no feature leaks the target before trusting this result.

test_accuracy = accuracy_score(y_test, y_pred_tuned)
test_report = classification_report(y_test, y_pred_tuned)

print("Optimized Gradient Boosting Performance:")
print("Accuracy:", test_accuracy)
print(test_report)

Optimized Gradient Boosting Performance:
Accuracy: 0.9999663684670748
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22494
           1       1.00      1.00      1.00      7240

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [11]:
# Persist the tuned classifier to disk with joblib and confirm the location

tuned_model_path = "../models/best_tuned_model.pkl"
joblib.dump(tuned_gbm, tuned_model_path)
print(f"Best tuned model saved as '{tuned_model_path}'")

Best tuned model saved as '../models/best_tuned_model.pkl'
