Gradient Boosting Model
----------------------------

In [1]:
# Load the preprocessed train/val/test splits from the two .npz archives.
import numpy as np

# Seed NumPy's global RNG once, before anything else draws from it.
np.random.seed(42)

# Loading the data (NON-PCA).
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the with-block closes it as soon as the needed arrays have been read out.
with np.load("preprocessed_data.npz") as data:
    x_train_flat, y_train = data["x_train_flat"], data["y_train"]
    x_val_flat, y_val = data["x_val_flat"], data["y_val"]
    x_test_flat, y_test = data["x_test_flat"], data["y_test"]

# Loading the data (PCA).
# The labels get their own names here so they do not silently replace the
# non-PCA labels loaded above.
with np.load("preprocessed_data_pca95.npz") as data2:
    x_train_pca95, y_train_pca95 = data2["x_train_pca95"], data2["y_train"]
    x_val_pca95, y_val_pca95 = data2["x_val_pca95"], data2["y_val"]
    x_test_pca95, y_test_pca95 = data2["x_test_pca95"], data2["y_test"]

# The cells below pair BOTH feature sets with the same y_train / y_val / y_test,
# which is only valid if the two archives carry identical labels in the same
# order. Fail loudly here rather than train against misaligned labels.
for _split, _labels, _labels_pca95 in (
    ("train", y_train, y_train_pca95),
    ("val", y_val, y_val_pca95),
    ("test", y_test, y_test_pca95),
):
    if not np.array_equal(_labels, _labels_pca95):
        raise ValueError(
            f"{_split} labels differ between preprocessed_data.npz and "
            "preprocessed_data_pca95.npz; the PCA and non-PCA features "
            "cannot share one label array."
        )

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time

def train_evaluate_gb(x_train, y_train, x_val, y_val, x_test, y_test, learning_rate, n_estimators,
                      max_depth=3, random_state=42):
    """Fit a GradientBoostingClassifier and report its accuracy on all three splits.

    Parameters
    ----------
    x_train, y_train
        Training features and labels; the model is fitted on these.
    x_val, y_val
        Validation features and labels; used for ``val_acc`` and the
        overfitting gap.
    x_test, y_test
        Test features and labels; used for ``test_acc`` only.
    learning_rate
        Forwarded to ``GradientBoostingClassifier``.
    n_estimators
        Forwarded to ``GradientBoostingClassifier``.
    max_depth
        Forwarded to ``GradientBoostingClassifier``. Defaults to 3, the value
        that used to be hard-coded here.
    random_state
        Forwarded to ``GradientBoostingClassifier``. A fixed default keeps a
        run independent of the global NumPy RNG state, and therefore of the
        order in which notebook cells were executed. Pass ``None`` to fall
        back to the global RNG.

    Returns
    -------
    dict
        Keys ``'train_acc'``, ``'val_acc'``, ``'test_acc'``, ``'overfit_gap'``
        (train accuracy minus validation accuracy) and ``'train_time'``
        (seconds spent inside ``fit``). The same figures are also printed.
    """
    gb = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )

    # perf_counter is monotonic, so a system clock adjustment during a
    # multi-hour fit cannot distort the measured duration.
    start_time = time.perf_counter()
    gb.fit(x_train, y_train)
    train_time = time.perf_counter() - start_time

    y_train_pred = gb.predict(x_train)
    y_val_pred = gb.predict(x_val)
    y_test_pred = gb.predict(x_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    # Positive gap: the model scores better on data it was fitted on.
    overfit_gap = train_acc - val_acc

    print(f"\nGradient Boosting Model (lr={learning_rate}, estimators={n_estimators}):")
    print(f"Train Accuracy:   {train_acc:.4f}")
    print(f"Val Accuracy:     {val_acc:.4f}")
    print(f"Test Accuracy:    {test_acc:.4f}")
    print(f"Overfitting Gap:  {overfit_gap:.4f}")
    print(f"Training Time:    {train_time:.2f} seconds")

    return {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc,
        'overfit_gap': overfit_gap,
        'train_time': train_time
    }


In [4]:
# Non-PCA features, run 1 of 2 ("moderate regularization"):
# learning_rate=0.05 with n_estimators=100.
gb1_results = train_evaluate_gb(
    x_train=x_train_flat,
    y_train=y_train,
    x_val=x_val_flat,
    y_val=y_val,
    x_test=x_test_flat,
    y_test=y_test,
    learning_rate=0.05,
    n_estimators=100,
)


Gradient Boosting Model (lr=0.05, estimators=100):
Train Accuracy:   0.8765
Val Accuracy:     0.8567
Test Accuracy:    0.8597
Overfitting Gap:  0.0198
Training Time:    5410.18 seconds


In [6]:
# Non-PCA features, run 2 of 2 ("higher regularization"):
# smaller learning_rate=0.01 with n_estimators=300.
gb2_results = train_evaluate_gb(
    x_train=x_train_flat,
    y_train=y_train,
    x_val=x_val_flat,
    y_val=y_val,
    x_test=x_test_flat,
    y_test=y_test,
    learning_rate=0.01,
    n_estimators=300,
)


Gradient Boosting Model (lr=0.01, estimators=300):
Train Accuracy:   0.8554
Val Accuracy:     0.8382
Test Accuracy:    0.8457
Overfitting Gap:  0.0172
Training Time:    15756.04 seconds


In [3]:
# PCA features, run 1 of 2: same settings as gb1_results
# (learning_rate=0.05, n_estimators=100).
gb3_results = train_evaluate_gb(
    x_train=x_train_pca95,
    y_train=y_train,
    x_val=x_val_pca95,
    y_val=y_val,
    x_test=x_test_pca95,
    y_test=y_test,
    learning_rate=0.05,
    n_estimators=100,
)


Gradient Boosting Model (lr=0.05, estimators=100):
Train Accuracy:   0.8476
Val Accuracy:     0.8258
Test Accuracy:    0.8258
Overfitting Gap:  0.0218
Training Time:    4806.85 seconds


In [4]:
# PCA features, run 2 of 2: same settings as gb2_results
# (learning_rate=0.01, n_estimators=300).
gb4_results = train_evaluate_gb(
    x_train=x_train_pca95,
    y_train=y_train,
    x_val=x_val_pca95,
    y_val=y_val,
    x_test=x_test_pca95,
    y_test=y_test,
    learning_rate=0.01,
    n_estimators=300,
)


Gradient Boosting Model (lr=0.01, estimators=300):
Train Accuracy:   0.8196
Val Accuracy:     0.8027
Test Accuracy:    0.8048
Overfitting Gap:  0.0169
Training Time:    19224.58 seconds
