# Hyperparameter Tuning

This notebook focuses on advanced model tuning using Bagging and Boosting techniques with GridSearchCV.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingClassifier, BaggingRegressor,
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Load the engineered feature matrices and encoded labels written by
# trainAndFeatureEngineer.ipynb (that notebook must have been run first so
# the files below exist).
X_train = np.load('data/X_train_fe.npy')
X_test = np.load('data/X_test_fe.npy')
y_train = np.load('data/y_train_encoded.npy')
y_test = np.load('data/y_test_encoded.npy')

# Fail fast if the saved artifacts are out of sync with each other; otherwise
# the mismatch only surfaces later, in the middle of a long grid search.
assert X_train.shape[0] == y_train.shape[0], (
    f'X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]}'
)
assert X_test.shape[0] == y_test.shape[0], (
    f'X_test has {X_test.shape[0]} rows but y_test has {y_test.shape[0]}'
)
assert X_train.shape[1:] == X_test.shape[1:], (
    f'Feature shape differs between train {X_train.shape[1:]} and test {X_test.shape[1:]}'
)

# Load label encoder for reference (class names for printing and reports).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts produced by a trusted run of the upstream notebook.
le = joblib.load('data/label_encoder.joblib')

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')
print(f'Classes: {le.classes_}')

## Bagging Models

Bagging (Bootstrap Aggregating) reduces variance by training multiple models on bootstrap samples of the training data (random subsets drawn with replacement) and aggregating their predictions.

In [None]:
# Bagging with Decision Tree base estimator
# The grid varies the ensemble size, the fraction of rows and columns each
# tree is trained on, and whether columns are drawn with replacement.
bagging_dt_params = dict(
    n_estimators=[10, 25, 50],
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
    bootstrap=[True],
    bootstrap_features=[False, True],
)

# 5-fold grid search scored by weighted F1; fit() returns the search object
# itself, so the fitted search is bound directly to `bagging_dt`.
bagging_dt = GridSearchCV(
    estimator=BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=10, random_state=42),
        random_state=42,
    ),
    param_grid=bagging_dt_params,
    scoring='f1_weighted',
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)

print(f'\nBest Bagging (DT) Parameters: {bagging_dt.best_params_}')
print(f'Best CV F1 Score: {bagging_dt.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test set.
bagging_dt_pred = bagging_dt.predict(X_test)
print(f'Test Accuracy: {accuracy_score(y_test, bagging_dt_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, bagging_dt_pred, average="weighted"):.4f}')

In [None]:
# Bagging with DecisionTreeRegressor as base (for comparison)
# NOTE(review): fitting a regressor on encoded class labels treats the label
# codes as ordered numeric values; this is only meaningful if the classes are
# ordinal -- confirm against the encoding used upstream.
bagging_reg_params = {
    'n_estimators': [10, 25, 50],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

# 5-fold grid search; regressors are scored by (negated) mean squared error,
# so this search has no CV F1 to compare with the classifiers.
bagging_reg = GridSearchCV(
    BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42, max_depth=10),
        random_state=42
    ),
    bagging_reg_params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

bagging_reg.fit(X_train, y_train)

print(f'\nBest Bagging (Regressor) Parameters: {bagging_reg.best_params_}')
# best_score_ is the negated MSE, so flip the sign for display.
print(f'Best CV MSE: {-bagging_reg.best_score_:.4f}')

# Convert regression predictions to classification: round to the nearest
# label code and clamp to the valid code range. The upper bound is derived
# from the label encoder instead of hard-coding 2 (i.e. exactly three classes).
bagging_reg_max_label = len(le.classes_) - 1
bagging_reg_pred = np.clip(
    np.round(bagging_reg.predict(X_test)), 0, bagging_reg_max_label
).astype(int)
print(f'Test Accuracy: {accuracy_score(y_test, bagging_reg_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, bagging_reg_pred, average="weighted"):.4f}')

In [None]:
# Random Forest Classifier
# Search over forest size, tree depth, split/leaf size limits and the number
# of features considered at each split.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 5-fold grid search scored by weighted F1, using all available cores.
rf_search_options = dict(cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    **rf_search_options,
)
rf_grid.fit(X_train, y_train)

print(f'\nBest Random Forest Parameters: {rf_grid.best_params_}')
print(f'Best CV F1 Score: {rf_grid.best_score_:.4f}')

# Score the refitted best forest on the held-out test set.
rf_pred = rf_grid.predict(X_test)
print(f'Test Accuracy: {accuracy_score(y_test, rf_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, rf_pred, average="weighted"):.4f}')

In [None]:
# Random Forest Regressor (for comparison)
# NOTE(review): regressing on encoded class labels assumes the label codes are
# meaningfully ordered -- confirm against the encoding used upstream.
rf_reg_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# 5-fold grid search scored by (negated) mean squared error.
rf_reg_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_reg_params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

rf_reg_grid.fit(X_train, y_train)

print(f'\nBest RF Regressor Parameters: {rf_reg_grid.best_params_}')
# best_score_ is the negated MSE, so flip the sign for display.
print(f'Best CV MSE: {-rf_reg_grid.best_score_:.4f}')

# Convert regression predictions to classification: round to the nearest
# label code and clamp to the valid code range. The upper bound comes from the
# label encoder rather than a hard-coded 2 (which assumed exactly three classes).
rf_reg_max_label = len(le.classes_) - 1
rf_reg_pred = np.clip(
    np.round(rf_reg_grid.predict(X_test)), 0, rf_reg_max_label
).astype(int)
print(f'Test Accuracy: {accuracy_score(y_test, rf_reg_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, rf_reg_pred, average="weighted"):.4f}')

## Boosting Models

Boosting reduces bias by training models sequentially, with each new model focusing on the errors made by the models before it.

In [None]:
# AdaBoost Classifier
# Search over the number of boosting rounds and the learning rate.
ada_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
}

# The `algorithm` parameter is deprecated in recent scikit-learn releases and
# scheduled for removal; once it is gone, GridSearchCV rejects a grid that
# names it. Pin 'SAMME' only while the installed estimator still exposes the
# parameter, so the cell runs on both older and newer versions.
if 'algorithm' in AdaBoostClassifier().get_params():
    ada_params['algorithm'] = ['SAMME']

# Shallow (depth-3) trees are the weak learners; 5-fold search scored by
# weighted F1.
ada_grid = GridSearchCV(
    AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3, random_state=42),
        random_state=42
    ),
    ada_params,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

ada_grid.fit(X_train, y_train)

print(f'\nBest AdaBoost Parameters: {ada_grid.best_params_}')
print(f'Best CV F1 Score: {ada_grid.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test set.
ada_pred = ada_grid.predict(X_test)
print(f'Test Accuracy: {accuracy_score(y_test, ada_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, ada_pred, average="weighted"):.4f}')

In [None]:
# Gradient Boosting Classifier
# Search over the number of stages, shrinkage, tree depth, minimum split size
# and the row subsampling fraction used for each stage.
gb_params = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    subsample=[0.8, 1.0],
)

# 5-fold grid search scored by weighted F1.
gb_base_model = GradientBoostingClassifier(random_state=42)
gb_grid = GridSearchCV(
    estimator=gb_base_model,
    param_grid=gb_params,
    scoring='f1_weighted',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
gb_grid.fit(X_train, y_train)

print(f'\nBest Gradient Boosting Parameters: {gb_grid.best_params_}')
print(f'Best CV F1 Score: {gb_grid.best_score_:.4f}')

# Score the refitted best model on the held-out test set.
gb_pred = gb_grid.predict(X_test)
print(f'Test Accuracy: {accuracy_score(y_test, gb_pred):.4f}')
print(f'Test F1 Score: {f1_score(y_test, gb_pred, average="weighted"):.4f}')

In [None]:
# XGBoost (if available)
# xgboost is an optional dependency, so it is imported here rather than in the
# import cell. Later cells check `xgb_available` before touching any xgb_* name.
# The flag starts False and is only set True after the search has completed, so
# it is always defined even if fitting fails part-way.
xgb_available = False

# Only the import is guarded: an ImportError raised later during fitting should
# surface as a real error instead of being reported as "not installed".
try:
    from xgboost import XGBClassifier
except ImportError:
    print('XGBoost not installed. Skipping...')
else:
    xgb_params = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # 5-fold grid search scored by weighted F1, matching the other classifiers.
    xgb_grid = GridSearchCV(
        XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
        xgb_params,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )

    xgb_grid.fit(X_train, y_train)

    print(f'\nBest XGBoost Parameters: {xgb_grid.best_params_}')
    print(f'Best CV F1 Score: {xgb_grid.best_score_:.4f}')

    # Evaluate the refitted best estimator on the held-out test set.
    xgb_pred = xgb_grid.predict(X_test)
    print(f'Test Accuracy: {accuracy_score(y_test, xgb_pred):.4f}')
    print(f'Test F1 Score: {f1_score(y_test, xgb_pred, average="weighted"):.4f}')

    xgb_available = True

## Model Comparison

In [None]:
# Compile all results
# One record per model: (display name, cross-validated F1, test predictions).
# The two regressors were tuned on MSE, so they have no CV F1 (NaN).
model_runs = [
    ('Bagging (DT)', bagging_dt.best_score_, bagging_dt_pred),
    ('Bagging (Regressor)', np.nan, bagging_reg_pred),
    ('Random Forest', rf_grid.best_score_, rf_pred),
    ('RF Regressor', np.nan, rf_reg_pred),
    ('AdaBoost', ada_grid.best_score_, ada_pred),
    ('Gradient Boosting', gb_grid.best_score_, gb_pred),
]

# Add XGBoost if available
if xgb_available:
    model_runs.append(('XGBoost', xgb_grid.best_score_, xgb_pred))

# Building every column from the same record list keeps the columns aligned;
# the previous four parallel lists had to be kept in the same order by hand.
results = {
    'Model': [name for name, _, _ in model_runs],
    'CV F1': [cv_f1 for _, cv_f1, _ in model_runs],
    'Test Accuracy': [accuracy_score(y_test, pred) for _, _, pred in model_runs],
    'Test F1': [
        f1_score(y_test, pred, average='weighted') for _, _, pred in model_runs
    ],
}

results_df = pd.DataFrame(results)

print('='*70)
print('HYPERPARAMETER TUNING RESULTS')
print('='*70)
print(results_df.to_string(index=False))
print('='*70)

# Find best model
# Select on the cross-validated score, not the test score: picking the winner
# by test F1 uses the held-out set for model selection and makes the reported
# test number optimistically biased. idxmax skips NaN, so the regressors
# (which have no CV F1) are not candidates here.
best_idx = results_df['CV F1'].idxmax()
print(f"\nBest Model: {results_df.loc[best_idx, 'Model']}")
print(f"CV F1 Score: {results_df.loc[best_idx, 'CV F1']:.4f}")
print(f"Test F1 Score: {results_df.loc[best_idx, 'Test F1']:.4f}")

In [None]:
# Detailed classification report for best models
# Each entry pairs the section heading with that model's test predictions;
# XGBoost is included only when it was trained.
report_sections = [
    ('\n--- Random Forest ---', rf_pred),
    ('\n--- Gradient Boosting ---', gb_pred),
]
if xgb_available:
    report_sections.append(('\n--- XGBoost ---', xgb_pred))

report_rule = '='*70
print(report_rule)
print('CLASSIFICATION REPORTS')
print(report_rule)

# Per-class precision/recall/F1, labelled with the encoder's class names.
for section_heading, section_pred in report_sections:
    print(section_heading)
    print(classification_report(y_test, section_pred, target_names=le.classes_))

## Save Models and Results

In [None]:
# Save all tuned models
# Each entry pairs a fitted grid search with the file its best estimator is
# written to; XGBoost is appended only when it was trained.
tuned_searches = [
    (bagging_dt, 'data/bagging_dt.joblib'),
    (bagging_reg, 'data/bagging_reg.joblib'),
    (rf_grid, 'data/rf_classifier.joblib'),
    (rf_reg_grid, 'data/rf_regressor.joblib'),
    (ada_grid, 'data/adaboost.joblib'),
    (gb_grid, 'data/gradient_boost.joblib'),
]
if xgb_available:
    tuned_searches.append((xgb_grid, 'data/xgboost.joblib'))

for fitted_search, model_path in tuned_searches:
    joblib.dump(fitted_search.best_estimator_, model_path)

# Save results
results_df.to_csv('data/hyperparameter_results.csv', index=False)

# Save predictions for visualization (one array per model, plus the true labels)
predictions = dict(
    y_test=y_test,
    bagging_dt=bagging_dt_pred,
    bagging_reg=bagging_reg_pred,
    rf=rf_pred,
    rf_reg=rf_reg_pred,
    adaboost=ada_pred,
    gradient_boost=gb_pred,
)
if xgb_available:
    predictions['xgboost'] = xgb_pred

np.savez('data/predictions.npz', **predictions)

# Summarise what was written, in the same order as before.
saved_file_lines = [
    '  - data/bagging_dt.joblib, bagging_reg.joblib',
    '  - data/rf_classifier.joblib, rf_regressor.joblib',
    '  - data/adaboost.joblib, gradient_boost.joblib',
]
if xgb_available:
    saved_file_lines.append('  - data/xgboost.joblib')
saved_file_lines.append('  - data/hyperparameter_results.csv')
saved_file_lines.append('  - data/predictions.npz')

print('All models and results saved successfully!')
print('\nSaved files:')
print('\n'.join(saved_file_lines))