## Model Optimization and Refinement

### Introduction

In this notebook, we focus on optimizing and refining the models developed in previous steps. While the initial models provided a good baseline, further fine-tuning is required to improve their performance and ensure they generalize well to new data.

### Objectives:
- **Hyperparameter Tuning**: We will explore different sets of hyperparameters using methods such as grid search or random search to identify the best configurations for each model.
- **Cross-Validation**: To ensure the models are robust and not overfitting, we will use cross-validation to evaluate their performance on multiple data subsets.
- **Model Comparison**: After optimization, we will compare the models based on key metrics such as accuracy, precision, recall, F1-score, and more, ensuring we select the best-performing model.

In [16]:
# Imports 
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.exceptions import ConvergenceWarning
import warnings
from tabulate import tabulate

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [17]:
# Load the pre-split data
X_train = joblib.load('../outputs/X_train_encoded.joblib')
X_test = joblib.load('../outputs/X_test_encoded.joblib')
y_train = joblib.load('../outputs/y_train.joblib')
y_test = joblib.load('../outputs/y_test.joblib')

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    return mse, r2, mae

# Initialize results tracking
results = []

In [18]:
# Load and evaluate baseline models
print("Loading and evaluating baseline models...")
baseline_models = {
    'RandomForest': joblib.load('../models/random_forest_model.joblib'),
    'XGBoost': joblib.load('../models/xgboost_model.joblib')
}

for name, model in baseline_models.items():
    train_mse, train_r2, train_mae = evaluate_model(model, X_train, y_train)
    test_mse, test_r2, test_mae = evaluate_model(model, X_test, y_test)
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_mse = -cv_scores.mean()
    
    results.append({
        'Model': name,
        'Iteration': 'Baseline',
        'Train MSE': train_mse,
        'Train R2': train_r2,
        'Train MAE': train_mae,
        'Test MSE': test_mse,
        'Test R2': test_r2,
        'Test MAE': test_mae,
        'CV MSE': cv_mse
    })
    print(f"Baseline {name} - Test MSE: {test_mse:.4f}, Test R2: {test_r2:.4f}, Test MAE: {test_mae:.4f}, CV MSE: {cv_mse:.4f}")

Loading and evaluating baseline models...
Baseline RandomForest - Test MSE: 0.0018, Test R2: 0.9003, Test MAE: 0.0314, CV MSE: 0.0016
Baseline XGBoost - Test MSE: 0.0021, Test R2: 0.8804, Test MAE: 0.0344, CV MSE: 0.0018


In [19]:
# Define optimization stages
optimization_stages = [
    {
        'name': 'Basic Tuning',
        'RandomForest': {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10]
        },
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.3]
        }
    },
    {
        'name': 'Advanced Tuning',
        'RandomForest': {
            'n_estimators': [300, 400, 500],
            'max_depth': [20, 30, 40, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        },
        'XGBoost': {
            'n_estimators': [300, 400, 500],
            'max_depth': [4, 5, 6, 7],
            'learning_rate': [0.01, 0.05, 0.1],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }
    }
]

In [20]:
# Perform iterative optimization
optimized_models = baseline_models.copy()
for stage in optimization_stages:
    print(f"\nPerforming {stage['name']}...")
    for name, model in optimized_models.items():
        print(f"Optimizing {name}...")
        random_search = RandomizedSearchCV(
            model, stage[name], n_iter=20, cv=5, n_jobs=-1, 
            random_state=42, scoring='neg_mean_squared_error'
        )
        random_search.fit(X_train, y_train)
        optimized_models[name] = random_search.best_estimator_
        
        train_mse, train_r2, train_mae = evaluate_model(optimized_models[name], X_train, y_train)
        test_mse, test_r2, test_mae = evaluate_model(optimized_models[name], X_test, y_test)
        
        # Perform cross-validation
        cv_scores = cross_val_score(optimized_models[name], X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        cv_mse = -cv_scores.mean()
        
        results.append({
            'Model': name,
            'Iteration': stage['name'],
            'Train MSE': train_mse,
            'Train R2': train_r2,
            'Train MAE': train_mae,
            'Test MSE': test_mse,
            'Test R2': test_r2,
            'Test MAE': test_mae,
            'CV MSE': cv_mse
        })


Performing Basic Tuning...
Optimizing RandomForest...
Optimizing XGBoost...

Performing Advanced Tuning...
Optimizing RandomForest...
Optimizing XGBoost...


In [21]:
# Create and display results table
results_df = pd.DataFrame(results)
print("\nModel Optimization Results:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', floatfmt='.4f'))


Model Optimization Results:
+---+--------------+-----------------+------------------------+--------------------+-----------------------+-----------------------+--------------------+----------------------+-----------------------+
|   |    Model     |    Iteration    |       Train MSE        |      Train R2      |       Train MAE       |       Test MSE        |      Test R2       |       Test MAE       |        CV MSE         |
+---+--------------+-----------------+------------------------+--------------------+-----------------------+-----------------------+--------------------+----------------------+-----------------------+
| 0 | RandomForest |    Baseline     | 0.00021605887022184737 | 0.9882479728654439 | 0.010577342047930252  | 0.0017650239598568358 | 0.9003449235890976 | 0.03137962401455429  | 0.0015982510600238017 |
| 1 |   XGBoost    |    Baseline     | 1.949556903478577e-06  | 0.9998939583197556 | 0.0009426877035487339 | 0.0021176921084175805 | 0.8804329155417036 | 0.03435349511

In [22]:
# Display final model performance and comparison
print("\nModel Performance Comparison:")
for name in baseline_models.keys():
    baseline_results = results_df[(results_df['Model'] == name) & (results_df['Iteration'] == 'Baseline')].iloc[0]
    optimized_results = results_df[(results_df['Model'] == name) & (results_df['Iteration'] == 'Advanced Tuning')].iloc[0]
    
    print(f"\n{name}:")
    print(f"  Baseline  - MSE: {baseline_results['Test MSE']:.4f}, R2: {baseline_results['Test R2']:.4f}, MAE: {baseline_results['Test MAE']:.4f}, CV MSE: {baseline_results['CV MSE']:.4f}")
    print(f"  Optimized - MSE: {optimized_results['Test MSE']:.4f}, R2: {optimized_results['Test R2']:.4f}, MAE: {optimized_results['Test MAE']:.4f}, CV MSE: {optimized_results['CV MSE']:.4f}")
    
    mse_improvement = (baseline_results['Test MSE'] - optimized_results['Test MSE']) / baseline_results['Test MSE'] * 100
    r2_improvement = (optimized_results['Test R2'] - baseline_results['Test R2']) / baseline_results['Test R2'] * 100
    mae_improvement = (baseline_results['Test MAE'] - optimized_results['Test MAE']) / baseline_results['Test MAE'] * 100
    cv_mse_improvement = (baseline_results['CV MSE'] - optimized_results['CV MSE']) / baseline_results['CV MSE'] * 100
    
    print(f"  Improvement - MSE: {mse_improvement:.2f}%, R2: {r2_improvement:.2f}%, MAE: {mae_improvement:.2f}%, CV MSE: {cv_mse_improvement:.2f}%")



Model Performance Comparison:

RandomForest:
  Baseline  - MSE: 0.0018, R2: 0.9003, MAE: 0.0314, CV MSE: 0.0016
  Optimized - MSE: 0.0018, R2: 0.8962, MAE: 0.0321, CV MSE: 0.0016
  Improvement - MSE: -4.13%, R2: -0.46%, MAE: -2.29%, CV MSE: 2.00%

XGBoost:
  Baseline  - MSE: 0.0021, R2: 0.8804, MAE: 0.0344, CV MSE: 0.0018
  Optimized - MSE: 0.0017, R2: 0.9025, MAE: 0.0302, CV MSE: 0.0016
  Improvement - MSE: 18.45%, R2: 2.51%, MAE: 12.18%, CV MSE: 11.98%


In [23]:
# Save results to CSV
results_df.to_csv('../outputs/first_model_optimization_results.csv', index=False)
print("\nResults saved to 'first_model_optimization_results.csv'")


Results saved to 'first_model_optimization_results.csv'


In [24]:
# Determine the best model
best_model = results_df.loc[results_df['Test R2'].idxmax()]
print(f"\nBest Model: {best_model['Model']} (Iteration: {best_model['Iteration']})")
print(f"Best Model Performance:")
print(f"  MSE: {best_model['Test MSE']:.4f}")
print(f"  R2: {best_model['Test R2']:.4f}")
print(f"  MAE: {best_model['Test MAE']:.4f}")
print(f"  CV MSE: {best_model['CV MSE']:.4f}")


Best Model: XGBoost (Iteration: Advanced Tuning)
Best Model Performance:
  MSE: 0.0017
  R2: 0.9025
  MAE: 0.0302
  CV MSE: 0.0016


In [25]:
# Save final optimized models
for name, model in optimized_models.items():
    joblib.dump(model, f'../models/optimized_{name.lower()}_model.pkl')