# 02 — Model Training

## 1. Overview

This notebook brings together the full modeling workflow: training baseline models, tuning their hyperparameters, and selecting the final model for interpretation and reporting. After completing the data cleaning (Notebook 00) and exploratory data analysis (Notebook 01), we now have a clear understanding of the dataset’s structure and the key variables that influence the target. The next step is to translate these insights into predictive performance.

In this notebook, we will:

* Prepare the cleaned dataset for modeling (feature selection, encoding, scaling)

* Train baseline models to establish reference performance

* Perform hyperparameter tuning using systematic search strategies

* Compare tuned models to identify the best-performing approach

* Train the final model using the optimal hyperparameters

* Evaluate the final model on unseen data

* Analyze feature importance and interpretability where relevant

By combining model training, tuning, and final selection into a single workflow, this notebook provides a complete and transparent modeling pipeline. The final output of this notebook will be the fully trained model that will be used for interpretation, reporting, and any downstream analysis.

In [4]:
import sys
import os

# The project root is taken to be the parent of the current working
# directory (i.e. the notebook is run from a sub-folder of the project).
project_root = os.path.abspath(os.pardir)

# Register the root on the import path, once, so that `src.*` modules resolve
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Data files live in <project_root>/data
data_dir = os.path.join(project_root, "data")

print("Project root added:", project_root)
print("Data directory:", data_dir)


Project root added: C:\Users\shari\OneDrive\MSc Data Science and Society\Thesis_DSS_2026
Data directory: C:\Users\shari\OneDrive\MSc Data Science and Society\Thesis_DSS_2026\data


## 2. Imports

In [6]:
import numpy as np 
import pandas as pd


# Modeling
from src.modeling import (
    evaluate_model,
    run_all_models_for_regime,
    run_median_baseline,
    run_ridge,
    run_rf,
    run_xgb
)

from src.model_comparison import (
    build_cross_model_comparison,
    compare_models,
    run_pairwise_tests
)
# Config
from src.config import GLOBAL_CONFIG

# EDA
from src.eda import (
    inspect_regime,
    run_full_eda,
    list_sorted_correlations,
    target_summary
)

# Metrics
from src.metrics import (
    rmse,
    mae,
    r2,
    error_analysis,
    bootstrap_rmse_ci
)

# feature filtering 
from src.feature_filtering import (
    drop_high_missing_cols, 
    drop_constant_and_near_constant_cols,
    drop_multicollinear_cols
)

# Feature engineering
from src.data_preparation import (
    prepare_data,
    build_preprocessor
)

# feature importances 
from src.feature_importance import (
    get_feature_names,
    ridge_coefficients,
    rf_gini_importance,
    xgb_importance
)

# Transformations
from src.transforms import (
    detect_col_types,
    inverse_target_corrected, 
    LogTransformer, 
    inverse_target,
    transform_target
)

# NOTE(review): this rebinds `build_preprocessor`, replacing the one imported
# from src.data_preparation earlier in this cell — confirm which module is intended.
from src.feature_engineering import build_preprocessor
# Plotting
from src.plotting import (
    plot_error_distribution,
    plot_per_fold_rmse,
    plot_predicted_vs_actual
)


## 3. Load datasets

In [8]:
# Cleaned dataset
df_clean = pd.read_parquet(os.path.join(data_dir, "clean", "df_clean.parquet"))

# Per-regime datasets, all stored in the same sub-folder of the data directory
regimes_dir = os.path.join(data_dir, "regimes")
df_regime_a = pd.read_parquet(os.path.join(regimes_dir, "regime_a.parquet"))
df_regime_b = pd.read_parquet(os.path.join(regimes_dir, "regime_b.parquet"))
df_regime_c = pd.read_parquet(os.path.join(regimes_dir, "regime_c.parquet"))



## 4. Model Training, Hyperparameter Tuning, Model Evaluation

In [None]:

# Regime label -> dataset; insertion order determines the order of the runs
regimes = {
    "Regime A": df_regime_a,
    "Regime B": df_regime_b,
    "Regime C": df_regime_c,
}

# Output of each regime's run, keyed by regime label. Filled one regime at a
# time so that completed runs remain available if a later one fails.
ALL_RESULTS = {}

for regime_name in regimes:
    df_regime = regimes[regime_name]
    ALL_RESULTS[regime_name] = run_all_models_for_regime(
        regime_name=regime_name, df=df_regime, config=GLOBAL_CONFIG
    )


RUNNING FULL EXPERIMENT – Regime A
Dropped (high missing): ['006', '009', '031', '033', '036', '038', '039', '040', '042', '043', '044', '045', '046', '047', '049', '050', '051', '052', '053', '055', '056', '057', '059', '060', '061', '062', '063', '064', '066', '067', '068', '069', '070', '072', '073', '074', '075', '076', '077', '078', '079', '080', '081', '083', '084', '086', '087', '103', '104', '105', '106', '107', '108', '109', '110', '111', '113', '114', '115', '116', '117', '118', '119', '120', '124', '125', '128', '129', '130', '131', '133', '135', '137', '138', '139', '144', '308', '309', '310', '311', '312', '314', '315', '316', '318', '319', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '



Dropped (high missing): ['006', '009', '031', '033', '036', '038', '039', '040', '042', '043', '044', '045', '046', '047', '049', '050', '051', '052', '053', '055', '056', '057', '059', '060', '061', '062', '063', '064', '066', '067', '068', '069', '070', '072', '073', '074', '075', '076', '077', '078', '079', '080', '081', '083', '084', '086', '087', '103', '104', '105', '106', '107', '108', '109', '110', '111', '113', '114', '115', '116', '117', '118', '119', '120', '124', '125', '128', '129', '130', '131', '133', '135', '137', '138', '139', '144', '308', '309', '310', '311', '312', '314', '315', '316', '318', '319', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '3

## 5. Best Hyperparameters

In [None]:
# Display the best parameters found for each model in each regime
for regime_name, regime_data in ALL_RESULTS.items():
    print(f"\n===== {regime_name} – Best Hyperparameters =====")

    per_model_results = regime_data["results"]
    for model_name in per_model_results:
        model_res = per_model_results[model_name]
        print(f"{model_name}: {model_res['best_params_overall']}")

## 6. Model Comparison & Statistical Tests

In [None]:
# Print the model-comparison table and the statistical-test table stored for
# each regime. The loop iterates over ALL_RESULTS itself rather than a
# hard-coded list of regime names, so it cannot drift out of sync with the
# regimes that were actually run; dict insertion order keeps the run order.
for regime in ALL_RESULTS:
    print(f"\n===== {regime} – Model Comparison =====")
    print(ALL_RESULTS[regime]["comparison_table"])

    print(f"\n===== {regime} – Statistical Tests =====")
    print(ALL_RESULTS[regime]["stats_table"])


In [None]:
# Preview the stored `y_test_true` / `y_test_pred` values of every model,
# regime by regime
for regime, res in ALL_RESULTS.items():
    print(f"\n=== {regime} ===")

    model_results = res["results"]
    for model_name in model_results:
        model_res = model_results[model_name]
        print(f"\n--- {model_name} ---")

        # Put true and predicted values side by side in one frame
        preds = pd.DataFrame(
            {"y_true": model_res["y_test_true"], "y_pred": model_res["y_test_pred"]}
        )

        # Rich-render only the first few rows
        display(preds.head())


## 7. Save Results

In [None]:
import joblib
import os

# Make sure the output folder exists; the path is relative to the current
# working directory, and an already existing folder is not an error.
os.makedirs(name="results", exist_ok=True)

# Serialize the complete results dictionary to a single joblib file
joblib.dump(value=ALL_RESULTS, filename="results/all_results.joblib")
