# 06 — Train Models (Per Make & Body Type)

In this notebook we train a **separate model per (Make, Body_Type) group**.

**Modelling approach:**
- Time-based split:
  - 80% Train
  - 10% Validation
  - 10% Test
- Features:
  - Year
  - Month
  - Month_Since_Start
  - Make (one-hot)
  - Body_Type (one-hot)
- Model:
  - `GradientBoostingRegressor`
  - Small hyperparameter grid, chosen by validation MAE
- Outputs:
  - `final_predictions_detailed.csv` (row-level actual vs predicted by split)
  - `model_scoring_summary.csv` (MAE / RMSE per segment, computed on the validation and test rows combined)


In [None]:
import os
import math
import warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

DATA_DIR = "data"


def _in_data_dir(filename):
    """Return the path of `filename` inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


# Output locations written by the final cell.
PRED_PATH = _in_data_dir("final_predictions_detailed.csv")
METRICS_PATH = _in_data_dir("model_scoring_summary.csv")

# Load the feature table and parse Year_Month into datetimes so that later
# sorting and month-based splitting operate on real dates, not strings.
df = pd.read_csv(_in_data_dir("clean_sales_with_features.csv")).assign(
    Year_Month=lambda frame: pd.to_datetime(frame["Year_Month"])
)

In [None]:
# Collect one chronologically ordered frame per (Make, Body_Type) pair,
# keeping only pairs that are observed in at least 16 distinct months.
groups = {
    segment: rows.sort_values("Year_Month").reset_index(drop=True)
    for segment, rows in df.groupby(["Make", "Body_Type"])
    if rows["Year_Month"].nunique() >= 16
}

print(f"Total groups for modelling: {len(groups)}")


In [None]:
# Train one model per (Make, Body_Type) group and collect predictions + metrics.
#
# For each group the distinct months are split chronologically 80/10/10 into
# train / validation / test. Every candidate in `param_grid` is fitted on the
# training months and the one with the lowest validation MAE is kept.

# Fixed seed: tree building permutes features at every split, so without it
# equally good splits can be resolved differently from run to run.
RANDOM_STATE = 42

# Modelling configuration shared by all groups (hoisted out of the loop).
categorical_cols = ["Make", "Body_Type"]
numeric_cols = ["Year", "Month", "Month_Since_Start"]
feature_cols = ["Year", "Month", "Month_Since_Start", "Make", "Body_Type"]
target_col = "Units_Sold"

param_grid = [
    {"n_estimators": 100, "learning_rate": 0.1,  "max_depth": 2},
    {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 3},
    {"n_estimators": 300, "learning_rate": 0.05, "max_depth": 4},
    {"n_estimators": 400, "learning_rate": 0.03, "max_depth": 3},
]


def _build_pipeline(params):
    """Return an unfitted preprocessing + GradientBoostingRegressor pipeline.

    A new ColumnTransformer is created on every call so that the pipeline
    selected as best never shares fitted state with other candidates.
    An explicit ``random_state`` in `params` overrides RANDOM_STATE.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
            ("num", "passthrough", numeric_cols),
        ]
    )
    model = GradientBoostingRegressor(**{"random_state": RANDOM_STATE, **params})
    return Pipeline([("prep", preprocessor), ("model", model)])


def _time_split(grp):
    """Split a group's rows into (train, validation, test) frames by month.

    The distinct months are sorted explicitly, so the split is chronological
    regardless of the row order of `grp`: first 80% of months -> train,
    next 10% -> validation, remainder -> test.
    """
    months = grp["Year_Month"].drop_duplicates().sort_values()
    n = len(months)
    train_end = int(n * 0.8)
    val_end = int(n * 0.9)

    month_slices = (
        months.iloc[:train_end],
        months.iloc[train_end:val_end],
        months.iloc[val_end:],
    )
    return tuple(grp[grp["Year_Month"].isin(m)].copy() for m in month_slices)


def _fit_best(train_df, val_df):
    """Fit every candidate on `train_df`; return the one with the lowest validation MAE.

    Returns None when no candidate could be selected (e.g. empty `param_grid`).
    """
    X_train, y_train = train_df[feature_cols], train_df[target_col]
    X_val, y_val = val_df[feature_cols], val_df[target_col]

    best_model, best_mae = None, np.inf
    for params in param_grid:
        candidate = _build_pipeline(params)
        candidate.fit(X_train, y_train)
        mae = mean_absolute_error(y_val, candidate.predict(X_val))
        if mae < best_mae:
            best_mae, best_model = mae, candidate
    return best_model


def _mae_rmse(model, frame):
    """Return (MAE, RMSE) of `model` on the rows of `frame`."""
    y_true = frame[target_col]
    y_pred = model.predict(frame[feature_cols])
    return (
        mean_absolute_error(y_true, y_pred),
        math.sqrt(mean_squared_error(y_true, y_pred)),
    )


predictions_list = []
metrics = []

for (make, body), grp in groups.items():
    train_df, val_df, test_df = _time_split(grp)

    # Skip groups for which any of the three splits came out empty.
    if train_df.empty or val_df.empty or test_df.empty:
        continue

    best_model = _fit_best(train_df, val_df)
    if best_model is None:
        continue

    # Row-level actual vs predicted for every split.
    for split_name, split_df in [("Train", train_df), ("Validation", val_df), ("Test", test_df)]:
        predictions_list.append(
            split_df.assign(
                Split=split_name,
                Predicted_Units_Sold=best_model.predict(split_df[feature_cols]),
            )
        )

    # MAE / RMSE / N keep their original meaning (validation + test combined).
    # Because the validation rows were used to pick the model, those figures
    # are optimistic; the Test_* columns report the untouched test rows only.
    vt = pd.concat([val_df, test_df])
    mae, rmse = _mae_rmse(best_model, vt)
    test_mae, test_rmse = _mae_rmse(best_model, test_df)
    metrics.append({
        "Make": make,
        "Body_Type": body,
        "MAE": mae,
        "RMSE": rmse,
        "N": len(vt),
        "Test_MAE": test_mae,
        "Test_RMSE": test_rmse,
        "Test_N": len(test_df),
    })

In [None]:
# Row-level predictions: stack all groups/splits, then order by segment and date.
all_predictions = pd.concat(predictions_list, ignore_index=True)
final_predictions = all_predictions.sort_values(
    ["Make", "Body_Type", "Year_Month"], ignore_index=True
)
final_predictions.to_csv(PRED_PATH, index=False)
print(f"Saved detailed predictions to: {PRED_PATH}")

# Per-segment scores, lowest MAE first.
metrics_df = pd.DataFrame(metrics).sort_values("MAE", ignore_index=True)
metrics_df.to_csv(METRICS_PATH, index=False)
print(f"Saved metrics to: {METRICS_PATH}")

# Show the ten best-scoring segments.
display(metrics_df.iloc[:10])