# 03 - Model Training

This notebook covers:
- Joining features and aggregating data
- Creating lag features
- Training classification and regression models
- Evaluating performance

In [None]:
# Build the modelling frame: right-join the dispersion table onto `aggregated`
# (every row of `aggregated` is kept), then order rows by system, year and month
# so that the shifts below move along each system's rows in that order.
# NOTE(review): the join keys omit MONTH although rows are ordered by MONTH. If
# 'dispersione' is a per-year value repeated over the months of a year, the lags
# taken inside that year would equal the target itself — confirm the granularity.
df_finale = pd.merge(
    disp_by_codsistema,
    aggregated,
    on=['CODSISTEMA', 'YEAR'],
    how='right',
).sort_values(['CODSISTEMA', 'YEAR', 'MONTH'])

# Lag features: the target value found 1 to 6 rows earlier within the same
# CODSISTEMA group. The first rows of each group have no predecessor and get NaN.
dispersione_by_system = df_finale.groupby('CODSISTEMA')['dispersione']
for lag in range(1, 7):
    df_finale[f'dispersione_lag_{lag}'] = dispersione_by_system.shift(lag)

In [None]:
# Split data for training and testing
from sklearn.metrics import f1_score
import lightgbm as lgb
import numpy as np

# Hold out the rows of year 2022 as the test set; every other row is used for training.
is_holdout_year = df_finale['YEAR'] == 2022
train = df_finale[~is_holdout_year]
test = df_finale[is_holdout_year]

# Binary classifier
# Features: every column except the target itself.
X_train_binary = train.drop(columns=['dispersione'])
X_test_binary = test.drop(columns=['dispersione'])
# Labels: 1 where the target is strictly positive, 0 otherwise
# (a missing target compares False and therefore becomes 0).
y_train_binary = train['dispersione'].gt(0).astype(int)
y_test_binary = test['dispersione'].gt(0).astype(int)

In [None]:
# Train LightGBM classifier
# Missing feature values are replaced by 0 and the frames are handed over as
# plain arrays, so the model sees columns by position rather than by name.
X_train_binary_filled = X_train_binary.fillna(0).values
X_test_binary_filled = X_test_binary.fillna(0).values

clf = lgb.LGBMClassifier(verbosity=-1)
clf.fit(X_train_binary_filled, y_train_binary)
preds_bin = clf.predict(X_test_binary_filled)

# Macro averaging gives both classes the same weight in the reported F1.
macro_f1 = f1_score(y_test_binary, preds_bin, average='macro')
print('F1 Score:', macro_f1)

In [None]:
# Regressor (only on non-zero targets)
# Rows whose target is missing are excluded as well: the right join that builds
# df_finale can leave 'dispersione' as NaN, and `NaN != 0` evaluates to True, so
# without the explicit notna() check such rows would pass the "non-zero" filter
# and end up as NaN labels for training and for the MAE computed afterwards.
# NOTE(review): the classifier labels positives with `> 0`, whereas this filter
# keeps any non-zero value; if negative targets can occur the two definitions
# disagree — confirm which one is intended.
train_reg = train[train.dispersione.notna() & (train.dispersione != 0)]
test_reg = test[test.dispersione.notna() & (test.dispersione != 0)]

# Features are every column except the target; the target is the raw value.
X_train_reg = train_reg.drop(columns='dispersione')
X_test_reg = test_reg.drop(columns='dispersione')
y_train_reg = train_reg.dispersione
y_test_reg = test_reg.dispersione

# Same preprocessing as the classifier: missing features become 0 and the
# frames are passed as plain arrays.
reg = lgb.LGBMRegressor(verbosity=-1)
reg.fit(X_train_reg.fillna(0).values, y_train_reg)
preds_reg = reg.predict(X_test_reg.fillna(0).values)

In [None]:
# Evaluate regression performance
from sklearn.metrics import mean_absolute_error

# Reference error: the MAE obtained by predicting the mean of the evaluated
# targets for every row (i.e. their mean absolute deviation).
mean_target = y_test_reg.mean()
baseline_mae = (y_test_reg - mean_target).abs().mean()

# Error of the fitted regressor on the same rows.
actual_mae = mean_absolute_error(y_test_reg, preds_reg)

# Model error relative to the reference error: a value below 1 means the model's
# MAE is smaller than the constant-mean reference, above 1 means it is larger.
mae_ratio = actual_mae / baseline_mae

print('Baseline MAE:', baseline_mae)
print('Model MAE:', actual_mae)
print('Improvement ratio:', mae_ratio)