# Stacking


In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [17]:
# Load the claims data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/claims_data.csv')

In [18]:
# Transform the target: store log1p(loss) = log(1 + loss) in a new 'log_loss'
# column. The inverse used at evaluation time is np.expm1.
df['log_loss'] = df['loss'].pipe(np.log1p)

In [19]:
# Numeric predictors: every numeric column except the raw target ('loss') and
# its log transform ('log_loss'), so the target never leaks into the features.
# NOTE(review): any numeric identifier column would also be picked up here — confirm.
numerical_features = [
    column
    for column in df.select_dtypes(include=np.number).columns
    if column not in ('loss', 'log_loss')
]
# Categorical predictors: every object- or category-typed column.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Feature matrix (numeric columns first, then categorical) and log-scale target.
X = df[numerical_features + categorical_features]
y = df['log_loss']

In [20]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# --- For final evaluation ---
# Invert the log1p transform (expm1(x) = exp(x) - 1) so the test target is also
# available on the original 'loss' scale.
y_test_original = y_test_log.pipe(np.expm1)

## Define Models
- Ridge
- GLM
- Lasso

### GLM - Minh's code here

### Ridge Regression - Tahia's code here
- Scale the numerical features
- One-hot encode the categorical features
- Define the ridge model

In [None]:
# Preprocessing Pipeline
# Transformer for numerical data: standardise every numeric column.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Transformer for categorical data: one-hot encode every categorical column.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Alias under the original (misspelled) name so any cell still using it keeps working.
catgeorical_transformer = categorical_transformer

# Combine both transformers with a ColumnTransformer.
# The column lists are `numerical_features` / `categorical_features`, built
# when X was assembled. The names used before (`numerical_cols`,
# `categorical_cols`) are never defined in the visible cells and raised NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [None]:
# Ridge Regression
# End-to-end estimator: the shared preprocessor feeds a Ridge regressor that
# uses scikit-learn's default settings.
ridge_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

### Lasso Regression - Shemarie's code here

## Create and Fit Stacked Model

In [None]:
# Base learners for the stack, given as (name, estimator) pairs.
# NOTE(review): glm_pipeline and lasso_pipeline are not defined in the visible
# cells; they must be created in the GLM and Lasso sections before this runs.
estimators = [
    ('glm', glm_pipeline),
    ('ridge', ridge_model),
    ('lasso', lasso_pipeline),
]

# The final estimator (the "manager" model) is a plain linear regression fitted
# on the base learners' predictions, using 5-fold cross-validation.
stacked_model = StackingRegressor(
    estimators,
    final_estimator=LinearRegression(),
    cv=5,
)
print("Fitting the stacked model...")
stacked_model.fit(X_train, y_train_log)
print("Done.")

## Evaluate Stacked Model

In [None]:
# --- 5. Evaluate the Stacked Model ---
print("\nStacked Model Results:")
log_predictions = stacked_model.predict(X_test)

# Metrics on the log scale, i.e. the scale the model was trained on.
r2_log = r2_score(y_true=y_test_log, y_pred=log_predictions)
mae_log = mean_absolute_error(y_true=y_test_log, y_pred=log_predictions)
print(f"R-squared (log scale): {r2_log:.4f}")
print(f"MAE (log scale): {mae_log:.4f}")

# Metric on the original scale: undo the log1p transform with expm1 so the
# error is expressed in the same units as 'loss'.
original_predictions = np.expm1(log_predictions)
mae = mean_absolute_error(y_true=y_test_original, y_pred=original_predictions)
print(f"MAE (original scale): ${mae:,.2f}")