In [13]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import psutil
import platform

# Load data
df_train = pd.read_csv('/content/df_train_part2_20250212.csv').drop(columns=['Unnamed: 0'])

# Define target variables
df_train['PurePremium'] = df_train['ClaimAmount'] / df_train['Exposure']
df_train['Frequency'] = df_train['ClaimNb'] / df_train['Exposure']
df_severity = df_train[df_train['ClaimAmount'] > 0].copy()
df_severity['Severity'] = df_severity['ClaimAmount'] / df_severity['ClaimNb']

# Split data into features and targets
categorical_features = ['Power', 'Brand', 'Gas', 'Region']
numerical_features = ['CarAge', 'DriverAge', 'Density']

# Frequency data
X = df_train.drop(columns=['ClaimAmount', 'PurePremium', 'Frequency'])
y_freq = df_train['Frequency']
weights_freq = df_train['Exposure']

# Severity data
X_sev = df_severity.drop(columns=['ClaimAmount', 'PurePremium', 'Frequency', 'Severity'])
y_sev = df_severity['Severity']
weights_sev = df_severity['ClaimNb']

# Split into train/test sets (frequency)
X_train_freq, X_test_freq, y_train_freq, y_test_freq, w_train_freq, w_test_freq = train_test_split(
    X, y_freq, weights_freq, test_size=0.2, random_state=42
)

# Split into train/test sets (severity)
X_train_sev, X_test_sev, y_train_sev, y_test_sev, w_train_sev, w_test_sev = train_test_split(
    X_sev, y_sev, weights_sev, test_size=0.2, random_state=42
)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

**GLM (POISSON+GAMMA)**

In [28]:
from sklearn.linear_model import PoissonRegressor, GammaRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import time

# Define the preprocessor
categorical_features = ["Power", "Brand", "Gas", "Region"]
numerical_features = ["CarAge", "DriverAge", "Density"]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Frequency: Poisson GLM
start = time.time()
model_freq = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', PoissonRegressor(max_iter=1000))
])
model_freq.fit(X_train_freq, y_train_freq, regressor__sample_weight=w_train_freq)
train_time_freq = time.time() - start

# Severity: Gamma GLM
start = time.time()
model_sev = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GammaRegressor(max_iter=1000))
])
model_sev.fit(X_train_sev, y_train_sev, regressor__sample_weight=w_train_sev)
train_time_sev = time.time() - start

# Ensure w_test_freq is aligned with the samples used for prediction
common_index = X_test_freq.index.intersection(X_test_sev.index)

# Predict on the test set
y_pred_freq_common = model_freq.predict(X_test_freq.loc[common_index])
y_pred_sev_common = model_sev.predict(X_test_sev.loc[common_index])

# Calculate the combined predictions for Pure Premium
y_pred_pure_premium = y_pred_freq_common * y_pred_sev_common
y_true_pure_premium = y_test_freq.loc[common_index] * y_test_sev.loc[common_index]


# Evaluation
r2 = r2_score(y_true_pure_premium, y_pred_pure_premium)
mse = mean_squared_error(y_true_pure_premium, y_pred_pure_premium)
mae = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium)


print(f"GLM (Poisson + Gamma): R²={r2:.3f}, MSE={mse:.1f}, MAE={mae:.1f}")
print(f"Training Time: {train_time_freq + train_time_sev:.1f}s")

GLM (Poisson + Gamma): R²=-0.152, MSE=159769529.0, MAE=4604.2
Training Time: 0.3s


**RANDOM FOREST**

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Frequency: Random Forest
start = time.time()
model_freq_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
model_freq_rf.fit(X_train_freq, y_train_freq, regressor__sample_weight=w_train_freq)
train_time_freq_rf = time.time() - start

# Severity: Random Forest
start = time.time()
model_sev_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
model_sev_rf.fit(X_train_sev, y_train_sev, regressor__sample_weight=w_train_sev)
train_time_sev_rf = time.time() - start

# Predictions
y_pred_freq_rf = model_freq_rf.predict(X_test_freq.loc[common_index])
y_pred_sev_rf = model_sev_rf.predict(X_test_sev.loc[common_index])

# Compute Pure Premium
y_pred_pure_premium_rf = y_pred_freq_rf * y_pred_sev_rf

# Evaluate Random Forest Models
r2_rf = r2_score(y_true_pure_premium, y_pred_pure_premium_rf)
mse_rf = mean_squared_error(y_true_pure_premium, y_pred_pure_premium_rf)
mae_rf = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_rf)

print(f"Random Forest: R²={r2_rf:.3f}, MSE={mse_rf:.1f}, MAE={mae_rf:.1f}")
print(f"Training Time (Random Forest): {train_time_freq_rf + train_time_sev_rf:.1f}s")

Random Forest: R²=-0.151, MSE=159733754.4, MAE=4633.5
Training Time (Random Forest): 613.6s


**XGBOOST**

In [43]:
!pip install --upgrade scikit-learn
!pip install --upgrade xgboost



In [50]:
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-Hot Encode categorical features
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_freq_encoded = encoder.fit_transform(X_train_freq[categorical_features])
X_test_freq_encoded = encoder.transform(X_test_freq[categorical_features])
X_train_sev_encoded = encoder.transform(X_train_sev[categorical_features])  # Match encoding
X_test_sev_encoded = encoder.transform(X_test_sev[categorical_features])  # Match encoding

# Standardize numerical features
scaler = StandardScaler()
X_train_freq_scaled = scaler.fit_transform(X_train_freq[numerical_features])
X_test_freq_scaled = scaler.transform(X_test_freq[numerical_features])
X_train_sev_scaled = scaler.transform(X_train_sev[numerical_features])  # Match scaling
X_test_sev_scaled = scaler.transform(X_test_sev[numerical_features])  # Match scaling

# Ensure training and test sets have the same number of records
X_train_freq_final = np.hstack([X_train_freq_scaled, X_train_freq_encoded])
X_test_freq_final = np.hstack([X_test_freq_scaled, X_test_freq_encoded])
X_train_sev_final = np.hstack([X_train_sev_scaled, X_train_sev_encoded])
X_test_sev_final = np.hstack([X_test_sev_scaled, X_test_sev_encoded])

# Convert indices into NumPy boolean masks
test_freq_mask = X_test_freq.index.isin(X_test_sev.index)
test_sev_mask = X_test_sev.index.isin(X_test_freq.index)

# Filter test sets using boolean masks
X_test_freq_final = X_test_freq_final[test_freq_mask]
X_test_sev_final = X_test_sev_final[test_sev_mask]
y_test_freq_aligned = y_test_freq[test_freq_mask]
y_test_sev_aligned = y_test_sev[test_sev_mask]

# XGBoost Frequency Model
start = time.time()
model_freq_xgb = XGBRegressor(objective="count:poisson", random_state=42)
model_freq_xgb.fit(X_train_freq_final, y_train_freq, sample_weight=w_train_freq)
train_time_freq = time.time() - start

# XGBoost Severity Model
start = time.time()
model_sev_xgb = XGBRegressor(objective="reg:gamma", random_state=42)
model_sev_xgb.fit(X_train_sev_final, y_train_sev, sample_weight=w_train_sev)
train_time_sev = time.time() - start

# Predictions (on aligned test set)
y_pred_freq_xgb = model_freq_xgb.predict(X_test_freq_final)
y_pred_sev_xgb = model_sev_xgb.predict(X_test_sev_final)

# Compute Pure Premium
y_pred_pure_premium_xgb = y_pred_freq_xgb * y_pred_sev_xgb
y_true_pure_premium = y_test_freq_aligned * y_test_sev_aligned

# Evaluate XGBoost Model
results = {
    "Model": "XGBoost",
    "R2": r2_score(y_true_pure_premium, y_pred_pure_premium_xgb),
    "MSE": mean_squared_error(y_true_pure_premium, y_pred_pure_premium_xgb),
    "MAE": mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_xgb)
}

# Print results
print(f"XGBoost: R²={results['R2']:.3f}, MSE={results['MSE']:.1f}, MAE={results['MAE']:.1f}")
print(f"Training Time (XGBoost): {train_time_freq + train_time_sev:.1f}s")


XGBoost: R²=-0.153, MSE=159993081.3, MAE=4632.7
Training Time (XGBoost): 6.2s


**LIGHT GBM**

In [75]:
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMRegressor
import time
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Frequency Model: LightGBM
start = time.time()
model_freq_lgb = LGBMRegressor(
    objective='poisson',
    verbose=-1  )
model_freq_lgb.fit(X_train_freq, y_train_freq, sample_weight=w_train_freq)
train_time_freq_lgb = time.time() - start

# Severity Model: LightGBM
start = time.time()
model_sev_lgb = LGBMRegressor(
    objective='gamma',
    verbose=-1
)
model_sev_lgb.fit(X_train_sev, y_train_sev, sample_weight=w_train_sev)
train_time_sev_lgb = time.time() - start

# Predictions
y_pred_freq_lgb = model_freq_lgb.predict(X_test_freq.loc[common_index])
y_pred_sev_lgb = model_sev_lgb.predict(X_test_sev.loc[common_index])

# Compute Pure Premium
y_pred_pure_premium_lgb = y_pred_freq_lgb * y_pred_sev_lgb

# Evaluate LightGBM Models
r2_lgb = r2_score(y_true_pure_premium, y_pred_pure_premium_lgb)
mse_lgb = mean_squared_error(y_true_pure_premium, y_pred_pure_premium_lgb)
mae_lgb = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_lgb)

print(f"LightGBM: R²={r2_lgb:.3f}, MSE={mse_lgb:.1f}, MAE={mae_lgb:.1f}")
print(f"Training Time (LightGBM): {train_time_freq_lgb + train_time_sev_lgb:.1f}s")


LightGBM: R²=-198643848652276992.000, MSE=27560791199832655557296128.0, MAE=330054576665.8
Training Time (LightGBM): 2.4s


**GRADIENT BOOST**

In [77]:
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import time
### ----> Gradient Boosting Model: Claim Frequency & Severity <---- ###

# Frequency: Gradient Boosting
start = time.time()
model_freq_gbm = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])
model_freq_gbm.fit(X_train_freq, y_train_freq, regressor__sample_weight=w_train_freq)
train_time_freq_gbm = time.time() - start

# Severity: Gradient Boosting
start = time.time()
model_sev_gbm = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])
model_sev_gbm.fit(X_train_sev, y_train_sev, regressor__sample_weight=w_train_sev)
train_time_sev_gbm = time.time() - start

# Predictions
y_pred_freq_gbm = model_freq_gbm.predict(X_test_freq.loc[common_index])
y_pred_sev_gbm = model_sev_gbm.predict(X_test_sev.loc[common_index])

# Compute Pure Premium
y_pred_pure_premium_gbm = y_pred_freq_gbm * y_pred_sev_gbm

# Evaluate Gradient Boosting Models
r2_gbm = r2_score(y_true_pure_premium, y_pred_pure_premium_gbm)
mse_gbm = mean_squared_error(y_true_pure_premium, y_pred_pure_premium_gbm)
mae_gbm = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_gbm)

print(f"Gradient Boosting: R²={r2_gbm:.3f}, MSE={mse_gbm:.1f}, MAE={mae_gbm:.1f}")
print(f"Training Time (Gradient Boosting): {train_time_freq_gbm + train_time_sev_gbm:.1f}s")



Gradient Boosting: R²=-0.152, MSE=159865994.0, MAE=4607.5
Training Time (Gradient Boosting): 52.8s


**CATBOOST**

In [54]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [56]:
### ----> CatBoost Models: Claim Frequency & Severity <---- ###

# Frequency: CatBoost
start = time.time()
model_freq_cat = CatBoostRegressor(loss_function='Poisson', cat_features=categorical_features, verbose=0)
model_freq_cat.fit(X_train_freq, y_train_freq, sample_weight=w_train_freq)
train_time_freq_cat = time.time() - start

# Severity: CatBoost
start = time.time()
model_sev_cat = CatBoostRegressor(loss_function='RMSE', cat_features=categorical_features, verbose=0)
model_sev_cat.fit(X_train_sev, y_train_sev, sample_weight=w_train_sev)
train_time_sev_cat = time.time() - start

# Predictions
y_pred_freq_cat = model_freq_cat.predict(X_test_freq.loc[common_index])
y_pred_sev_cat = model_sev_cat.predict(X_test_sev.loc[common_index])

# Compute Pure Premium
y_pred_pure_premium_cat = y_pred_freq_cat * y_pred_sev_cat

# Evaluate CatBoost Models
r2_cat = r2_score(y_true_pure_premium, y_pred_pure_premium_cat)
mse_cat = mean_squared_error(y_true_pure_premium, y_pred_pure_premium_cat)
mae_cat = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_cat)

print(f"CatBoost: R²={r2_cat:.3f}, MSE={mse_cat:.1f}, MAE={mae_cat:.1f}")
print(f"Training Time (CatBoost): {train_time_freq_cat + train_time_sev_cat:.1f}s")



CatBoost: R²=-12.255, MSE=1839019749.4, MAE=9589.2
Training Time (CatBoost): 193.7s


**BAYSEIAN RIDGE**

In [58]:
### ----> Bayesian Ridge Model: Claim Frequency & Severity <---- ###
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features) # Updated here
    ])
# Frequency: Bayesian Ridge Regression
start = time.time()
model_freq_bayes = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', BayesianRidge())
])
model_freq_bayes.fit(X_train_freq, y_train_freq, regressor__sample_weight=w_train_freq)
train_time_freq_bayes = time.time() - start

# Severity: Bayesian Ridge Regression
start = time.time()
model_sev_bayes = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', BayesianRidge())
])
model_sev_bayes.fit(X_train_sev, y_train_sev, regressor__sample_weight=w_train_sev)
train_time_sev_bayes = time.time() - start

# Predictions
y_pred_freq_bayes = model_freq_bayes.predict(X_test_freq.loc[common_index])
y_pred_sev_bayes = model_sev_bayes.predict(X_test_sev.loc[common_index])

# Compute Pure Premium
y_pred_pure_premium_bayes = y_pred_freq_bayes * y_pred_sev_bayes

# Evaluate Bayesian Ridge Models
r2_bayes = r2_score(y_true_pure_premium, y_pred_pure_premium_bayes)
mse_bayes = mean_squared_error(y_true_pure_premium, y_pred_pure_premium_bayes)
mae_bayes = mean_absolute_error(y_true_pure_premium, y_pred_pure_premium_bayes)

print(f"Bayesian Ridge: R²={r2_bayes:.3f}, MSE={mse_bayes:.1f}, MAE={mae_bayes:.1f}")
print(f"Training Time (Bayesian Ridge): {train_time_freq_bayes + train_time_sev_bayes:.1f}s")


Bayesian Ridge: R²=-0.151, MSE=159664878.8, MAE=4589.5
Training Time (Bayesian Ridge): 2.2s


**RESULTS**

In [73]:
import pandas as pd
from tabulate import tabulate

# Create a DataFrame with all model results
results_df = pd.DataFrame({
    "Model": ["GLM (Poisson + Gamma)", "XGBoost", "LightGBM", "Gradient Boosting", "CatBoost", "Bayesian Ridge"],
    "R²": [r2, results['R2'], r2_lgb, r2_gbm, r2_cat, r2_bayes],
    "MSE": [mse, results['MSE'], mse_lgb, mse_gbm, mse_cat, mse_bayes],
    "MAE": [mae, results['MAE'], mae_lgb, mae_gbm, mae_cat, mae_bayes],
    "Training Time (s)": [
        train_time_freq + train_time_sev,
        train_time_freq_xgb + train_time_sev_xgb,
        train_time_freq_lgb + train_time_sev_lgb,
        train_time_freq_gbm + train_time_sev_gbm,
        train_time_freq_cat + train_time_sev_cat,
        train_time_freq_bayes + train_time_sev_bayes
    ]
})

# Print the results in a well-formatted table
print(tabulate(results_df, headers="keys", tablefmt="grid", floatfmt=".3f"))


+----+-----------------------+-------------------------+--------------------------------+------------------+---------------------+
|    | Model                 |                      R² |                            MSE |              MAE |   Training Time (s) |
|  0 | GLM (Poisson + Gamma) |                  -0.152 |                  159769528.995 |         4604.159 |               6.184 |
+----+-----------------------+-------------------------+--------------------------------+------------------+---------------------+
|  1 | XGBoost               |                  -0.153 |                  159993081.298 |         4632.716 |               7.246 |
+----+-----------------------+-------------------------+--------------------------------+------------------+---------------------+
|  2 | LightGBM              | -198643848652276992.000 | 27560791199832655557296128.000 | 330054576665.769 |               2.123 |
+----+-----------------------+-------------------------+---------------------------