# Frequency & Severity GLMs for Auto Insurance

This notebook builds GLMs for claim **frequency** and **severity**, then combines
them into a pure premium estimate (expected frequency × expected severity), as
required for the MA 326 project. We use driver, vehicle, territory, and
claims-history factors as predictors.


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm

# Read the pre-cleaned dataset from its CSV file.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_CSV = "ma326-project/motor_cleaned.csv"
df = pd.read_csv(DATA_CSV)

# Show the first rows as a quick visual check of the load
print("Loaded cleaned dataset:")
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'ma326-project/motor_cleaned.csv'

# Predictor Selection

In [None]:
# Columns used as explanatory variables in both GLMs (order is preserved in the
# design matrix).
predictors = (
    ["Age", "Seniority", "Second_driver", "Distribution_channel"]
    + ["Power", "Weight", "Value_vehicle", "Cylinder_capacity"]
    + ["Type_fuel", "Area", "Type_risk"]
    + ["N_claims_history", "R_Claims_history"]
)

# Restrict to the modelling columns and drop every row that has a missing value
# in ANY of them -- the predictors and both targets.
# NOTE(review): "Severity" is part of this dropna, so no NaN severities survive
# and `mask_sev` below is always all True. If Severity is NaN for claim-free
# policies in the CSV, this also removes those policies from the frequency
# data -- confirm against the cleaned dataset.
target_cols = ["N_claims_year", "Severity"]
df_model = df[predictors + target_cols].dropna()

# Dummy-encode the listed categorical predictors, dropping the first level of
# each; all other predictors pass through unchanged.
cat_cols = ["Type_fuel", "Area", "Type_risk"]
X_full = pd.get_dummies(df_model[predictors], columns=cat_cols, drop_first=True)

# Frequency design matrix and target: every row of df_model
X_freq = X_full
y_freq = df_model["N_claims_year"]

# Severity design matrix and target: rows whose Severity is not missing
mask_sev = df_model["Severity"].notna()
X_sev = X_full.loc[mask_sev]
y_sev = df_model.loc[mask_sev, "Severity"]


# Train/Test Split (40/60)

In [None]:
# Both splits share the same settings: 40% of rows go to training, the
# remaining 60% to testing, with a fixed seed for reproducibility.
split_opts = {"train_size": 0.4, "random_state": 42}

# Frequency data
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_freq, y_freq, **split_opts)

# Severity data
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_sev, y_sev, **split_opts)


# Frequency Model

In [None]:
# Prepend an intercept column to both design matrices.
# has_constant="add" forces the column to be added even if some predictor
# happens to be constant within one split; the default ("skip") would leave the
# intercept out for that split only, so train and test would end up with
# different column counts and predict() would fail.
Xf_train_const = sm.add_constant(Xf_train, has_constant="add")
Xf_test_const  = sm.add_constant(Xf_test, has_constant="add")

# Sanity check: every column must be convertible to float below
print(Xf_train_const.dtypes)
print(yf_train.dtype)

# Convert to plain NumPy float arrays
Xf_train_np = Xf_train_const.to_numpy(dtype=float)
Xf_test_np  = Xf_test_const.to_numpy(dtype=float)
yf_train_np = yf_train.to_numpy(dtype=float)
yf_test_np  = yf_test.to_numpy(dtype=float)

# Fit Poisson GLM for the yearly claim count
# NOTE(review): no exposure/offset term is passed -- confirm that all policies
# cover a comparable exposure period.
freq_model = sm.GLM(yf_train_np, Xf_train_np, family=sm.families.Poisson())
freq_res   = freq_model.fit()
print(freq_res.summary())

# Residual deviance per residual degree of freedom; values well above 1
# suggest the counts are overdispersed relative to the Poisson assumption.
overdispersion = freq_res.deviance / freq_res.df_resid
print("Overdispersion ratio (Poisson):", overdispersion)

# Predicted claim counts on the test set
freq_pred_test = freq_res.predict(Xf_test_np)
print(freq_pred_test[:10])


# Severity Model

In [None]:
# Prepend an intercept column to both design matrices.
# has_constant="add" forces the column to be added even if some predictor
# happens to be constant within one split; the default ("skip") would leave the
# intercept out for that split only, so train and test would end up with
# different column counts and predict() would fail.
Xs_train_const = sm.add_constant(Xs_train, has_constant="add")
Xs_test_const  = sm.add_constant(Xs_test, has_constant="add")

# Sanity check: every column must be convertible to float below
print(Xs_train_const.dtypes)
print(ys_train.dtype)

# Convert to plain NumPy arrays (float)
Xs_train_np = Xs_train_const.to_numpy(dtype=float)
Xs_test_np  = Xs_test_const.to_numpy(dtype=float)
ys_train_np = ys_train.to_numpy(dtype=float)
ys_test_np  = ys_test.to_numpy(dtype=float)

# Gamma needs strictly positive y, so fit only on rows with severity > 0
mask_sev_pos = ys_train_np > 0
Xs_train_np_pos = Xs_train_np[mask_sev_pos]
ys_train_np_pos = ys_train_np[mask_sev_pos]

# Fit Gamma GLM with log link.
# `links.Log` is the current class name; the lowercase `links.log` alias used
# previously is deprecated in recent statsmodels releases.
sev_model = sm.GLM(
    ys_train_np_pos,
    Xs_train_np_pos,
    family=sm.families.Gamma(link=sm.families.links.Log())
)
sev_res = sev_model.fit()
print(sev_res.summary())

# Predicted severity for every row of the severity test set
sev_pred_test = sev_res.predict(Xs_test_np)
print(sev_pred_test[:10])

# Model Metrics

In [None]:
# Frequency metrics: observed vs. predicted claim counts on the test rows
print("Frequency MSE:", mean_squared_error(yf_test_np, freq_pred_test))
print("Frequency MAE:", mean_absolute_error(yf_test_np, freq_pred_test))

# Severity metrics: the Gamma model is fit on strictly positive severities
# only, so it is scored on the strictly positive test severities only.
# Including non-positive rows would measure error on cases the model was
# never meant to describe.
sev_test_pos = ys_test_np > 0
print("Severity MSE:", mean_squared_error(ys_test_np[sev_test_pos], sev_pred_test[sev_test_pos]))
print("Severity MAE:", mean_absolute_error(ys_test_np[sev_test_pos], sev_pred_test[sev_test_pos]))

# Pure premium prediction: frequency × severity.
# Severity is predicted on the frequency test matrix so that both factors refer
# to exactly the same policies, row for row, instead of relying on the two
# separate train/test splits happening to line up.
sev_pred_for_freq_rows = sev_res.predict(Xf_test_np)
pure_premium_pred = freq_pred_test * sev_pred_for_freq_rows

print("Pure premium predictions (first 10):")
print(pure_premium_pred[:10])

# Labeled Series that keeps the policy index of the frequency test set
pure_premium_series = pd.Series(pure_premium_pred, index=yf_test.index, name="PurePremium")
print(pure_premium_series.head())