# Modeling

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import shap

ModuleNotFoundError: No module named 'xgboost'

In [3]:
# Load the pipe-delimited policy/claims extract.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning echoed in this cell's
# output looks like pandas' mixed-type DtypeWarning, which this setting avoids.
df = pd.read_csv(
    '../data/MachineLearningRating_v3.txt',
    delimiter='|',
    low_memory=False,
)

# Rows with a strictly positive claim amount; computed once and reused below.
has_claim = df['TotalClaims'] > 0

# Target 1: claim severity -> keep only the policies that have a claim.
# .copy() detaches the subset from df, so the columns added to df below are
# not present on df_claimed.
df_claimed = df[has_claim].copy()

# Derived columns on the full frame: a boolean claim flag and the
# premium-minus-claims margin.
df['HasClaim'] = has_claim
df['Margin'] = df['TotalPremium'] - df['TotalClaims']


  df = pd.read_csv('../data/MachineLearningRating_v3.txt', delimiter='|')


In [None]:
# Columns that are one-hot encoded before modelling.
categorical_features = ['Gender', 'Province', 'VehicleType']

# Columns that are passed to the models without encoding.
remaining_features = [
    'RegistrationYear',
    'CustomValueEstimate',
    'SumInsured',
    'CalculatedPremiumPerTerm',
]

# Full predictor list, categorical columns first.
features = categorical_features + remaining_features

# Column the regressors are trained to predict.
target = 'TotalClaims'


In [None]:
# Keep only the predictor columns plus the target, then discard every row
# that has a missing value in any of those columns.
# NOTE(review): dropna() is all-or-nothing per row; worth checking how many
# claimed policies survive this step.
model_columns = [*features, target]
df_claimed = df_claimed.loc[:, model_columns].dropna()


In [None]:
# One-hot encode the categorical predictors. drop_first=True omits the first
# level of each variable, so k categories become k-1 indicator columns.
dummy_columns = ['Gender', 'Province', 'VehicleType']
df_encoded = pd.get_dummies(
    df_claimed,
    columns=dummy_columns,
    drop_first=True,
)


In [None]:
# Separate the target column from the predictor matrix.
y = df_encoded[target]
X = df_encoded.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline model: ordinary least squares on the training split.
lr = LinearRegression()

# Fit the predictors against the target vector. Passing X_train as the second
# argument would regress the features onto themselves, and predict() would
# then return a feature-shaped matrix instead of one value per row.
lr.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_lr = lr.predict(X_test)


In [None]:
# Random forest with 100 trees and a fixed seed; fit() returns the fitted
# estimator, so construction and training are chained.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)


In [None]:
# Gradient-boosted trees: hyperparameters collected in one place, then the
# model is trained on the training split.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'random_state': 42,
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_xgb = xgb.predict(X_test)


In [None]:
def evaluate(y_true, y_pred, name):
    """Print the RMSE and R² of a set of predictions on one line.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values to score against ``y_true``.
        name: Label printed in front of the metrics.

    Returns:
        None. The metrics are only printed, rounded to two decimals.
    """
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    report = f"{name} - RMSE: {rmse:.2f}, R²: {r2:.2f}"
    print(report)

# Score every model against the same held-out targets, one report line each.
evaluate(y_true=y_test, y_pred=y_pred_lr, name="Linear Regression")
evaluate(y_true=y_test, y_pred=y_pred_rf, name="Random Forest")
evaluate(y_true=y_test, y_pred=y_pred_xgb, name="XGBoost")


In [None]:
# Build a SHAP explainer around the fitted XGBoost model.
explainer = shap.Explainer(xgb)

# Attribute each held-out prediction to the input features.
shap_values = explainer(X_test)

# Summary plot of the SHAP values alongside the feature values of the
# held-out rows.
shap.summary_plot(shap_values, features=X_test)
