In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap

In [8]:
data = pd.read_csv('../data/processed/clean_dataset.csv')

In [9]:
data.shape

(1000098, 46)

In [None]:
data.isna().sum()

UnderwrittenCoverID         0
PolicyID                    0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
LegalType                   0
Title                       0
Language                    0
Bank                        0
AccountType                 0
MaritalStatus               0
Gender                      0
Country                     0
Province                    0
PostalCode                  0
MainCrestaZone              0
SubCrestaZone               0
ItemType                    0
mmcode                      0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity               0
kilowatts                   0
bodytype                    0
NumberOfDoors               0
VehicleIntroDate            0
AlarmImmobiliser            0
TrackingDevice              0
CapitalOutstanding          0
NewVehicle                  0
SumInsured

In [10]:
# Feature Engineering
data['Margin'] = data['TotalPremium'] - data['TotalClaims']

In [11]:
# Identify categorical features
categorical_cols = data.select_dtypes(include=['object']).columns

# Separate high-cardinality and low-cardinality categorical features
high_cardinality_cols = [col for col in categorical_cols if data[col].nunique() > 10]
low_cardinality_cols = list(set(categorical_cols) - set(high_cardinality_cols))

In [12]:
# Apply label encoding to high-cardinality columns
label_encoders = {col: LabelEncoder() for col in high_cardinality_cols}

for col in high_cardinality_cols:
    data[col] = label_encoders[col].fit_transform(data[col])

In [14]:
# Apply one-hot encoding only to low-cardinality columns
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(data[low_cardinality_cols])

# Convert one-hot encoded data into a DataFrame
low_card_encoded_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(low_cardinality_cols))

In [15]:
# Drop original low-cardinality columns and concatenate encoded data
data = data.drop(columns=low_cardinality_cols).reset_index(drop=True)
data = pd.concat([data, low_card_encoded_df], axis=1)
data.shape

(1000098, 89)

In [17]:
# Train-Test Split
X = data.drop(['TotalPremium', 'TotalClaims'], axis=1)
y = data['TotalPremium']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

In [20]:
# Random Forest
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [21]:
# XGBoost
xgb_model = XGBRegressor(random_state=42, max_depth=4, tree_method='gpu_hist', predictor='gpu_predictor', n_estimators=50, learning_rate=0.1)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [22]:
def evaluate_model(y_test, y_pred, model_name):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} Evaluation:")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R² Score: {r2}")
    print("-" * 50)

In [23]:
evaluate_model(y_test, y_pred_linear, "Linear Regression")

Linear Regression Evaluation:
Mean Absolute Error: 57.07293289223882
Mean Squared Error: 15706.794959218325
R² Score: 0.4306693967044357
--------------------------------------------------


In [24]:
evaluate_model(y_test, y_pred_rf, "Random Forest")

Random Forest Evaluation:
Mean Absolute Error: 0.1666454994302051
Mean Squared Error: 407.703317915567
R² Score: 0.9852218115435293
--------------------------------------------------


In [25]:
evaluate_model(y_test, y_pred_xgb, "XGBoost")

XGBoost Evaluation:
Mean Absolute Error: 25997.875896813202
Mean Squared Error: 901164441.7616645
R² Score: -32663.875082969324
--------------------------------------------------
