**Write the names of all group members (max. 5 members)**:
- Ramon, Kaspar
- Cyrill, Stotz

When submitting your work, please follow closely the template below.

# Exercise 1 (Poisson GLM)

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, model_selection, linear_model

RANDOM_STATE = 69   # for reproducibility

### Question 1.a

In [2]:
# Load `freMTPL2freq.csv` into a DataFrame.
# The file is parsed with ';' as the field separator and ',' as the decimal mark.
data_df = pd.read_csv('freMTPL2freq.csv', sep=';', decimal=',')

# Quick sanity checks: show the schema and verify the expected number of rows.
print("Column Names: ", data_df.columns)
print("Number of Rows: ", data_df.shape[0])
assert data_df.shape[0] == 678007

Column Names:  Index(['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas',
       'Density', 'Region', 'Exposure', 'ClaimNb'],
      dtype='object')
Number of Rows:  678007


In [10]:
# Pre-process features

def preprocess_features(df):
    """Return a feature-engineered copy of ``df``; the input frame is not modified.

    * ``VehPower``, ``DrivAge``, ``BonusMalus`` and ``Density`` are replaced by
      their natural logarithm.
    * ``VehAge`` is replaced by a string label: ``'0-6'`` for ages below 6,
      ``'6-13'`` for ages from 6 up to (but excluding) 13, and ``'13+'``
      for everything else.

    All other columns are passed through unchanged.
    """
    engineered = df.copy()

    # Log-transform the numerical features
    for column in ('VehPower', 'DrivAge', 'BonusMalus', 'Density'):
        engineered[column] = np.log(engineered[column])

    # Convert VehAge to categorical: the first matching condition wins,
    # anything matching neither condition falls through to the default label.
    vehicle_age = engineered['VehAge']
    engineered['VehAge'] = np.select(
        [vehicle_age < 6, vehicle_age < 13],
        ['0-6', '6-13'],
        default='13+',
    )

    return engineered

# Target variable y: claim frequency = number of claims per unit of exposure.
# The column is added to `data_df` first, so the processed copy below carries it as well.
data_df['ClaimFreq'] = data_df['ClaimNb'].div(data_df['Exposure'])

# Feature engineering on a copy of the raw data (see `preprocess_features`)
data_processed_df = preprocess_features(data_df)

### Question 1.b

In [11]:
# Summary statistics of the data set.
# Note: the average claim frequency is the plain (unweighted) mean over all policies.
print(f"Average Claim Frequency: {data_df['ClaimFreq'].mean()}")
print(f"Number of Claims: {data_df['ClaimNb'].sum()}")

# Count the policies with zero claims via a boolean mask and express them as a percentage
number_of_policies_without_claims = int(data_df['ClaimNb'].eq(0).sum())
num_of_policies_without_claims_rel = 100*number_of_policies_without_claims / len(data_df)
print(f"Number of Policies without Claims: {number_of_policies_without_claims} ({num_of_policies_without_claims_rel:.2f}%)")

# Total number of missing values across all processed columns
print(f"Number of NaNs: {data_processed_df.isna().sum().sum()}")

Average Claim Frequency: 0.11790710080746372
Number of Claims: 26383
Number of Policies without Claims: 653069 (96.32%)
Number of NaNs: 0


**COMMENT:** We can see that most policy holders have no claims at all (i.e., $96.32\%$ of the policy holders have zero claims). If we modeled the problem as a binary classification, the dataset would therefore be highly imbalanced. Poisson regression is a better fit here, because it models the number of claims per unit of exposure directly. \
Furthermore, the dataset does not contain any NaNs, so we do not have to deal with missing values – nice.

In [12]:
# Train-test split
# Features: every processed column except the exposure, the claim count and the target itself.
X = data_processed_df.drop(['Exposure', 'ClaimNb', 'ClaimFreq'], axis=1)
y = data_df['ClaimFreq']
exposure = data_df['Exposure']

X_train, X_test, y_train, y_test, exposure_train, exposure_test = model_selection.train_test_split(
    X, y, exposure, test_size=0.1, random_state=RANDOM_STATE
)

# Work on explicit copies so that the column assignments below operate on
# independent frames (avoids pandas chained-assignment warnings on the split results).
X_train = X_train.copy()
X_test = X_test.copy()

# Separate features
categorical_features = ['VehAge', 'VehBrand', 'VehGas', 'Region']
numerical_features = ['VehPower', 'DrivAge', 'BonusMalus', 'Density']

# Scale numerical features (statistics are fitted on the training data only)
scaler = preprocessing.StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# One-hot encode categorical features using the training data:
# the training columns define the feature layout. The test dummies are re-indexed to
# that layout, so a category that is missing in the test split becomes an all-zero
# column and a category unseen during training is dropped.
X_train = pd.get_dummies(X_train, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features).reindex(
    columns=X_train.columns, fill_value=0
)

print("Features: ", X_train.columns)
print("Number of Features: ", len(X_train.columns))
# Ensure that the test and train set have the same columns in the same order
assert list(X_train.columns) == list(X_test.columns), "Train and test feature columns differ"

Features:  Index(['VehPower', 'DrivAge', 'BonusMalus', 'Density', 'VehAge_0-6',
       'VehAge_13+', 'VehAge_6-13', 'VehBrand_B1', 'VehBrand_B10',
       'VehBrand_B11', 'VehBrand_B12', 'VehBrand_B13', 'VehBrand_B14',
       'VehBrand_B2', 'VehBrand_B3', 'VehBrand_B4', 'VehBrand_B5',
       'VehBrand_B6', 'VehGas_Diesel', 'VehGas_Regular', 'Region_R11',
       'Region_R21', 'Region_R22', 'Region_R23', 'Region_R24', 'Region_R25',
       'Region_R26', 'Region_R31', 'Region_R41', 'Region_R42', 'Region_R43',
       'Region_R52', 'Region_R53', 'Region_R54', 'Region_R72', 'Region_R73',
       'Region_R74', 'Region_R82', 'Region_R83', 'Region_R91', 'Region_R93',
       'Region_R94'],
      dtype='object')
Number of Features:  42


In [13]:
# Fit the Poisson GLM
# The target is the claim frequency; the exposure enters as per-sample weight.

# NOTE: We choose alpha=0 (without regularization)
poisson_reg = linear_model.PoissonRegressor(alpha=0).fit(
    X_train, y_train, sample_weight=exposure_train
)

# Predicted claim frequencies for both splits
y_train_pred, y_test_pred = (
    poisson_reg.predict(features) for features in (X_train, X_test)
)

In [14]:
# Print MAE, MSE and loss on train and test data sets
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_poisson_deviance

def calculate_loss(y_true, y_pred, exposure, print_results=True):
    """Calculates MAE, MSE and the exposure-weighted Poisson deviance loss.

    Parameters
    ----------
    y_true : array-like
        Observed target values (here: claim frequencies).
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    exposure : array-like
        Per-sample weights used for the weighted metrics, same length as ``y_true``.
    print_results : bool, default=True
        If True, print all metrics, including the Poisson deviance computed by
        ``sklearn.metrics.mean_poisson_deviance`` as a cross-check.

    Returns
    -------
    tuple
        ``(mae, mae_weighted, mse, mse_weighted, exp_weighted_poisson_loss)`` where the
        last entry is the manually computed exposure-weighted mean Poisson deviance
        (0 if the total exposure is not positive).
    """
    # Mean Absolute Error (MAE) and Mean Squared Error (MSE)
    mae = mean_absolute_error(y_true, y_pred)
    mae_weighted = mean_absolute_error(y_true, y_pred, sample_weight=exposure)
    mse = mean_squared_error(y_true, y_pred)
    mse_weighted = mean_squared_error(y_true, y_pred, sample_weight=exposure)

    # Calculate Poisson deviance loss (manually, and using sklearn built-in function)
    # Per-sample deviance: 2 * (y_pred - y_true - y_true*log(y_pred) + y_true*log(y_true)),
    # with the convention 0 * log(0) = 0.
    y_true_arr = np.asarray(y_true, dtype=float)
    y_pred_arr = np.asarray(y_pred, dtype=float)
    exposure_arr = np.asarray(exposure, dtype=float)

    # Evaluate the logarithms only where their argument is positive. Computing
    # log(0) first and masking afterwards (np.where) would emit RuntimeWarnings,
    # because both branches of np.where are evaluated eagerly.
    true_positive = y_true_arr > 0
    y_true_log = np.zeros_like(y_true_arr)
    y_true_log[true_positive] = y_true_arr[true_positive] * np.log(y_true_arr[true_positive])

    pred_positive = y_pred_arr > 0
    y_pred_log = np.zeros_like(y_pred_arr)
    y_pred_log[pred_positive] = y_true_arr[pred_positive] * np.log(y_pred_arr[pred_positive])

    poisson_deviance = 2 * (y_pred_arr - y_true_arr - y_pred_log + y_true_log)
    total_exposure = np.sum(exposure_arr)
    exp_weighted_poisson_loss = (
        np.sum(exposure_arr * poisson_deviance) / total_exposure if total_exposure > 0 else 0
    )

    exp_weighted_poisson_loss_sklearn = mean_poisson_deviance(y_true, y_pred, sample_weight=exposure)

    if print_results:
        print("---------------------------------")
        print(f"MAE: {mae:.4f}")
        print(f"MAE (Weighted): {mae_weighted:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"MSE (Weighted): {mse_weighted:.4f}")
        print(f"Exposure-Weighted Poisson Deviance Loss: {exp_weighted_poisson_loss:.4f}")
        print(f"Exposure-Weighted Poisson Deviance Loss (sklearn): {exp_weighted_poisson_loss_sklearn:.4f}")
        print("---------------------------------")
    return mae, mae_weighted, mse, mse_weighted, exp_weighted_poisson_loss


# Report the metrics of the fitted GLM on both splits (in-sample first, then out-of-sample).
# `calculate_loss` prints the results itself; its return value is not needed here.
evaluation_splits = (
    ("In-sample Loss:", y_train, y_train_pred, exposure_train),
    ("\nOut-of-sample Loss:", y_test, y_test_pred, exposure_test),
)
for heading, y_split, y_split_pred, exposure_split in evaluation_splits:
    print(heading)
    _ = calculate_loss(y_split, y_split_pred, exposure_split)

In-sample Loss:
---------------------------------
MAE: 0.1876
MAE (Weighted): 0.1377
MSE: 3.7329
MSE (Weighted): 0.2347
Exposure-Weighted Poisson Deviance Loss: 0.4563
Exposure-Weighted Poisson Deviance Loss (sklearn): 0.4563
---------------------------------

Out-of-sample Loss:
---------------------------------
MAE: 0.2010
MAE (Weighted): 0.1387
MSE: 6.5031
MSE (Weighted): 0.2597
Exposure-Weighted Poisson Deviance Loss: 0.4627
Exposure-Weighted Poisson Deviance Loss (sklearn): 0.4627
---------------------------------


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


**COMMENT:** The exposure-weighted Poisson deviance increases only slightly from the training set ($0.4563$) to the test set ($0.4627$), so we can conclude that the model generalizes well.

# Exercise 2 (Poisson FNN)

### Question 2.a

In [None]:
# Neural network implementation

### Question 2.b

In [None]:
# Train Poisson feedforward neural network model

# Print MAE, MSE and loss on train and test data sets

# Make sure your model outperforms the Poisson GLM model of Question 1.b.

# Exercise 3 (Tree-based methods)

### Question 3.a

In [None]:
# Implement a regression tree

# Cross-validation

# Print MAE, MSE and loss on train and test data sets

### Question 3.b

In [None]:
# Implement a random forest model

# Cross-validation

# Print MAE, MSE and loss on train and test data sets

### Question 3.c

In [None]:
# Implement gradient boosted trees

# Cross-validation

# Print MAE, MSE and loss on train and test data sets