# Coding Challenge Huk-Coburg

Experiment to predict ClaimNorm directly from the features of the data. 

Sklearn offers many algorithms, which are very performant on structured data and quick to implement.

MLflow tracks the scores of the experiments.

#### Metric
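As the metric I choose R2, as it is the default score for regression estimators in sklearn. Its best possible value is 1; it has no lower bound and becomes negative when a model performs worse than always predicting the mean of the labels.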

In [None]:
import mlflow
import sklearn
import pandas as pd

# Turn on MLflow autologging for scikit-learn estimators.
mlflow.sklearn.autolog()
# Point the MLflow client at a tracking server on localhost, port 5000.
# NOTE(review): assumes that server is already running — confirm before executing.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment under which the runs of this notebook are recorded.
mlflow.set_experiment("ClaimNormPrediction")

## Data Preparation - Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the filtered dataset from its pickle file (path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data/df_filter.pkl")

In [None]:
# ClaimNorm is the prediction target. ClaimNb, ClaimAmount and Exposure are
# directly tied to it (presumably ClaimNorm is computed from them — confirm),
# so they are removed to keep them out of the feature set.
df = df.drop(columns=["ClaimNb", "ClaimAmount", "Exposure"])

In [None]:
# Feature frame: every remaining column except the target.
data = df.drop(columns="ClaimNorm")
# Regression target as a NumPy array.
labels = df["ClaimNorm"].to_numpy()

### Train Test Split

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed makes the split
# identical on every execution.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    random_state=0,
)

#### Is the split unbiased (similar ratio of no-claims)?

In [None]:
# Share of test rows WITHOUT a claim. A no-claim row has label 0 (the same
# definition the downsampling step uses); comparing with `> 0` would report
# the share of rows WITH a claim under the "no-claims" heading.
print("Ratio no-claims in Test", sum(labels_test == 0)/len(labels_test))

In [None]:
# Share of training rows WITHOUT a claim. A no-claim row has label 0 (the same
# definition the downsampling step uses); comparing with `> 0` would report
# the share of rows WITH a claim under the "no-claims" heading.
print("Ratio no-claims in Train", sum(labels_train == 0)/len(labels_train))

## Baseline - Linear Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: plain linear regression on the full training split, recorded as
# its own MLflow run.
with mlflow.start_run(run_name="Baseline"):
    # fit() returns the estimator, so construction and training are chained.
    linear_model = LinearRegression().fit(data_train, labels_train)
    # Evaluate on the hold-out set while the run is still active. The value is
    # not displayed by the notebook here; presumably autologging records it on
    # the run — confirm in the MLflow UI.
    linear_model.score(data_test, labels_test)

## Hyperparameter Tuning - Ridge Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import Ridge
from scipy.stats import uniform

# Estimator to tune.
ridge = Ridge()
# Search space: a short list of tolerances and a regularisation strength
# sampled from scipy's uniform distribution with loc=0 and scale=5.
distributions = {
    "tol": [1e-4, 5e-4, 1e-3, 5e-3],
    "alpha": uniform(loc=0, scale=5),
}

# Run the randomized search inside a dedicated MLflow run.
with mlflow.start_run(run_name="HT-LinearModel"):
    reg = RandomizedSearchCV(
        estimator=ridge,
        param_distributions=distributions,
        random_state=0,
    )
    # `search` is kept for the evaluation in the following cell.
    search = reg.fit(data_train, labels_train)

In [None]:
# Score of the fitted Ridge search on the hold-out set; as the last expression
# of the cell, the value is displayed by the notebook.
search.score(data_test, labels_test)

## Hyperparameter Tuning - Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Untuned random forest trained on the original (non-resampled) training
# split. The run is named "RandomForest": the name "RandomForest-resample" is
# used by the later run trained on the downsampled data, and sharing it would
# make the two results indistinguishable in MLflow.
with mlflow.start_run(run_name="RandomForest"):
    # Fixed seed so the forest is reproducible, like the seeded split and
    # searches elsewhere in this notebook.
    model = RandomForestRegressor(random_state=0)
    model.fit(data_train, labels_train)
    # Evaluated inside the run; the value is not displayed by the notebook
    # here — presumably autologging records it (confirm in the MLflow UI).
    model.score(data_test, labels_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest to tune. It is seeded so that, together with the seeded search
# below, the cross-validation results and the selected parameters are
# reproducible; seeding only the search would still leave each fitted forest
# random.
rf = RandomForestRegressor(random_state=0)
# Search space: number of trees and maximum tree depth.
distributions = {
    "n_estimators": [10, 50, 100],
    "max_depth": [2, 3, 4, 5],
}

# Run the randomized search inside a dedicated MLflow run.
with mlflow.start_run(run_name="HT-RandomForest"):
    reg = RandomizedSearchCV(rf, distributions, random_state=0)
    search = reg.fit(data_train, labels_train)
    # Evaluated inside the run; the value is not displayed by the notebook
    # here — presumably autologging records it (confirm in the MLflow UI).
    search.score(data_test, labels_test)

## Downsampling Majority

In [None]:
from sklearn.utils import resample

In [None]:
# Work on a copy. A plain assignment would only alias data_train, so adding
# the "label" column would also add it to data_train and leak the target into
# the features of any model (re)trained on data_train afterwards.
data_train_full = data_train.copy()
data_train_full["label"] = labels_train

Split the training data into rows with and without claims, then rebuild a training set that contains the same number of rows from each category by downsampling the no-claim rows.

In [None]:
# Rows without a claim (label 0) are the class being downsampled; rows with a
# claim (label > 0) are kept in full.
data_neg = data_train_full[data_train_full["label"] == 0]
data_pos = data_train_full[data_train_full["label"] > 0]
n_pos = len(data_pos)
# Draw as many no-claim rows as there are claim rows, without replacement.
# The fixed seed makes the downsampled set identical on every execution.
data_neg_resample = resample(
    data_neg, n_samples=n_pos, replace=False, random_state=0
)
data_resample = pd.concat([data_neg_resample, data_pos])
# Split the rebalanced frame back into features and target.
data_train_resample = data_resample.drop("label", axis=1)
labels_train_resample = data_resample["label"].to_numpy()
# Share of rows WITHOUT a claim (label 0), matching the printed text.
print("Ratio no-claims in Train", sum(labels_train_resample == 0)/len(labels_train_resample))

In [None]:
# Same linear baseline, this time trained on the downsampled training set and
# still evaluated on the untouched hold-out set.
with mlflow.start_run(run_name="Baseline-resample"):
    # fit() returns the estimator, so construction and training are chained.
    linear_model = LinearRegression().fit(
        data_train_resample, labels_train_resample
    )
    # Evaluated inside the run; the value is not displayed by the notebook
    # here — presumably autologging records it (confirm in the MLflow UI).
    linear_model.score(data_test, labels_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Untuned random forest trained on the downsampled training set and evaluated
# on the untouched hold-out set.
with mlflow.start_run(run_name="RandomForest-resample"):
    # Fixed seed so the result is reproducible and differences to other runs
    # are not caused by the forest's own randomness.
    model = RandomForestRegressor(random_state=0)
    model.fit(data_train_resample, labels_train_resample)
    # Evaluated inside the run; the value is not displayed by the notebook
    # here — presumably autologging records it (confirm in the MLflow UI).
    model.score(data_test, labels_test)