# Coding Challenge Huk-Coburg

Experiment to predict, from the features of the data, whether a claim will be raised.


#### Metric
As the metric I chose F1: it is a standard metric for classification tasks with imbalanced data because it combines precision and recall. Additionally, the confusion matrix offers an informative picture of the nature of the errors.

In [None]:
import mlflow
import sklearn
import pandas as pd

# Configure MLflow once for the whole notebook: switch on scikit-learn
# autologging, then point the client at the tracking server on localhost
# and select the experiment under which the runs below are recorded.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "ClaimsClassification"

mlflow.sklearn.autolog()
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

## Data Preparation - Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the full data set from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so this file must
# come from a trusted source.
FULL_DATA_PATH = "data/df_full.pkl"
df = pd.read_pickle(FULL_DATA_PATH)

In [None]:
# ClaimNorm, ClaimAmount and Exposure are not independent of the target
# ClaimNb, so they are excluded from the feature set.
DEPENDENT_COLUMNS = ["ClaimNorm", "ClaimAmount", "Exposure"]
df = df.drop(columns=DEPENDENT_COLUMNS)

I removed Exposure, since a longer exposure increases the probability of a claim (see correlation). A deeper analysis should correct for this fact, e.g. by training with data points with equal exposure.

In [None]:
# Transform to a binary task: the label is 1 for any non-zero ClaimNb and
# 0 otherwise; the remaining columns form the feature table.
def _has_claim(claim_count):
    """Return 1 for a truthy (non-zero) claim count, otherwise 0."""
    return int(bool(claim_count))


labels = df["ClaimNb"].apply(_has_claim).to_numpy()
data = df.drop(columns="ClaimNb")

### Train Test Split

In [None]:
# Hold out 10% of the rows as the test set. The classes are imbalanced, so
# the split is stratified on the labels to give both partitions the same
# claim ratio; the fixed seed keeps the split reproducible.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    random_state=0,
    stratify=labels,
)

#### Is the split unbiased (similar ratio of no-claims)?

In [None]:
# Share of test rows without a claim (label 0). The message says
# "no-claims", so count the zeros rather than the positives.
print("Ratio no-claims in Test", (labels_test == 0).mean())

In [None]:
# Share of training rows without a claim (label 0). The message says
# "no-claims", so count the zeros rather than the positives.
print("Ratio no-claims in Train", (labels_train == 0).mean())

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

Here the baseline is a dummy model classifying everything as a claim.

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline run: a constant classifier that predicts "claim" (1) for every
# row, giving a reference point for the real models.
with mlflow.start_run(run_name="Baseline"):
    dummy_model = DummyClassifier(strategy="constant", constant=1)
    dummy_model.fit(data_train, labels_train)
    # The results of score() and f1_score() are deliberately not stored;
    # presumably MLflow's sklearn autologging (mlflow.sklearn.autolog())
    # records them on the active run — confirm.
    dummy_model.score(data_test, labels_test)
    labels_pred = dummy_model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Log the confusion-matrix cells like the other runs in this notebook so
    # the runs can be compared. labels=[0, 1] pins the 2x2 layout even
    # though this model only ever predicts class 1.
    tn, fp, fn, tp = confusion_matrix(
        labels_test, labels_pred, labels=[0, 1]
    ).ravel()
    mlflow.log_metrics({"TN": tn, "FP": fp, "FN": fn, "TP": tp})

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Plain logistic regression with default settings on the full, imbalanced
# training set.
with mlflow.start_run(run_name="LogistRegression"):
    logistic_model = LogisticRegression()
    logistic_model.fit(data_train, labels_train)
    # The results of score() and f1_score() are deliberately not stored;
    # presumably MLflow's sklearn autologging (mlflow.sklearn.autolog())
    # records them on the active run — confirm.
    logistic_model.score(data_test, labels_test)
    labels_pred = logistic_model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Log the confusion-matrix cells like the other runs in this notebook so
    # the runs can be compared. labels=[0, 1] pins the 2x2 layout even if
    # the model predicts only one class.
    tn, fp, fn, tp = confusion_matrix(
        labels_test, labels_pred, labels=[0, 1]
    ).ravel()
    mlflow.log_metrics({"TN": tn, "FP": fp, "FN": fn, "TP": tp})

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest on the full training set, with the claim class (1) weighted
# 100 times heavier than the no-claim class (0).
with mlflow.start_run(run_name="RandomForest-weights100"):
    model = RandomForestClassifier(class_weight={0: 1, 1: 100})
    model.fit(data_train, labels_train)
    # The return values of score() and f1_score() are not stored here.
    model.score(data_test, labels_test)
    labels_pred = model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
    (tn, fp), (fn, tp) = confusion_matrix(labels_test, labels_pred)
    mlflow.log_metrics(dict(TN=tn, FP=fp, FN=fn, TP=tp))

## Class Resampling

In [None]:
from sklearn.utils import resample

In [None]:
# Build a separate training frame that carries the target as a "label"
# column. assign() returns a new DataFrame, so data_train itself stays free
# of the target; plain assignment would only alias it, and the label would
# leak into the features if a training cell were re-run.
data_train_full = data_train.assign(label=labels_train)

In [None]:
# Balance the training set by down-sampling the no-claim rows to the number
# of claim rows (assumes negatives outnumber positives; sampling without
# replacement fails otherwise).
data_neg = data_train_full[data_train_full["label"] == 0]
data_pos = data_train_full[data_train_full["label"] == 1]
n_pos = len(data_pos)
# A fixed seed makes the drawn subset, and every run trained on it,
# reproducible, in line with the seeded train/test split.
data_neg_resample = resample(
    data_neg, n_samples=n_pos, replace=False, random_state=0
)
data_resample = pd.concat([data_neg_resample, data_pos])
data_train_resample = data_resample.drop("label", axis=1)
label_train_resample = data_resample["label"].to_numpy()
# Cell output: share of positives after resampling.
label_train_resample.mean()

In [None]:
# Random forest trained on the down-sampled training set, still weighting
# the claim class (1) 100 times heavier than the no-claim class (0).
with mlflow.start_run(run_name="RandomForest-downsample"):
    model = RandomForestClassifier(class_weight={0: 1, 1: 100})
    model.fit(data_train_resample, label_train_resample)
    # The return values of score() and f1_score() are not stored here.
    model.score(data_test, labels_test)
    labels_pred = model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
    (tn, fp), (fn, tp) = confusion_matrix(labels_test, labels_pred)
    mlflow.log_metrics(dict(TN=tn, FP=fp, FN=fn, TP=tp))

In [None]:
# Default logistic regression trained on the down-sampled training set and
# evaluated on the untouched test split.
with mlflow.start_run(run_name="LogistRegression-downsample"):
    logistic_model = LogisticRegression()
    logistic_model.fit(data_train_resample, label_train_resample)
    # The return values of score() and f1_score() are not stored here.
    logistic_model.score(data_test, labels_test)
    labels_pred = logistic_model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
    (tn, fp), (fn, tp) = confusion_matrix(labels_test, labels_pred)
    mlflow.log_metrics(dict(TN=tn, FP=fp, FN=fn, TP=tp))

In [None]:
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

# Standardize the down-sampled training features; the same fitted scaler is
# applied to the test features further below.
scaler = StandardScaler()
data_train_resample_scale = scaler.fit_transform(data_train_resample)

# Search space for the logistic regression: discrete choices for tol and
# max_iter, and C drawn from uniform(loc=0.1, scale=2).
model = LogisticRegression()
distributions = {
    "tol": [1e-4, 5e-4, 1e-3, 5e-3],
    "C": uniform(loc=0.1, scale=2),
    "max_iter": [50, 100, 150, 200],
}

with mlflow.start_run(run_name="HT-LogistRegression-scaler-downsample"):
    randsearch = RandomizedSearchCV(model, distributions, random_state=0)
    search = randsearch.fit(data_train_resample_scale, label_train_resample)
    data_test_scale = scaler.transform(data_test)
    labels_pred = search.predict(data_test_scale)
    # The return value of f1_score() is not stored here.
    f1_score(labels_test, labels_pred)
    # Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
    (tn, fp), (fn, tp) = confusion_matrix(labels_test, labels_pred)
    mlflow.log_metrics(dict(TN=tn, FP=fp, FN=fn, TP=tp))


In [None]:
# Random forest on the down-sampled training set without any class weights.
with mlflow.start_run(run_name="RandomForest-downsample-equal"):
    model = RandomForestClassifier()
    model.fit(data_train_resample, label_train_resample)
    # The return values of score() and f1_score() are not stored here.
    model.score(data_test, labels_test)
    labels_pred = model.predict(data_test)
    f1_score(labels_test, labels_pred)
    # Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
    (tn, fp), (fn, tp) = confusion_matrix(labels_test, labels_pred)
    mlflow.log_metrics(dict(TN=tn, FP=fp, FN=fn, TP=tp))

## Feature Analysis

Which features are most relevant for the prediction of a likely claim?

In [None]:
# Pair every feature name with its coefficient in the best logistic model
# found by the search (which was fitted on standardized features).
feature_names = data.columns
coefficients = search.best_estimator_.coef_.ravel()
list(zip(feature_names, coefficients))

Check typical probability predictions

In [None]:
# Class probabilities that the best estimator from the search assigns to
# the scaled test rows (shown as the cell output).
best_logistic_model = search.best_estimator_
best_logistic_model.predict_proba(data_test_scale)

In [None]:
# Confusion matrix of the tuned logistic regression on the test set.
# Predictions are recomputed from `search` instead of being read from the
# shared `labels_pred` variable, which holds the output of whichever run
# cell was executed last (not necessarily the model analysed here).
confusion_matrix(labels_test, search.predict(data_test_scale))