# Coding Challenge Huk-Coburg

In [None]:
import mlflow
import sklearn
import pandas as pd

# Enable MLflow's scikit-learn autologging before any estimator in this notebook is fitted.
mlflow.sklearn.autolog()
# Send all tracking data to the MLflow server at this address.
# NOTE(review): assumes a tracking server is already listening on 127.0.0.1:5000 — confirm how it is started.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Runs started below are recorded under this experiment.
mlflow.set_experiment("ClaimsClassification")

## Data Preparation - Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the data set from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it comes from a trusted source.
df = pd.read_pickle("data/df_full_2.pkl")

In [None]:
# ClaimNorm, ClaimAmount and Exposure are removed so they are not used as features.
# NOTE(review): presumably these columns depend on the claim outcome (ClaimNb, the
# target extracted below) and would leak it into the model — confirm.
df = df.drop(["ClaimNorm", "ClaimAmount", "Exposure"], axis=1)

In [None]:
# Features: every column except the raw claim count.
data = df.drop(columns="ClaimNb")
# Binary target as a NumPy array: 1 when the claim count is truthy (non-zero), else 0.
labels = df["ClaimNb"].apply(lambda claim_count: 1 if claim_count else 0).to_numpy()

### Train Test Split

In [None]:
# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    random_state=0,
)

#### Is the split unbiased (similar ratio of no-claims)?

In [None]:
# Labels are 1 for "has a claim" and 0 for "no claim", so the no-claims ratio
# is the share of zeros (the previous `> 0` comparison reported the claims ratio).
print("Ratio no-claims in Test", (labels_test == 0).mean())

In [None]:
# Labels are 1 for "has a claim" and 0 for "no claim", so the no-claims ratio
# is the share of zeros (the previous `> 0` comparison reported the claims ratio).
print("Ratio no-claims in Train", (labels_train == 0).mean())

## Metrics
A standard metric for a binary classification task is the F1 score, which is used to compare the models below.

In [None]:
from sklearn.metrics import f1_score

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline run: a classifier that always predicts class 1, as a reference point
# for the models trained further down.
with mlflow.start_run(run_name="Baseline"):
    dummy_model = DummyClassifier(strategy="constant", constant=1)
    dummy_model.fit(data_train, labels_train)
    # NOTE(review): the results of score() and f1_score() are neither stored nor
    # printed; presumably the autologging enabled at the top of the notebook
    # records them as post-training metrics of this run — confirm.
    dummy_model.score(data_test, labels_test)
    labels_pred = dummy_model.predict(data_test)
    f1_score(labels_test, labels_pred)  

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings, tracked as its own MLflow run.
with mlflow.start_run(run_name="LogistRegression"):
    logistic_model = LogisticRegression()
    logistic_model.fit(data_train, labels_train)
    # NOTE(review): the results of score() and f1_score() are neither stored nor
    # printed; presumably the autologging enabled at the top of the notebook
    # records them as post-training metrics of this run — confirm.
    logistic_model.score(data_test, labels_test)
    labels_pred = logistic_model.predict(data_test)
    f1_score(labels_test, labels_pred)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with balanced class weights, tracked as its own MLflow run.
with mlflow.start_run(run_name="RandomForest"):
    # Seed the forest so the tracked run is reproducible, matching the fixed
    # seed already used for the train/test split.
    model = RandomForestClassifier(class_weight="balanced", random_state=0)
    model.fit(data_train, labels_train)
    # NOTE(review): the results of score() and f1_score() are neither stored nor
    # printed; presumably the autologging enabled at the top of the notebook
    # records them as post-training metrics of this run — confirm.
    model.score(data_test, labels_test)
    labels_pred = model.predict(data_test)
    f1_score(labels_test, labels_pred)

## Hyperparameter Tuning - Ridge Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import Ridge, RidgeClassifier
from scipy.stats import uniform

# The labels are binary (0/1), so tune the classifier variant of ridge
# regression instead of the plain regressor, whose default score is R^2 and
# cannot be compared with the F1 values of the other models.
ridge = RidgeClassifier()
# Search space: a few solver tolerances and a regularisation strength alpha
# drawn uniformly from [0, 5].
distributions = dict(tol=[1e-4, 5e-4, 1e-3, 5e-3],
                     alpha=uniform(loc=0, scale=5))

with mlflow.start_run(run_name="HT-LinearModel"):
    # scoring="f1" makes both the cross-validated model selection and a later
    # search.score(...) call report F1, the metric used throughout the notebook.
    reg = RandomizedSearchCV(ridge, distributions, scoring="f1", random_state=0)
    # fit() returns the fitted search object itself, so `search` is `reg`.
    search = reg.fit(data_train, labels_train)

In [None]:
# Score of the tuned search on the held-out test split, computed with the
# search's own scorer; as the last expression of the cell it is shown as output.
search.score(data_test, labels_test)

## Hyperparameter Tuning - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# The labels are binary (0/1), so tune a classifier rather than a regressor,
# whose default score is R^2 and cannot be compared with the F1 values of the
# other models. Balanced class weights mirror the untuned random forest above;
# the seed makes the search reproducible.
rf = RandomForestClassifier(class_weight="balanced", random_state=0)
# Search space: number of trees and maximum tree depth.
distributions = dict(n_estimators=[10,50,100],
                     max_depth=[2,3,4,5])

with mlflow.start_run(run_name="HT-RandomForest"):
    # scoring="f1" makes both the cross-validated model selection and
    # search.score(...) report F1, the metric used throughout the notebook.
    reg = RandomizedSearchCV(rf, distributions, scoring="f1", random_state=0)
    # fit() returns the fitted search object itself, so `search` is `reg`.
    search = reg.fit(data_train, labels_train)
    # NOTE(review): the result is not stored or printed; presumably MLflow
    # autologging records it as a post-training metric of this run — confirm.
    search.score(data_test, labels_test)