In [1]:
cd /Users/giorgioclauser/Projects/axa_challenge/src

/Users/giorgioclauser/Projects/axa_challenge/src


# Covariate shift based weights
Reference: https://github.com/erlendd/covariate-shift-adaption/blob/master/Supervised%20classification%20by%20covariate%20shift%20adaption.ipynb

In [34]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix  # da implementare
from datetime import datetime
import pickle

from models import model_train
from utils import (
    reduce_mem_usage,
    sub_template_creator,
    prepare_train_test_before_scoring,
    fetch_data,
    subset_data,
    submission_generator,
)

In [3]:
cd /Users/giorgioclauser/Projects/axa_challenge/

/Users/giorgioclauser/Projects/axa_challenge


## Put train and test in the same dataframe

In [13]:
gc.collect()

# Importing data
training = fetch_data("train")
test = fetch_data("validation")

training = reduce_mem_usage(training)
test = reduce_mem_usage(test)

training["is_test"] = 0
test["is_test"] = 1

# Eseguo subset del dataset di train
full = pd.concat([training.drop(columns=["target_class"]),test], sort=True)

Memory usage of dataframe is 820.12 MB --> 657.86 MB (Decreased by 19.8%)
Memory usage of dataframe is 81.56 MB --> 64.97 MB (Decreased by 20.3%)


In [11]:
print(full.shape)
print(str(training.shape[0]+test.shape[0]))

(1016745, 391)
1016745


## Use lasso regression to select top ~50 features

In [24]:
from sklearn.linear_model import LassoCV
reg = LassoCV(cv=5, random_state=0, alphas=[.00007], max_iter=10000, tol=0.001).fit(full.drop(columns=["claim_id", "is_test"]), full.is_test)

  tol, rng, random, positive)


In [25]:
df_FI = pd.DataFrame({"col_name": full.drop(columns=["claim_id", "is_test"]).columns, "importanza": reg.coef_})
print(df_FI.loc[df_FI.importanza != 0].shape)
df_FI.loc[df_FI.importanza != 0]

(51, 2)


Unnamed: 0,col_name,importanza
12,business_rule_20,0.0001360047
38,business_rule_7,0.005566045
41,business_type__commercial,0.004755894
45,cid_vehicles_number,0.0004060151
46,claim_amount_category,0.002420953
75,claim_type_desc__md_rca_cid_misto,0.0002375782
81,claim_type_desc__pa_ard_eventi_speciali,-7.030074e-05
93,client_responsibility,1.267865e-05
103,coverage__responsabilita_civile_auto,0.006047096
104,coverage_excess_amount__sum,-7.506861e-07


## Train classifier to get weights as probabilities to belong in test set
I use random forest badly tuned to avoid overfitting

In [82]:
from config import lasso_ftrs_w

X = full.sample(len(full)).copy()

X_train = X[lasso_ftrs_w]
y_train = X["is_test"]

n_fold = 20  

folds = KFold(n_fold)

In [83]:
from sklearn.ensemble import RandomForestClassifier
start_time = datetime.now()
mdl_list = []
ROC_Avg = 0
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    now = datetime.now()

    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfclf = RandomForestClassifier(max_features=2, max_depth=1, n_estimators=10, n_jobs=-1, max_samples=.3)
    rfclf.fit(X_train_, y_train_)
    pred_mdl = rfclf

    val = pred_mdl.predict_proba(X_valid)[:, 1]
    print("finish pred")
    del X_valid
    ROC_auc = roc_auc_score(y_valid, val)
    print("ROC accuracy: {}".format(ROC_auc))
    ROC_Avg = ROC_Avg + ROC_auc
    del val, y_valid
    model_time = datetime.now() - now
    total_exp_time = model_time * n_fold
    current_time = datetime.now() - start_time
    print(
        "The current model took in total",
        model_time,
        "\n Still missing, this time:",
        str(total_exp_time - current_time),
    )
    gc.collect()
    mdl_list.append(pred_mdl)
    del rfclf, pred_mdl

# Print average ROC
print("\nAverage ROC is: ", ROC_Avg / n_fold)

i = 0
for mdl in mdl_list:
    with open(
        f"./src/weights_mdls/mdl_{str(i)}.pkl", "wb"
    ) as pickle_file:
        pickle.dump(mdl, pickle_file)
    i = i + 1

0
finish pred
ROC accuracy: 0.9208568937421435
The current model took in total 0:00:01.171055 
 Still missing, this time: 0:00:22.244134
1
finish pred
ROC accuracy: 0.9989715886436175
The current model took in total 0:00:01.185872 
 Still missing, this time: 0:00:21.299119
2
finish pred
ROC accuracy: 0.976573945570387
The current model took in total 0:00:01.286462 
 Still missing, this time: 0:00:21.968517
3
finish pred
ROC accuracy: 0.9853242730782
The current model took in total 0:00:01.199371 
 Still missing, this time: 0:00:18.967443
4
finish pred
ROC accuracy: 0.9375578318653305
The current model took in total 0:00:01.169088 
 Still missing, this time: 0:00:17.137530
5
finish pred
ROC accuracy: 0.9947438459962084
The current model took in total 0:00:01.303666 
 Still missing, this time: 0:00:18.469611
6
finish pred
ROC accuracy: 0.9542186686775023
The current model took in total 0:00:01.198963 
 Still missing, this time: 0:00:15.112240
7
finish pred
ROC accuracy: 0.706069608990902

## Test lightgbm with weights

In [86]:
import lightgbm as lgb
lgbm_X = lgb.Dataset(data = training.drop(columns=["target_class", "claim_id"]), label = training.target_class)

In [91]:
lgbm_prm = {'num_leaves': 512,  # was 512 - default 31
            'max_depth': -1,  # default -1, was 9
            'learning_rate': 0.1,  # default 0.1
            'feature_fraction': 0.4,  # default 1 was 0.4,
            'bagging_fraction': 0.4,  # default 1 was 0.4, # subsample by row
            'metric': "auc",  # binary_logloss auc
            'boosting_type': "gbdt",  # goss # dart --> speed: goss>gbdt>dart
            'lambda_l1': 0.4,  # default 0 - 0.4
            'lambda_l2': 0.6,  # default 0 - 0.6
            'scale_pos_weight': 1,  # defualt 1
           }

In [92]:
lgbclf = lgb.train(lgbm_prm,
                   lgbm_X,
                   num_boost_round = 512)

In [95]:
asd = lgbclf.predict(training.drop(columns=["target_class", "claim_id"]))

## Get weights from models

In [32]:
def get_covariate_weights(X, ftrs):
    """Returns probability for each observation to belong to training set."""
    
    df = X[ftrs].copy()
    
    # Pre allocate vector with predictions
    df["pred_weights"] = 0
    
    # For each model, get the prediction
    from os import listdir
    from os.path import isfile, join
    models = [f for f in listdir(f"./src/weights_mdls/") if isfile(join(f"./src/weights_mdls/", f))]
    for mdl in models:
        # Get the model
        with open(f"./src/weights_mdls/{mdl}", "rb") as pickle_file:
            this_mdl = pickle.load(pickle_file)
            # Get the prediction
        prd = this_mdl.predict_proba(df[ftrs])[:, 1]
        df["pred_weights"] = prd + df["pred_weights"]
    
    # Normalize the weights
    df["pred_weights"] = df["pred_weights"]/len(models)
    weights = (1./df["pred_weights"]) - 1. 
    weights /= np.mean(weights)

    return weights

In [35]:
asd = get_covariate_weights(training, lasso_ftrs_w)

In [36]:
asd

0         1.148846
1         1.157233
2         0.905480
3         0.749258
4         0.873749
5         1.200910
6         1.074414
7         0.720926
8         0.744536
9         1.114851
10        1.043253
11        0.958196
12        0.724940
13        0.892283
14        1.111553
15        0.750511
16        1.075371
17        0.759643
18        0.885240
19        0.875059
20        1.088834
21        0.994967
22        0.883189
23        1.018745
24        0.830071
25        1.043123
26        0.765133
27        0.962673
28        1.097451
29        0.731486
            ...   
924654    1.037847
924655    1.131161
924656    1.001228
924657    0.923322
924658    1.142733
924659    0.712479
924660    0.874799
924661    1.147049
924662    0.930915
924663    1.009798
924664    0.716265
924665    0.953533
924666    1.112918
924667    1.069070
924668    0.894840
924669    1.113644
924670    1.141060
924671    1.023513
924672    1.086886
924673    0.909940
924674    1.020622
924675    1.

In [49]:
from sklearn.ensemble import RandomForestClassifier
start_time = datetime.now()
mdl_list = []
ROC_Avg = 0
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    now = datetime.now()

    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    rfclf = RandomForestClassifier(max_features='log2', max_depth=4, n_estimators=100, n_jobs=-1)
    rfclf.fit(X_train_, y_train_)
    pred_mdl = rfclf

    val = pred_mdl.predict_proba(X_valid)[:, 1]
    print("finish pred")
    del X_valid
    ROC_auc = roc_auc_score(y_valid, val)
    print("ROC accuracy: {}".format(ROC_auc))
    ROC_Avg = ROC_Avg + ROC_auc
    del val, y_valid
    model_time = datetime.now() - now
    total_exp_time = model_time * n_fold
    current_time = datetime.now() - start_time
    print(
        "The current model took in total",
        model_time,
        "\n Still missing, this time:",
        str(total_exp_time - current_time),
    )
    gc.collect()
    mdl_list.append(pred_mdl)
    del rfclf, pred_mdl

# Print average ROC
print("\nAverage ROC is: ", ROC_Avg / n_fold)

i = 0
for mdl in mdl_list:
    with open(
        f"./src/weights_mdls/rf_{str(i)}.pkl", "wb"
    ) as pickle_file:
        pickle.dump(mdl, pickle_file)
    i = i + 1

0
finish pred
ROC accuracy: 0.9999591681274306
The current model took in total 0:00:14.087334 
 Still missing, this time: 0:00:56.343105
1
finish pred
ROC accuracy: 0.9999636874685617
The current model took in total 0:00:15.024232 
 Still missing, this time: 0:00:45.940619
2
finish pred
ROC accuracy: 0.9999946023452301
The current model took in total 0:00:13.052963 
 Still missing, this time: 0:00:22.972052
3
finish pred
ROC accuracy: 0.9999504457048228
The current model took in total 0:00:13.372107 
 Still missing, this time: 0:00:11.127311
4
finish pred
ROC accuracy: 0.9999499469855719
The current model took in total 0:00:14.253533 
 Still missing, this time: 0:00:01.221013

Average ROC is:  0.9999635701263234


In [14]:
lasso_ftrs_w = [
    "business_rule_20",
    "business_rule_7",
    "business_type__commercial",
    "cid_vehicles_number",
    "claim_amount_category",
    "claim_type_desc__md_rca_cid_misto",
    "claim_type_desc__pa_ard_eventi_speciali",
    "client_responsibility",
    "coverage__responsabilita_civile_auto",
    "coverage_excess_amount__sum",
    "coverage_insured_amount__sum",
    "diff_days_claim_date_notif_date",
    "diff_days_claim_date_original_start_date",
    "diff_days_claim_date_policy_end_date",
    "diff_days_claim_date_policy_start_date",
    "diff_year_now_fp__date_of_birth",
    "diff_year_now_tp__date_of_birth",
    "dist_claim_fp",
    "dist_claim_tp",
    "dist_fp_tp",
    "driving_licence_type__other",
    "fp__vehicle_type__car",
    "fp__vehicle_type__truck",
    "insured_item_code__none",
    "insured_item_unit_section__fur4a",
    "insured_item_unit_section__other",
    "insured_item_unit_section__rca",
    "insured_item_unit_section__rcap",
    "insured_value2__sum_category",
    "network_feature_20",
    "network_feature_25",
    "network_feature_26",
    "network_feature_33",
    "network_feature_35",
    "network_feature_42",
    "party_type__105",
    "party_type__other",
    "policy_branch__none",
    "policy_broker_code__none",
    "policy_premium_amount_category",
    "policy_status__11",
    "policy_status__none",
    "policy_status__other",
    "policy_transaction_description__none",
    "region_of_claim__lombardia",
    "risk_code__apfu4a",
    "risk_code__pvrca",
    "tarif_type__bonusmalus",
    "tarif_type__none",
    "total_reserved_category",
    "vehicle_is_damaged__sum",
]