In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Restrict minor warnings
import warnings
warnings.filterwarnings('ignore')

# to display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

from uuid import UUID
from uuid import uuid4

In [17]:
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df = pd.read_csv(fn)

In [18]:
df.shape

(9822, 87)

In [19]:
df.head(3)

Unnamed: 0,ORIGIN,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,train,33,1,3,2,8,0,5,1,3,7,0,2,1,2,6,1,2,7,1,0,1,2,5,2,1,1,2,6,1,1,8,8,0,1,8,1,0,4,5,0,0,4,3,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,train,37,1,2,2,8,1,4,1,4,6,2,2,0,4,5,0,5,4,0,0,0,5,0,4,0,2,3,5,0,2,7,7,1,2,6,3,2,0,5,2,0,5,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,train,37,1,2,2,8,0,4,2,4,3,2,4,4,4,2,0,5,4,0,0,0,7,0,2,0,5,0,4,0,7,2,7,0,2,9,0,4,5,0,0,0,3,4,2,0,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [20]:
df.columns

Index(['ORIGIN', 'MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD',
       'MGODRK', 'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV',
       'MFALLEEN', 'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG',
       'MBERHOOG', 'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO',
       'MSKA', 'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1',
       'MAUT2', 'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045',
       'MINK4575', 'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART',
       'PWABEDR', 'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT',
       'PAANHANG', 'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG',
       'PGEZONG', 'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS',
       'PINBOED', 'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT',
       'ABESAUT', 'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT',
       'ABROM', 'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND',
       'AZEILPL', 'APLEZIER',

In [23]:
ss_features = df.columns[43:-1]
ss_features

Index(['MKOOPKLA', 'PWAPART', 'PWABEDR', 'PWALAND', 'PPERSAUT', 'PBESAUT',
       'PMOTSCO', 'PVRAAUT', 'PAANHANG', 'PTRACTOR', 'PWERKT', 'PBROM',
       'PLEVEN', 'PPERSONG', 'PGEZONG', 'PWAOREG', 'PBRAND', 'PZEILPL',
       'PPLEZIER', 'PFIETS', 'PINBOED', 'PBYSTAND', 'AWAPART', 'AWABEDR',
       'AWALAND', 'APERSAUT', 'ABESAUT', 'AMOTSCO', 'AVRAAUT', 'AAANHANG',
       'ATRACTOR', 'AWERKT', 'ABROM', 'ALEVEN', 'APERSONG', 'AGEZONG',
       'AWAOREG', 'ABRAND', 'AZEILPL', 'APLEZIER', 'AFIETS', 'AINBOED',
       'ABYSTAND'],
      dtype='object')

In [24]:
autoscaler = StandardScaler()
df[ss_features] = autoscaler.fit_transform(df[ss_features])

In [25]:
categorical_cols = ['MOSTYPE','MOSHOOFD']
df = pd.get_dummies(df, columns = categorical_cols)


In [26]:
df.columns

Index(['ORIGIN', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MGODRK', 'MGODPR',
       'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA',
       ...
       'MOSHOOFD_1', 'MOSHOOFD_2', 'MOSHOOFD_3', 'MOSHOOFD_4', 'MOSHOOFD_5',
       'MOSHOOFD_6', 'MOSHOOFD_7', 'MOSHOOFD_8', 'MOSHOOFD_9', 'MOSHOOFD_10'],
      dtype='object', length=135)

In [27]:
uuids = np.array([uuid4() for _ in range(len(df))])
df['UUID'] = uuids
df['UUID'].nunique()

9822

In [28]:
pd.crosstab(df.ORIGIN,df.CARAVAN)

CARAVAN,0,1
ORIGIN,Unnamed: 1_level_1,Unnamed: 2_level_1
test,3762,238
train,5474,348


In [47]:
demographic_col = [col for col in df if col.startswith('M')]
len(demographic_col)
demographic_col.append('UUID')
demograhic_df = df[demographic_col]
demograhic_df
demograhoc_df.to_csv("demograhic_info.csv")

91

Unnamed: 0,MAANTHUI,MGEMOMV,MGEMLEEF,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,MOSTYPE_1,MOSTYPE_2,MOSTYPE_3,MOSTYPE_4,MOSTYPE_5,MOSTYPE_6,MOSTYPE_7,MOSTYPE_8,MOSTYPE_9,MOSTYPE_10,MOSTYPE_11,MOSTYPE_12,MOSTYPE_13,MOSTYPE_15,MOSTYPE_16,MOSTYPE_17,MOSTYPE_18,MOSTYPE_19,MOSTYPE_20,MOSTYPE_21,MOSTYPE_22,MOSTYPE_23,MOSTYPE_24,MOSTYPE_25,MOSTYPE_26,MOSTYPE_27,MOSTYPE_28,MOSTYPE_29,MOSTYPE_30,MOSTYPE_31,MOSTYPE_32,MOSTYPE_33,MOSTYPE_34,MOSTYPE_35,MOSTYPE_36,MOSTYPE_37,MOSTYPE_38,MOSTYPE_39,MOSTYPE_40,MOSTYPE_41,MOSHOOFD_1,MOSHOOFD_2,MOSHOOFD_3,MOSHOOFD_4,MOSHOOFD_5,MOSHOOFD_6,MOSHOOFD_7,MOSHOOFD_8,MOSHOOFD_9,MOSHOOFD_10,UUID
0,1,3,2,0,5,1,3,7,0,2,1,2,6,1,2,7,1,0,1,2,5,2,1,1,2,6,1,1,8,8,0,1,8,1,0,4,5,0,0,4,-0.630542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,018274bc-00a7-4e3f-a868-f4729674b3e3
1,1,2,2,1,4,1,4,6,2,2,0,4,5,0,5,4,0,0,0,5,0,4,0,2,3,5,0,2,7,7,1,2,6,3,2,0,5,2,0,5,-0.130244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,58759529-34ba-4a93-921c-628ebce192cc
2,1,2,2,0,4,2,4,3,2,4,4,4,2,0,5,4,0,0,0,7,0,2,0,5,0,4,0,7,2,7,0,2,9,0,4,5,0,0,0,3,-0.130244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,ad1f4130-52cf-4e90-9b07-e10b6fdd624b
3,1,3,3,2,3,2,4,5,2,2,2,3,4,3,4,2,4,0,0,3,1,2,3,2,1,4,0,5,4,9,0,0,7,2,1,5,3,0,0,4,-0.130244,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,480887f0-7020-442f-bd43-2f90de198bac
4,1,4,2,1,4,1,4,7,1,2,2,4,4,5,4,0,0,5,4,0,0,0,9,0,0,0,0,4,5,6,2,1,5,4,0,0,9,0,0,6,-0.630542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,f0cc7127-7014-4a88-b4f7-5d6b0514c34c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9817,1,2,4,0,7,2,0,5,2,2,2,6,2,0,3,6,5,0,0,1,0,4,2,0,2,4,2,4,5,4,4,2,3,6,3,6,0,0,0,2,-0.630542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,5014089d-65a4-41eb-b317-fa9cafa6375e
9818,1,2,3,1,5,1,3,4,2,4,4,4,2,2,4,4,2,0,0,3,3,3,1,1,2,5,1,7,2,6,0,3,7,2,3,3,2,2,0,4,-1.130839,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,c38c79b6-142f-4dc9-90db-7d8f84631a11
9819,1,2,3,1,5,1,3,7,0,2,2,5,3,2,3,4,2,0,0,3,4,2,1,1,3,5,0,7,2,6,1,2,6,3,2,5,3,0,0,4,-0.630542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,ca8c617b-a311-4146-b1b3-4e88d19f0441
9820,1,3,3,1,4,2,3,7,1,2,2,3,4,1,3,5,1,1,1,2,3,3,2,2,2,4,1,4,5,6,2,2,7,2,1,4,4,1,0,4,-0.630542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,b331abdd-ae63-4756-9d91-1c2d58317675


In [48]:
prdct_insurance_stats_col = [col for col in df if (col.startswith('A') or col.startswith('P'))]
len(prdct_insurance_stats_col)
prdct_insurance_stats_col.append('UUID')
prdct_insurance_stats_df = df[prdct_insurance_stats_col]
prdct_insurance_stats_df
prdct_insurance_stats_df.to_csv("prdct_insurance_stats_info.csv")

42

Unnamed: 0,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,UUID
0,-0.799697,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,1.674794,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,-0.813089,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,018274bc-00a7-4e3f-a868-f4729674b3e3
1,1.291246,-0.108971,-0.145162,-1.011924,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,0.080046,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,3.252150,-0.111463,-0.14745,-0.915658,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,58759529-34ba-4a93-921c-628ebce192cc
2,1.291246,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,0.080046,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,1.219530,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,ad1f4130-52cf-4e90-9b07-e10b6fdd624b
3,-0.799697,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,0.080046,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,-0.813089,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,480887f0-7020-442f-bd43-2f90de198bac
4,-0.799697,-0.108971,-0.145162,-1.011924,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,2.206376,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,-0.813089,-0.111463,-0.14745,-0.915658,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,f0cc7127-7014-4a88-b4f7-5d6b0514c34c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9817,1.291246,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,0.611628,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,1.219530,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,5014089d-65a4-41eb-b317-fa9cafa6375e
9818,-0.799697,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,6.367408,-0.060972,-0.087662,-0.062118,1.143211,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,-0.813089,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,2.393733,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,c38c79b6-142f-4dc9-90db-7d8f84631a11
9819,-0.799697,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,0.611628,-0.028551,-0.062539,-0.161278,4.649707,-0.114389,-0.813089,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,0.759020,-0.030284,-0.065271,-0.150483,10.702967,-0.11762,ca8c617b-a311-4146-b1b3-4e88d19f0441
9820,-0.799697,-0.108971,-0.145162,1.041754,-0.096942,-0.192286,-0.037288,-0.0963,-0.154828,-0.053412,-0.265185,-0.222180,-0.060972,-0.087662,-0.062118,-0.983119,-0.028551,-0.062539,-0.161278,-0.078955,-0.114389,-0.813089,-0.111463,-0.14745,0.727608,-0.085418,-0.179848,-0.032747,-0.098095,-0.137819,-0.047226,-0.265744,-0.207644,-0.067843,-0.08947,-0.060041,-1.022791,-0.030284,-0.065271,-0.150483,-0.091215,-0.11762,b331abdd-ae63-4756-9d91-1c2d58317675
