In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Silence all warnings: 'ignore' is passed without a category or message
# filter, so it applies to every warning raised from here on.
import warnings
warnings.filterwarnings('ignore')

# Echo every bare expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Allow up to 100 columns in a rendered DataFrame before pandas truncates the view.
pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

from uuid import UUID
from uuid import uuid4

In [64]:
# Location of the raw CSV; the whole file is read into the working frame `df`.
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df = pd.read_csv(filepath_or_buffer=fn)

In [65]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(9822, 87)

In [66]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,ORIGIN,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,train,33,1,3,2,8,0,5,1,3,7,0,2,1,2,6,1,2,7,1,0,1,2,5,2,1,1,2,6,1,1,8,8,0,1,8,1,0,4,5,0,0,4,3,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,train,37,1,2,2,8,1,4,1,4,6,2,2,0,4,5,0,5,4,0,0,0,5,0,4,0,2,3,5,0,2,7,7,1,2,6,3,2,0,5,2,0,5,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,train,37,1,2,2,8,0,4,2,4,3,2,4,4,4,2,0,5,4,0,0,0,7,0,2,0,5,0,4,0,7,2,7,0,2,9,0,4,5,0,0,0,3,4,2,0,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [67]:
# List the column labels of the loaded frame.
df.columns

Index(['ORIGIN', 'MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD',
       'MGODRK', 'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV',
       'MFALLEEN', 'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG',
       'MBERHOOG', 'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO',
       'MSKA', 'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1',
       'MAUT2', 'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045',
       'MINK4575', 'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART',
       'PWABEDR', 'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT',
       'PAANHANG', 'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG',
       'PGEZONG', 'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS',
       'PINBOED', 'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT',
       'ABESAUT', 'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT',
       'ABROM', 'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND',
       'AZEILPL', 'APLEZIER',

In [68]:
# All column labels as a plain Python list.
df_columns = df.columns.values.tolist()
print(df_columns)

# Columns excluded from scaling; everything else is treated as a numeric feature.
non_numeric = ['MOSTYPE','MOSHOOFD','CARAVAN','ORIGIN']
numeric_features = list(filter(lambda name: name not in non_numeric, df_columns))
print(numeric_features)

['ORIGIN', 'MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD', 'MGODRK', 'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV', 'MFALLEEN', 'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG', 'MBERHOOG', 'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA', 'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT2', 'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045', 'MINK4575', 'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWABEDR', 'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT', 'PAANHANG', 'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PGEZONG', 'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS', 'PINBOED', 'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT', 'ABESAUT', 'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT', 'ABROM', 'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND', 'AZEILPL', 'APLEZIER', 'AFIETS', 'AINBOED', 'ABYSTAND', 'CARAVAN']
['MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MGODRK', 'MGODP

In [79]:
# Min-max scale the numeric features.
# The scaler is fitted on the rows marked ORIGIN == 'train' only, so the
# per-column min/max come from the training split and no test-set statistics
# leak into preprocessing. The learned transform is then applied to every row.
# NOTE: test rows whose raw value lies outside the training range will map
# outside [0, 1]; that is expected with a train-only fit.
autoscaler = MinMaxScaler()
autoscaler.fit(df.loc[df['ORIGIN'] == 'train', numeric_features])
df[numeric_features] = autoscaler.transform(df[numeric_features])
df[numeric_features]

Unnamed: 0,MAANTHUI,MGEMOMV,MGEMLEEF,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND
0,0.0,0.4,0.2,0.000000,0.555556,0.2,0.333333,0.777778,0.000000,0.222222,0.111111,0.222222,0.666667,0.111111,0.222222,0.777778,0.111111,0.0,0.111111,0.222222,0.555556,0.222222,0.111111,0.111111,0.222222,0.666667,0.111111,0.111111,0.888889,0.888889,0.000000,0.111111,0.888889,0.111111,0.000000,0.444444,0.555556,0.000000,0.0,0.444444,0.285714,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
1,0.0,0.2,0.2,0.111111,0.444444,0.2,0.444444,0.666667,0.285714,0.222222,0.000000,0.444444,0.555556,0.000000,0.555556,0.444444,0.000000,0.0,0.000000,0.555556,0.000000,0.444444,0.000000,0.222222,0.333333,0.555556,0.000000,0.222222,0.777778,0.777778,0.111111,0.222222,0.666667,0.333333,0.222222,0.000000,0.555556,0.222222,0.0,0.555556,0.428571,0.666667,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
2,0.0,0.2,0.2,0.000000,0.444444,0.4,0.444444,0.333333,0.285714,0.444444,0.444444,0.444444,0.222222,0.000000,0.555556,0.444444,0.000000,0.0,0.000000,0.777778,0.000000,0.222222,0.000000,0.555556,0.000000,0.444444,0.000000,0.777778,0.222222,0.777778,0.000000,0.222222,1.000000,0.000000,0.444444,0.555556,0.000000,0.000000,0.0,0.333333,0.428571,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,0.5,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
3,0.0,0.4,0.4,0.222222,0.333333,0.4,0.444444,0.555556,0.285714,0.222222,0.222222,0.333333,0.444444,0.333333,0.444444,0.222222,0.444444,0.0,0.000000,0.333333,0.111111,0.222222,0.333333,0.222222,0.111111,0.444444,0.000000,0.555556,0.444444,1.000000,0.000000,0.000000,0.777778,0.222222,0.111111,0.555556,0.333333,0.000000,0.0,0.444444,0.428571,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
4,0.0,0.6,0.2,0.111111,0.444444,0.2,0.444444,0.777778,0.142857,0.222222,0.222222,0.444444,0.444444,0.555556,0.444444,0.000000,0.000000,1.0,0.444444,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.444444,0.555556,0.666667,0.222222,0.111111,0.555556,0.444444,0.000000,0.000000,1.000000,0.000000,0.0,0.666667,0.285714,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.750,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9817,0.0,0.2,0.6,0.000000,0.777778,0.4,0.000000,0.555556,0.285714,0.222222,0.222222,0.666667,0.222222,0.000000,0.333333,0.666667,0.555556,0.0,0.000000,0.111111,0.000000,0.444444,0.222222,0.000000,0.222222,0.444444,0.222222,0.444444,0.555556,0.444444,0.444444,0.222222,0.333333,0.666667,0.333333,0.666667,0.000000,0.000000,0.0,0.222222,0.285714,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.000000,0.0,0.5,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
9818,0.0,0.2,0.4,0.111111,0.555556,0.2,0.333333,0.444444,0.285714,0.444444,0.444444,0.444444,0.222222,0.222222,0.444444,0.444444,0.222222,0.0,0.000000,0.333333,0.333333,0.333333,0.111111,0.111111,0.222222,0.555556,0.111111,0.777778,0.222222,0.666667,0.000000,0.333333,0.777778,0.222222,0.333333,0.333333,0.222222,0.222222,0.0,0.444444,0.142857,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.500,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
9819,0.0,0.2,0.4,0.111111,0.555556,0.2,0.333333,0.777778,0.000000,0.222222,0.222222,0.555556,0.333333,0.222222,0.333333,0.444444,0.222222,0.0,0.000000,0.333333,0.444444,0.222222,0.111111,0.111111,0.333333,0.555556,0.000000,0.777778,0.222222,0.666667,0.111111,0.222222,0.666667,0.333333,0.222222,0.555556,0.333333,0.000000,0.0,0.444444,0.285714,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.5,0.0
9820,0.0,0.4,0.4,0.111111,0.444444,0.4,0.333333,0.777778,0.142857,0.222222,0.222222,0.333333,0.444444,0.111111,0.333333,0.555556,0.111111,0.2,0.111111,0.222222,0.333333,0.333333,0.222222,0.222222,0.222222,0.444444,0.111111,0.444444,0.555556,0.666667,0.222222,0.222222,0.777778,0.222222,0.111111,0.444444,0.444444,0.111111,0.0,0.444444,0.285714,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [70]:
# One-hot encode the two categorical code columns; the originals are replaced
# by '<column>_<value>' indicator columns.
categorical_cols = ['MOSTYPE','MOSHOOFD']
# Pin the indicator dtype to uint8: this was the get_dummies default before
# pandas 2.0, while newer versions default to bool, which would write
# True/False instead of 0/1 into the exported CSV files.
df = pd.get_dummies(df, columns=categorical_cols, dtype=np.uint8)


In [71]:
# List the column labels after one-hot encoding.
df.columns

Index(['ORIGIN', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MGODRK', 'MGODPR',
       'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA',
       ...
       'MOSHOOFD_1', 'MOSHOOFD_2', 'MOSHOOFD_3', 'MOSHOOFD_4', 'MOSHOOFD_5',
       'MOSHOOFD_6', 'MOSHOOFD_7', 'MOSHOOFD_8', 'MOSHOOFD_9', 'MOSHOOFD_10'],
      dtype='object', length=135)

In [72]:
# Draw one random (version 4) UUID per row and attach it as a record key.
uuids = np.array([uuid4() for _ in df.index])
df['UUID'] = uuids
# Sanity check: the number of distinct keys should equal the row count.
df.UUID.nunique()

9822

In [73]:
# Count rows for each ORIGIN value against each CARAVAN value.
pd.crosstab(index=df['ORIGIN'], columns=df['CARAVAN'])

CARAVAN,0,1
ORIGIN,Unnamed: 1_level_1,Unnamed: 2_level_1
test,3762,238
train,5474,348


In [85]:
# Demographic view: every column whose label starts with 'M', followed by the
# UUID record key and the ORIGIN marker.
demographic_col = list(filter(lambda name: name.startswith('M'), df.columns))
demographic_col += ['UUID', 'ORIGIN']
len(demographic_col)


demograhic_df = df.loc[:, demographic_col]

# Show the selected view, then persist it without the index column.
display(demograhic_df)
demograhic_df.to_csv("/ssd003/projects/pets/datasets/demograhic_info.csv", index=False)

93

Unnamed: 0,MAANTHUI,MGEMOMV,MGEMLEEF,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,MOSTYPE_1,MOSTYPE_2,MOSTYPE_3,MOSTYPE_4,MOSTYPE_5,MOSTYPE_6,MOSTYPE_7,MOSTYPE_8,MOSTYPE_9,MOSTYPE_10,MOSTYPE_11,MOSTYPE_12,MOSTYPE_13,MOSTYPE_15,MOSTYPE_16,MOSTYPE_17,MOSTYPE_18,MOSTYPE_19,MOSTYPE_20,MOSTYPE_21,MOSTYPE_22,MOSTYPE_23,MOSTYPE_24,MOSTYPE_25,MOSTYPE_26,MOSTYPE_27,MOSTYPE_28,MOSTYPE_29,MOSTYPE_30,MOSTYPE_31,MOSTYPE_32,MOSTYPE_33,MOSTYPE_34,MOSTYPE_35,MOSTYPE_36,MOSTYPE_37,MOSTYPE_38,MOSTYPE_39,MOSTYPE_40,MOSTYPE_41,MOSHOOFD_1,MOSHOOFD_2,MOSHOOFD_3,MOSHOOFD_4,MOSHOOFD_5,MOSHOOFD_6,MOSHOOFD_7,MOSHOOFD_8,MOSHOOFD_9,MOSHOOFD_10,UUID,ORIGIN
0,0.0,0.4,0.2,0.000000,0.555556,0.2,0.333333,0.777778,0.000000,0.222222,0.111111,0.222222,0.666667,0.111111,0.222222,0.777778,0.111111,0.0,0.111111,0.222222,0.555556,0.222222,0.111111,0.111111,0.222222,0.666667,0.111111,0.111111,0.888889,0.888889,0.000000,0.111111,0.888889,0.111111,0.000000,0.444444,0.555556,0.000000,0.0,0.444444,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,cb8db3d9-e0c2-4f2c-a866-d579a1d61ff4,train
1,0.0,0.2,0.2,0.111111,0.444444,0.2,0.444444,0.666667,0.285714,0.222222,0.000000,0.444444,0.555556,0.000000,0.555556,0.444444,0.000000,0.0,0.000000,0.555556,0.000000,0.444444,0.000000,0.222222,0.333333,0.555556,0.000000,0.222222,0.777778,0.777778,0.111111,0.222222,0.666667,0.333333,0.222222,0.000000,0.555556,0.222222,0.0,0.555556,0.428571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,eed33082-7621-473d-b255-c5d35198627a,train
2,0.0,0.2,0.2,0.000000,0.444444,0.4,0.444444,0.333333,0.285714,0.444444,0.444444,0.444444,0.222222,0.000000,0.555556,0.444444,0.000000,0.0,0.000000,0.777778,0.000000,0.222222,0.000000,0.555556,0.000000,0.444444,0.000000,0.777778,0.222222,0.777778,0.000000,0.222222,1.000000,0.000000,0.444444,0.555556,0.000000,0.000000,0.0,0.333333,0.428571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7950c7c4-4003-44d0-88dc-b05e6437afd4,train
3,0.0,0.4,0.4,0.222222,0.333333,0.4,0.444444,0.555556,0.285714,0.222222,0.222222,0.333333,0.444444,0.333333,0.444444,0.222222,0.444444,0.0,0.000000,0.333333,0.111111,0.222222,0.333333,0.222222,0.111111,0.444444,0.000000,0.555556,0.444444,1.000000,0.000000,0.000000,0.777778,0.222222,0.111111,0.555556,0.333333,0.000000,0.0,0.444444,0.428571,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,6b807aa1-88d8-4ab0-ad1b-a8ef051f91e1,train
4,0.0,0.6,0.2,0.111111,0.444444,0.2,0.444444,0.777778,0.142857,0.222222,0.222222,0.444444,0.444444,0.555556,0.444444,0.000000,0.000000,1.0,0.444444,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.444444,0.555556,0.666667,0.222222,0.111111,0.555556,0.444444,0.000000,0.000000,1.000000,0.000000,0.0,0.666667,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,c1100358-d07d-4d70-aad0-49ace40eadda,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9817,0.0,0.2,0.6,0.000000,0.777778,0.4,0.000000,0.555556,0.285714,0.222222,0.222222,0.666667,0.222222,0.000000,0.333333,0.666667,0.555556,0.0,0.000000,0.111111,0.000000,0.444444,0.222222,0.000000,0.222222,0.444444,0.222222,0.444444,0.555556,0.444444,0.444444,0.222222,0.333333,0.666667,0.333333,0.666667,0.000000,0.000000,0.0,0.222222,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,595a0f62-a4d6-4a8c-9aaf-7750b711b184,test
9818,0.0,0.2,0.4,0.111111,0.555556,0.2,0.333333,0.444444,0.285714,0.444444,0.444444,0.444444,0.222222,0.222222,0.444444,0.444444,0.222222,0.0,0.000000,0.333333,0.333333,0.333333,0.111111,0.111111,0.222222,0.555556,0.111111,0.777778,0.222222,0.666667,0.000000,0.333333,0.777778,0.222222,0.333333,0.333333,0.222222,0.222222,0.0,0.444444,0.142857,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,ea5a8164-30e8-424a-a735-3455213d126d,test
9819,0.0,0.2,0.4,0.111111,0.555556,0.2,0.333333,0.777778,0.000000,0.222222,0.222222,0.555556,0.333333,0.222222,0.333333,0.444444,0.222222,0.0,0.000000,0.333333,0.444444,0.222222,0.111111,0.111111,0.333333,0.555556,0.000000,0.777778,0.222222,0.666667,0.111111,0.222222,0.666667,0.333333,0.222222,0.555556,0.333333,0.000000,0.0,0.444444,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,a15974b6-050b-4df2-b5a9-79008012d8a4,test
9820,0.0,0.4,0.4,0.111111,0.444444,0.4,0.333333,0.777778,0.142857,0.222222,0.222222,0.333333,0.444444,0.111111,0.333333,0.555556,0.111111,0.2,0.111111,0.222222,0.333333,0.333333,0.222222,0.222222,0.222222,0.444444,0.111111,0.444444,0.555556,0.666667,0.222222,0.222222,0.777778,0.222222,0.111111,0.444444,0.444444,0.111111,0.0,0.444444,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,6c98d35f-fe7b-4ba7-bebd-f0ac97013ebf,test


In [86]:
# Insurance view: every column whose label starts with 'A' or 'P', followed by
# the UUID record key, the ORIGIN marker and the CARAVAN column.
prdct_insurance_stats_col = [name for name in df.columns if name.startswith(('A', 'P'))]
prdct_insurance_stats_col += ['UUID', 'ORIGIN', 'CARAVAN']
len(prdct_insurance_stats_col)

prdct_insurance_stats_df = df.loc[:, prdct_insurance_stats_col]
# Show the selected view, then persist it without the index column.
display(prdct_insurance_stats_df)
prdct_insurance_stats_df.to_csv("/ssd003/projects/pets/datasets/prdct_insurance_stats_info.csv", index=False)

45

Unnamed: 0,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,UUID,ORIGIN,CARAVAN
0,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,cb8db3d9-e0c2-4f2c-a866-d579a1d61ff4,train,0
1,0.666667,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,eed33082-7621-473d-b255-c5d35198627a,train,0
2,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,0.5,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,7950c7c4-4003-44d0-88dc-b05e6437afd4,train,0
3,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.250,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,6b807aa1-88d8-4ab0-ad1b-a8ef051f91e1,train,0
4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.750,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,c1100358-d07d-4d70-aad0-49ace40eadda,train,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9817,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.000000,0.0,0.5,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,595a0f62-a4d6-4a8c-9aaf-7750b711b184,test,0
9818,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.500,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,ea5a8164-30e8-424a-a735-3455213d126d,test,1
9819,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.5,0.0,a15974b6-050b-4df2-b5a9-79008012d8a4,test,0
9820,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,6c98d35f-fe7b-4ba7-bebd-f0ac97013ebf,test,0


In [87]:
# Read both exported views back from disk.
dmg = "/ssd003/projects/pets/datasets/demograhic_info.csv"
ins = "/ssd003/projects/pets/datasets/prdct_insurance_stats_info.csv"
df_demographic, df_insurance = (pd.read_csv(path) for path in (dmg, ins))

In [88]:
# Preview the first three rows of each re-loaded view. These are two separate
# expression statements, so both tables render only when the shell is set to
# echo every expression in a cell.
df_demographic.head(3)
df_insurance.head(3)

Unnamed: 0,MAANTHUI,MGEMOMV,MGEMLEEF,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,MOSTYPE_1,MOSTYPE_2,MOSTYPE_3,MOSTYPE_4,MOSTYPE_5,MOSTYPE_6,MOSTYPE_7,MOSTYPE_8,MOSTYPE_9,MOSTYPE_10,MOSTYPE_11,MOSTYPE_12,MOSTYPE_13,MOSTYPE_15,MOSTYPE_16,MOSTYPE_17,MOSTYPE_18,MOSTYPE_19,MOSTYPE_20,MOSTYPE_21,MOSTYPE_22,MOSTYPE_23,MOSTYPE_24,MOSTYPE_25,MOSTYPE_26,MOSTYPE_27,MOSTYPE_28,MOSTYPE_29,MOSTYPE_30,MOSTYPE_31,MOSTYPE_32,MOSTYPE_33,MOSTYPE_34,MOSTYPE_35,MOSTYPE_36,MOSTYPE_37,MOSTYPE_38,MOSTYPE_39,MOSTYPE_40,MOSTYPE_41,MOSHOOFD_1,MOSHOOFD_2,MOSHOOFD_3,MOSHOOFD_4,MOSHOOFD_5,MOSHOOFD_6,MOSHOOFD_7,MOSHOOFD_8,MOSHOOFD_9,MOSHOOFD_10,UUID,ORIGIN
0,0.0,0.4,0.2,0.0,0.555556,0.2,0.333333,0.777778,0.0,0.222222,0.111111,0.222222,0.666667,0.111111,0.222222,0.777778,0.111111,0.0,0.111111,0.222222,0.555556,0.222222,0.111111,0.111111,0.222222,0.666667,0.111111,0.111111,0.888889,0.888889,0.0,0.111111,0.888889,0.111111,0.0,0.444444,0.555556,0.0,0.0,0.444444,0.285714,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,cb8db3d9-e0c2-4f2c-a866-d579a1d61ff4,train
1,0.0,0.2,0.2,0.111111,0.444444,0.2,0.444444,0.666667,0.285714,0.222222,0.0,0.444444,0.555556,0.0,0.555556,0.444444,0.0,0.0,0.0,0.555556,0.0,0.444444,0.0,0.222222,0.333333,0.555556,0.0,0.222222,0.777778,0.777778,0.111111,0.222222,0.666667,0.333333,0.222222,0.0,0.555556,0.222222,0.0,0.555556,0.428571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,eed33082-7621-473d-b255-c5d35198627a,train
2,0.0,0.2,0.2,0.0,0.444444,0.4,0.444444,0.333333,0.285714,0.444444,0.444444,0.444444,0.222222,0.0,0.555556,0.444444,0.0,0.0,0.0,0.777778,0.0,0.222222,0.0,0.555556,0.0,0.444444,0.0,0.777778,0.222222,0.777778,0.0,0.222222,1.0,0.0,0.444444,0.555556,0.0,0.0,0.0,0.333333,0.428571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7950c7c4-4003-44d0-88dc-b05e6437afd4,train


Unnamed: 0,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,UUID,ORIGIN,CARAVAN
0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,cb8db3d9-e0c2-4f2c-a866-d579a1d61ff4,train,0
1,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,eed33082-7621-473d-b255-c5d35198627a,train,0
2,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,7950c7c4-4003-44d0-88dc-b05e6437afd4,train,0
