In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Restrict minor warnings
import warnings
warnings.filterwarnings('ignore')

# to display all outputs of one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = 100

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer as CTT
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

from numpy.random import choice

In [2]:
fn = "/ssd003/projects/pets/datasets/caravan-insurance-challenge.csv"
df = pd.read_csv(fn)

In [3]:
df.shape

(9822, 87)

In [4]:
train = df[df['ORIGIN']=='train']
val = df[df['ORIGIN']=='test']

_ = train.pop('ORIGIN')
_ = val.pop('ORIGIN')

X_train = train
X_val = val
y_train = train.pop('CARAVAN')
y_val = val.pop('CARAVAN')

In [5]:
X_train.shape, y_train.shape

((5822, 85), (5822,))

In [6]:
X_val.shape, y_val.shape

((4000, 85), (4000,))

In [None]:
df.columns[df.isna().sum()>0]

In [7]:
pd.crosstab(df.ORIGIN,df.CARAVAN)

CARAVAN,0,1
ORIGIN,Unnamed: 1_level_1,Unnamed: 2_level_1
test,3762,238
train,5474,348


In [8]:
pd.DataFrame(df.nunique()).T

Unnamed: 0,ORIGIN,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,MINK123M,MINKGEM,MKOOPKLA,PWAPART,PWABEDR,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,2,40,9,6,6,10,10,10,6,10,10,8,10,10,10,10,10,10,10,10,6,10,10,10,10,10,10,10,10,10,10,10,10,9,10,10,10,10,10,10,10,9,10,8,4,7,5,7,4,6,5,6,6,6,6,10,7,3,5,9,4,7,2,7,5,3,3,2,9,6,5,5,4,7,6,4,7,2,2,3,8,2,3,5,3,3,2


In [22]:
cat_col = ['MOSTYPE','MOSHOOFD']
num_cols = list(X_train.columns.values)
len(num_cols)

85

In [13]:
X_train.columns

Index(['MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD', 'MGODRK',
       'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV', 'MFALLEEN',
       'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG', 'MBERHOOG',
       'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA',
       'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT2',
       'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045', 'MINK4575',
       'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWABEDR',
       'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT', 'PAANHANG',
       'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PGEZONG',
       'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS', 'PINBOED',
       'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT', 'ABESAUT',
       'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT', 'ABROM',
       'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND', 'AZEILPL',
       'APLEZIER', 'AFIETS',

In [76]:
pipe = Pipeline([
    ('ct', CTT([
        ('ss', MinMaxScaler(), num_cols),
        ('ohe', OneHotEncoder(), cat_col)
    ], remainder='passthrough')),
    ('e', MLPClassifier(hidden_layer_sizes=(5, 40), random_state=0))
])

In [77]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ss', MinMaxScaler(),
                                                  ['MOSTYPE', 'MAANTHUI',
                                                   'MGEMOMV', 'MGEMLEEF',
                                                   'MOSHOOFD', 'MGODRK',
                                                   'MGODPR', 'MGODOV', 'MGODGE',
                                                   'MRELGE', 'MRELSA', 'MRELOV',
                                                   'MFALLEEN', 'MFGEKIND',
                                                   'MFWEKIND', 'MOPLHOOG',
                                                   'MOPLMIDD', 'MOPLLAAG',
                                                   'MBERHOOG', 'MBERZELF',
                                                   'MBERBOER', 'MBERMIDD',
                                                   'MBERARBG', 'MBERARBO',
            

In [78]:
f1_score(y_val, pipe.predict(X_val))

0.12779552715654954

In [84]:
y_val.value_counts(normalize=True).values

array([0.9405, 0.0595])

In [90]:
random_draw = choice([0, 1], len(y_val),
              p=y_val.value_counts(normalize=True).values)

In [91]:
f1_score(y_val, random_draw)

0.07407407407407408