This notebook is a step-by-step breakdown of Joe Eddy's solution to [Kaggle’s Safe Driver Prediction Challenge](https://www.kaggle.com/aquatic/entity-embedding-neural-net/data), written to understand how it is implemented.

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from tensorflow.keras.layers import Embedding, concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras import models

from sklearn.model_selection import StratifiedKFold


## Data Description
In this competition, you will predict the probability that an auto insurance policy holder files a claim.

In the train and test data, features that belong to similar groupings are tagged as such in the feature names (e.g., **ind, reg, car, calc**). In addition, feature names include the postfix `bin` to indicate binary features and `cat` to indicate categorical features. Features without these designations are either continuous or ordinal. Values of -1 indicate that the feature was missing from the observation. The `target` column signifies whether or not a claim was filed for that policy holder.

In [2]:
# Data loading: read the raw train and test CSV files.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Training set: the label is the `target` column; the features are every
# column from position 2 onward.
y_train = df_train.target
X_train = df_train.iloc[:, 2:]

# Test set: the features are every column from position 1 onward.
X_test = df_test.iloc[:, 1:]

In [3]:
# Inspect the feature names left after slicing off the first two columns.
X_train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

To save time for data preprocessing, let's focus on the features not tagged **_calc_**.

In [4]:
# Keep every feature whose name does not start with 'ps_calc_', and apply the
# same column selection to both the train and the test frame.
cols_use = X_train.columns[~X_train.columns.str.startswith('ps_calc_')].tolist()

X_train, X_test = X_train[cols_use], X_test[cols_use]

In [5]:
# Confirm that the 'ps_calc_' features are gone.
X_train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15'],
      dtype='object')

## Data preprocessing - categoricals

Now, let's split the columns into categoricals and others. First, we create a dictionary that maps each categorical column name (suffix `_cat`) to the list of unique values it takes in the training data.

In [6]:
# For every column whose name ends in '_cat', record the distinct values seen
# in the training frame (in order of first appearance).
col_vals_dict = {
    col_name: list(X_train[col_name].unique())
    for col_name in X_train.columns
    if col_name.endswith('_cat')
}
col_vals_dict

{'ps_ind_02_cat': [2, 1, 4, 3, -1],
 'ps_ind_04_cat': [1, 0, -1],
 'ps_ind_05_cat': [0, 1, 4, 3, 6, 5, -1, 2],
 'ps_car_01_cat': [10, 11, 7, 6, 9, 5, 4, 8, 3, 0, 2, 1, -1],
 'ps_car_02_cat': [1, 0, -1],
 'ps_car_03_cat': [-1, 0, 1],
 'ps_car_04_cat': [0, 1, 8, 9, 2, 6, 3, 7, 4, 5],
 'ps_car_05_cat': [1, -1, 0],
 'ps_car_06_cat': [4,
  11,
  14,
  13,
  6,
  15,
  3,
  0,
  1,
  10,
  12,
  9,
  17,
  7,
  8,
  5,
  2,
  16],
 'ps_car_07_cat': [1, -1, 0],
 'ps_car_08_cat': [0, 1],
 'ps_car_09_cat': [0, 2, 3, 1, -1, 4],
 'ps_car_10_cat': [1, 0, 2],
 'ps_car_11_cat': [12,
  19,
  60,
  104,
  82,
  99,
  30,
  68,
  20,
  36,
  101,
  103,
  41,
  59,
  43,
  64,
  29,
  95,
  24,
  5,
  28,
  87,
  66,
  10,
  26,
  54,
  32,
  38,
  83,
  89,
  49,
  93,
  1,
  22,
  85,
  78,
  31,
  34,
  7,
  8,
  3,
  46,
  27,
  25,
  61,
  16,
  69,
  40,
  76,
  39,
  88,
  42,
  75,
  91,
  23,
  2,
  71,
  90,
  80,
  44,
  92,
  72,
  96,
  86,
  62,
  33,
  67,
  73,
  77,
  18,
  21,
  74,
 

In [7]:
# Select the categorical columns that will get an embedding: those with more
# than two distinct values. Each selected column's cardinality is printed so
# the embedding dimensions can be chosen.
embed_cols = []

for c, vals in col_vals_dict.items():
    n_vals = len(vals)
    if n_vals <= 2:
        continue
    embed_cols.append(c)
    print(c + ': %d values' % n_vals)
print('\n')

ps_ind_02_cat: 5 values
ps_ind_04_cat: 3 values
ps_ind_05_cat: 8 values
ps_car_01_cat: 13 values
ps_car_02_cat: 3 values
ps_car_03_cat: 3 values
ps_car_04_cat: 10 values
ps_car_05_cat: 3 values
ps_car_06_cat: 18 values
ps_car_07_cat: 3 values
ps_car_09_cat: 6 values
ps_car_10_cat: 3 values
ps_car_11_cat: 104 values




In the code below, we define an embedding branch for each categorical variable selected above. The embedding size is set according to the rules given in the class. Each branch takes a single integer index as input, and we reshape the embedding output from shape (1, embedding size) into a 1-D vector of length embedding size.

In [8]:
# One entry per embedded categorical feature:
#   (column name, number of distinct categories, embedding width).
# The order must match `embed_cols`, because the preprocessing step builds
# the list of input arrays by iterating over `embed_cols` and the model
# consumes them positionally.
EMBEDDING_SPECS = [
    ('ps_ind_02_cat', 5, 3),
    ('ps_ind_04_cat', 3, 2),
    ('ps_ind_05_cat', 8, 5),
    ('ps_car_01_cat', 13, 7),
    ('ps_car_02_cat', 3, 2),
    ('ps_car_03_cat', 3, 2),
    ('ps_car_04_cat', 10, 5),
    ('ps_car_05_cat', 3, 2),
    ('ps_car_06_cat', 18, 8),
    ('ps_car_07_cat', 3, 2),
    ('ps_car_09_cat', 6, 3),
    ('ps_car_10_cat', 3, 2),
    ('ps_car_11_cat', 104, 10),
]

# Both lists are reset here, so re-running this cell starts from a clean state.
inputs = []
embeddings = []

for col_name, n_categories, embed_dim in EMBEDDING_SPECS:
    # Each categorical feature enters the network as one integer index.
    cat_input = Input(shape=(1,))
    # `input_length` is not passed: the Input shape already fixes the sequence
    # length to 1, and the argument is deprecated in newer Keras releases.
    embedding = Embedding(n_categories, embed_dim)(cat_input)
    # Drop the length-1 sequence axis: (None, 1, embed_dim) -> (None, embed_dim).
    embedding = Reshape(target_shape=(embed_dim,))(embedding)
    inputs.append(cat_input)
    embeddings.append(embedding)

In [9]:
# Show the embedding output tensors built so far (one per embedded column).
embeddings

[<tf.Tensor 'reshape/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_1/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_2/Identity:0' shape=(None, 5) dtype=float32>,
 <tf.Tensor 'reshape_3/Identity:0' shape=(None, 7) dtype=float32>,
 <tf.Tensor 'reshape_4/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_5/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_6/Identity:0' shape=(None, 5) dtype=float32>,
 <tf.Tensor 'reshape_7/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_8/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'reshape_9/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_10/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_11/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_12/Identity:0' shape=(None, 10) dtype=float32>]

In [10]:
# Show the matching input tensors (one per embedded column).
inputs

[<tf.Tensor 'input_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_13:0' shape=(None, 1) dtype=float32>]

## Data preprocessing - numericals

### Check missing values

In [11]:
# Count NaN entries per column. Per the data description above, missing
# values in this dataset are encoded as -1, which isnull() does not count.
X_train.isnull().sum()

ps_ind_01        0
ps_ind_02_cat    0
ps_ind_03        0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
ps_ind_09_bin    0
ps_ind_10_bin    0
ps_ind_11_bin    0
ps_ind_12_bin    0
ps_ind_13_bin    0
ps_ind_14        0
ps_ind_15        0
ps_ind_16_bin    0
ps_ind_17_bin    0
ps_ind_18_bin    0
ps_reg_01        0
ps_reg_02        0
ps_reg_03        0
ps_car_01_cat    0
ps_car_02_cat    0
ps_car_03_cat    0
ps_car_04_cat    0
ps_car_05_cat    0
ps_car_06_cat    0
ps_car_07_cat    0
ps_car_08_cat    0
ps_car_09_cat    0
ps_car_10_cat    0
ps_car_11_cat    0
ps_car_11        0
ps_car_12        0
ps_car_13        0
ps_car_14        0
ps_car_15        0
dtype: int64

### $\Omega$ Practice: rescale all numerical features 

In [12]:
#### please insert your code here ####




## Model

In [13]:
# Recap of the feature columns that will be fed to the model.
X_train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15'],
      dtype='object')

In [14]:
# Every column that is not routed through an embedding layer is fed to the
# network together as one block of plain inputs.
other_cols = [col for col in X_train.columns if col not in embed_cols]
len(other_cols)

24

In [15]:
# Single dense branch for all non-embedded columns. The input width is taken
# from `other_cols` rather than hard-coded, so it stays correct if the column
# selection changes.
input_numeric = Input(shape=(len(other_cols),))
embedding_numeric = Dense(16)(input_numeric)

# NOTE: this appends to the lists built in the embedding cell. Re-running this
# cell without first re-running that one adds a duplicate numeric branch.
inputs.append(input_numeric)
embeddings.append(embedding_numeric)

In [16]:
# The input list now ends with the numeric input.
inputs

[<tf.Tensor 'input_1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_13:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_14:0' shape=(None, 24) dtype=float32>]

In [24]:
# The branch list now ends with the dense projection of the numeric input.
embeddings

[<tf.Tensor 'reshape/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_1/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_2/Identity:0' shape=(None, 5) dtype=float32>,
 <tf.Tensor 'reshape_3/Identity:0' shape=(None, 7) dtype=float32>,
 <tf.Tensor 'reshape_4/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_5/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_6/Identity:0' shape=(None, 5) dtype=float32>,
 <tf.Tensor 'reshape_7/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_8/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'reshape_9/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_10/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_11/Identity:0' shape=(None, 2) dtype=float32>,
 <tf.Tensor 'reshape_12/Identity:0' shape=(None, 10) dtype=float32>,
 <tf.Tensor 'dense/Identity:0' shape=(None, 16) dtype=float32>]

## MLP model

In [17]:
# Merge every branch (categorical embeddings + numeric projection) into one
# feature vector.
x = Concatenate()(embeddings)

# Hidden stack: three ReLU layers, each followed by dropout.
# Each pair is (number of units, dropout rate).
for units, rate in ((80, .35), (20, .15), (10, .15)):
    x = Dense(units, activation='relu')(x)
    x = Dropout(rate)(x)

# Single sigmoid unit: one output value per row.
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs, output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Print the layer-by-layer structure of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________________

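Convert the data to a list of arrays, one per model input, to match the network structure. For each embedded column, map its sorted unique values to consecutive integer indices starting at 0.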

In [19]:
input_list_train = []

for c in embed_cols:
    # np.unique returns the sorted distinct values, so each value's position
    # in raw_vals becomes its integer index.
    raw_vals = np.unique(X_train[c])
    val_map = {raw_val: idx for idx, raw_val in enumerate(raw_vals)}
    input_list_train.append(X_train[c].map(val_map).values)

In [20]:
# Sanity check on the third embedded column: its indices should be contiguous
# and start at 0.
np.unique(input_list_train[2])

array([0, 1, 2, 3, 4, 5, 6, 7])

Let's package it into a function

In [21]:
def preproc(X_train, X_fit=None, cat_cols=None):
    """Convert a feature frame into the list of arrays the model expects.

    The result holds one integer-index array per embedded categorical column
    (in `cat_cols` order), followed by a single 2-D array containing all
    remaining columns.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to encode. Despite the name, any frame with the same columns
        (e.g. the test set) can be passed.
    X_fit : pandas.DataFrame, optional
        Frame from which the value -> index mapping of each categorical
        column is learned. Defaults to `X_train` itself, which reproduces the
        original behaviour. Pass the training frame here when encoding other
        data so that both use the same indices.
    cat_cols : list of str, optional
        Columns to index-encode. Defaults to the notebook-level `embed_cols`.

    Returns
    -------
    list of numpy.ndarray
        `len(cat_cols)` 1-D index arrays followed by one 2-D array of the
        remaining columns.

    Notes
    -----
    A value of `X_train` that does not occur in `X_fit` has no index and is
    mapped to NaN by `Series.map`.
    """
    if X_fit is None:
        X_fit = X_train
    if cat_cols is None:
        cat_cols = embed_cols

    input_list_train = []

    # The cols to be embedded: rescaling to range [0, # values).
    for c in cat_cols:
        # np.unique returns sorted distinct values, so position == index.
        raw_vals = np.unique(X_fit[c])
        val_map = {raw_val: idx for idx, raw_val in enumerate(raw_vals)}
        input_list_train.append(X_train[c].map(val_map).values)

    # The rest of the columns go in unchanged as one 2-D block.
    other_cols = [c for c in X_train.columns if c not in cat_cols]
    input_list_train.append(X_train[other_cols].values)

    return input_list_train

In [22]:
# Build the model inputs from the training frame and display them.
proc_X_train_f = preproc(X_train)
proc_X_train_f

[array([2, 1, 4, ..., 1, 2, 1]),
 array([2, 1, 2, ..., 1, 2, 1]),
 array([1, 1, 1, ..., 1, 1, 1]),
 array([11, 12,  8, ...,  8, 12,  8]),
 array([2, 2, 2, ..., 2, 2, 1]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([2, 0, 0, ..., 0, 0, 0]),
 array([ 4, 11, 14, ...,  1, 11,  0]),
 array([2, 2, 2, ..., 2, 2, 2]),
 array([1, 3, 3, ..., 3, 3, 3]),
 array([1, 1, 1, ..., 1, 1, 1]),
 array([ 11,  18,  59, ...,  30, 100,  33]),
 array([[ 2.        ,  5.        ,  0.        , ...,  0.88367892,
          0.37080992,  3.60555128],
        [ 1.        ,  7.        ,  0.        , ...,  0.61881652,
          0.38871583,  2.44948974],
        [ 5.        ,  9.        ,  0.        , ...,  0.64158572,
          0.34727511,  3.31662479],
        ...,
        [ 1.        , 10.        ,  1.        , ...,  0.59637334,
          0.39874804,  1.73205081],
        [ 5.        ,  3.        ,  0.        , ...,  0.76443411,
          0.38496753,  3.16227766],
        [ 0.        ,  8

In [23]:
# Train for 3 epochs with mini-batches of 4096 rows.
model.fit(
    proc_X_train_f,
    y_train.values,
    epochs=3,
    batch_size=4096,
)

W0824 08:34:44.864946 140735802639232 deprecation.py:323] From /Users/jodie/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 595212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1a4619f910>