In [1]:
#################################### AllState Claims Severity ############################################################

# Below is function to encode categorical variables with high cardinality into numeric values such that they can 
# used in modeling exercises. The technique has been inspired from Owen Zhang's method of dealing with categorical variables
# with high cardinality


# Reading in training and test data

import pandas as pd
import numpy as np
import matplotlib as plt
%pylab inline
df_train = pd.read_csv("C:/Users/HP/Desktop/Kaggle/All State Severity Claims/train.csv", index_col='id')
df_test = pd.read_csv("C:/Users/HP/Desktop/Kaggle/All State Severity Claims/test.csv", index_col='id')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Getting all continuous features into a separate dataset

contfeatures = df_train.select_dtypes(include=["float64"])

In [5]:
# Getting all categorical features into a separate dataset
catfeatures = df_train.select_dtypes(include=["object"])


In [6]:
catfeatures_list = list(catfeatures)


In [7]:
# We can possibly feed categorical variables with less or eq 10 levels direclty into our model.
# But, cat variables with >10 levels have to be feature engineered so that their effects can be included into the model
catvarbs_10 = list((df_train[catfeatures_list].apply(pd.Series.nunique)>10))

catvarlist = []
for (i, v) in zip(catfeatures_list, catvarbs_10):
    if(v):
        catvarlist.append(i)

In [8]:
print(catvarlist)

['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116']


In [9]:
# WE append 'loss' variable to the cat varb dataset to compute means and variance

catvarlist.append('loss')
df_cat_encod = df_train[catvarlist]
df_cat_encod.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,T,B,G,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,2213.18
2,T,L,F,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,1283.6
5,D,L,O,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,3005.09
10,T,I,D,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,939.85
11,P,F,J,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,2763.85


In [10]:
#before running our function to encode, we need to ensure that the list of char variables which we pass to the function
#does not the 'loss' variable in it

catvarlist.remove('loss')
catvarlist
target=['loss']

In [11]:
df_cat_encod.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,T,B,G,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,2213.18
2,T,L,F,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,1283.6
5,D,L,O,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,3005.09
10,T,I,D,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,939.85
11,P,F,J,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,2763.85


In [12]:
# We define a function which will flatten a multi index column names which are created after aggregation of data
# This will be useful after creating mean & standard dev of categorical variable levels


def flattenHierarchicalCol(col,sep = ','):
    if not type(col) is tuple:
        return col
    else:
        new_col = ''
        for leveli,level in enumerate(col):
            if not level == '':
                if not leveli == 0:
                    new_col += sep
                new_col += level
        return new_col

In [13]:
# The function below computes the mean and std dev of the target variable across each level of each categorical variable
# identified and creates two separate features. This can instead be used as a continuous feature in any models we build
# We add the std dev too so as to introduce some random variation/noise into the data
def cat_encoding(list, dataframe, target):
    for i in range(len(list)):
        group_df = dataframe.groupby([list[i]], as_index=False).agg({target:{"mean"+list[i]:'mean', 
                                                                    "stdev"+list[i]:'std'}})
        dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
    
    dataframe.columns = dataframe.columns.map(flattenHierarchicalCol)
    return dataframe

In [14]:
cat_encoded = cat_encoding(catvarlist,df_cat_encod,target[0])

  self.right = self.right.drop(right_drop, axis=1)


In [16]:
cat_encoded.head(5)

# Mean and std dev of all categorical variables identified have been computed and returned as a separate dataset which can be joined
# to our original training set. The same mean & std dev values can be used to transform the same variables in the test set

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,"loss,stdevcat112","loss,meancat112","loss,meancat113","loss,stdevcat113","loss,stdevcat114","loss,meancat114","loss,stdevcat115","loss,meancat115","loss,stdevcat116","loss,meancat116"
0,T,B,G,A,I,E,G,J,G,BU,...,3040.851201,3409.983392,2744.910924,2475.689837,3005.511481,3259.916396,2699.909989,2948.525441,2545.417315,2917.5232
1,T,L,F,A,E,E,I,K,K,BI,...,2349.891371,2380.8509,2874.471697,2581.380533,3005.511481,3259.916396,2699.909989,2948.525441,2837.734327,3107.697517
2,D,L,O,B,E,F,H,F,A,AB,...,3124.220044,3250.374479,2942.861834,2506.936938,3005.511481,3259.916396,3208.069463,2991.019257,2131.3728,2695.767964
3,T,I,D,A,E,E,I,K,K,BI,...,2846.254422,3043.584628,2907.740038,2558.986877,3005.511481,3259.916396,2699.909989,2948.525441,2562.341762,2911.900687
4,P,F,J,A,D,E,K,G,B,H,...,2447.238321,2837.032428,2874.471697,2581.380533,3005.511481,3259.916396,2741.299127,3016.756872,2445.806699,3037.328947


In [18]:
names = cat_encoded.columns
names

Index(['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106',
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116', 'loss', 'loss,stdevcat99',
       'loss,meancat99', 'loss,stdevcat100', 'loss,meancat100',
       'loss,stdevcat101', 'loss,meancat101', 'loss,meancat103',
       'loss,stdevcat103', 'loss,stdevcat104', 'loss,meancat104',
       'loss,stdevcat105', 'loss,meancat105', 'loss,meancat106',
       'loss,stdevcat106', 'loss,stdevcat107', 'loss,meancat107',
       'loss,meancat108', 'loss,stdevcat108', 'loss,stdevcat109',
       'loss,meancat109', 'loss,stdevcat110', 'loss,meancat110',
       'loss,stdevcat111', 'loss,meancat111', 'loss,stdevcat112',
       'loss,meancat112', 'loss,meancat113', 'loss,stdevcat113',
       'loss,stdevcat114', 'loss,meancat114', 'loss,stdevcat115',
       'loss,meancat115', 'loss,stdevcat116', 'loss,meancat116'],
      dtype='object')

In [20]:
del cat_encoded['loss']

In [21]:
# Removing the word 'loss' from the left of the newly created columns

cat_encoded.rename(columns = lambda x: x.replace('loss,',''), inplace=True)

In [23]:
cat_encoded.columns

Index(['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106',
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116', 'stdevcat99', 'meancat99', 'stdevcat100',
       'meancat100', 'stdevcat101', 'meancat101', 'meancat103', 'stdevcat103',
       'stdevcat104', 'meancat104', 'stdevcat105', 'meancat105', 'meancat106',
       'stdevcat106', 'stdevcat107', 'meancat107', 'meancat108', 'stdevcat108',
       'stdevcat109', 'meancat109', 'stdevcat110', 'meancat110', 'stdevcat111',
       'meancat111', 'stdevcat112', 'meancat112', 'meancat113', 'stdevcat113',
       'stdevcat114', 'meancat114', 'stdevcat115', 'meancat115', 'stdevcat116',
       'meancat116'],
      dtype='object')

In [30]:
# Taking the same categorical variables we encoded in train set from test set

cat_encod_test = df_test[catvarlist]
cat_encod_test.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
4,T,H,G,A,G,E,I,L,K,BI,BC,A,J,AX,A,Q,HG
6,P,B,D,A,G,G,G,F,B,BI,CO,E,G,X,A,L,HK
9,D,G,Q,D,D,E,J,G,A,BI,CS,C,U,AE,A,K,CK
12,T,G,A,D,E,E,I,K,K,BI,CR,A,AY,AJ,A,P,DJ
15,P,A,A,A,F,E,G,E,B,AB,EG,A,E,I,C,J,HA


In [31]:
cat_encod_test = cat_encod_test.reset_index()


In [32]:
del cat_encod_test['id']

In [44]:
cat_encoded.head(5)

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,stdevcat112,meancat112,meancat113,stdevcat113,stdevcat114,meancat114,stdevcat115,meancat115,stdevcat116,meancat116
0,T,B,G,A,I,E,G,J,G,BU,...,3040.851201,3409.983392,2744.910924,2475.689837,3005.511481,3259.916396,2699.909989,2948.525441,2545.417315,2917.5232
1,T,L,F,A,E,E,I,K,K,BI,...,2349.891371,2380.8509,2874.471697,2581.380533,3005.511481,3259.916396,2699.909989,2948.525441,2837.734327,3107.697517
2,D,L,O,B,E,F,H,F,A,AB,...,3124.220044,3250.374479,2942.861834,2506.936938,3005.511481,3259.916396,3208.069463,2991.019257,2131.3728,2695.767964
3,T,I,D,A,E,E,I,K,K,BI,...,2846.254422,3043.584628,2907.740038,2558.986877,3005.511481,3259.916396,2699.909989,2948.525441,2562.341762,2911.900687
4,P,F,J,A,D,E,K,G,B,H,...,2447.238321,2837.032428,2874.471697,2581.380533,3005.511481,3259.916396,2741.299127,3016.756872,2445.806699,3037.328947


In [45]:
cat_encoded2 = cat_encoded

In [46]:
cat_encoded2 = cat_encoded2.drop(cat_encoded2[catvarlist],axis=1)


In [48]:
cat_encoded2.head(5)

Unnamed: 0,stdevcat99,meancat99,stdevcat100,meancat100,stdevcat101,meancat101,meancat103,stdevcat103,stdevcat104,meancat104,...,stdevcat112,meancat112,meancat113,stdevcat113,stdevcat114,meancat114,stdevcat115,meancat115,stdevcat116,meancat116
0,2932.563972,3067.992359,2583.974161,3090.589334,2779.116912,3450.680947,2814.648335,2730.730664,3360.386503,3014.904158,...,3040.851201,3409.983392,2744.910924,2475.689837,3005.511481,3259.916396,2699.909989,2948.525441,2545.417315,2917.5232
1,2932.563972,3067.992359,3048.822161,4005.581714,2976.537533,3560.151861,2814.648335,2730.730664,2645.879582,2981.080942,...,2349.891371,2380.8509,2874.471697,2581.380533,3005.511481,3259.916396,2699.909989,2948.525441,2837.734327,3107.697517
2,3379.862054,3403.895737,3048.822161,4005.581714,4215.629488,6870.387172,3078.89028,2956.262034,2645.879582,2981.080942,...,3124.220044,3250.374479,2942.861834,2506.936938,3005.511481,3259.916396,3208.069463,2991.019257,2131.3728,2695.767964
3,2932.563972,3067.992359,1720.6643,1970.402509,2711.813767,2812.990306,2814.648335,2730.730664,2645.879582,2981.080942,...,2846.254422,3043.584628,2907.740038,2558.986877,3005.511481,3259.916396,2699.909989,2948.525441,2562.341762,2911.900687
4,2772.785165,2993.899862,2942.289968,3200.09894,3272.064591,4603.86379,2814.648335,2730.730664,2465.145125,2970.460095,...,2447.238321,2837.032428,2874.471697,2581.380533,3005.511481,3259.916396,2741.299127,3016.756872,2445.806699,3037.328947


In [49]:
onlystdev = cat_encoded2.filter(like='stdev', axis=1)
onlystdev.head(5)

Unnamed: 0,stdevcat99,stdevcat100,stdevcat101,stdevcat103,stdevcat104,stdevcat105,stdevcat106,stdevcat107,stdevcat108,stdevcat109,stdevcat110,stdevcat111,stdevcat112,stdevcat113,stdevcat114,stdevcat115,stdevcat116
0,2932.563972,2583.974161,2779.116912,2730.730664,3360.386503,2440.20316,2741.305845,3018.081009,2827.247746,2102.608022,2151.528924,3046.972088,3040.851201,2475.689837,3005.511481,2699.909989,2545.417315
1,2932.563972,3048.822161,2976.537533,2730.730664,2645.879582,2440.20316,2440.269065,2781.060283,2713.847698,3076.370083,3032.288518,2709.744506,2349.891371,2581.380533,3005.511481,2699.909989,2837.734327
2,3379.862054,3048.822161,4215.629488,2956.262034,2645.879582,2881.047093,2565.659203,2923.395751,2537.371903,1900.096355,2537.056515,2709.744506,3124.220044,2506.936938,3005.511481,3208.069463,2131.3728
3,2932.563972,1720.6643,2711.813767,2730.730664,2645.879582,2440.20316,2440.269065,2781.060283,2713.847698,3076.370083,2850.364645,3046.972088,2846.254422,2558.986877,3005.511481,2699.909989,2562.341762
4,2772.785165,2942.289968,3272.064591,2730.730664,2465.145125,2440.20316,2098.58023,2833.424198,2416.835931,1029.312229,1699.755843,3046.972088,2447.238321,2581.380533,3005.511481,2741.299127,2445.806699


In [50]:
stdev_names  = onlystdev.columns

In [51]:
onlymean = cat_encoded2.filter(like='mean', axis=1)
mean_names = onlymean.columns
mean_names

Index(['meancat99', 'meancat100', 'meancat101', 'meancat103', 'meancat104',
       'meancat105', 'meancat106', 'meancat107', 'meancat108', 'meancat109',
       'meancat110', 'meancat111', 'meancat112', 'meancat113', 'meancat114',
       'meancat115', 'meancat116'],
      dtype='object')

In [52]:
stdev_names.sort
mean_names.sort

<bound method Index.sort of Index(['meancat99', 'meancat100', 'meancat101', 'meancat103', 'meancat104',
       'meancat105', 'meancat106', 'meancat107', 'meancat108', 'meancat109',
       'meancat110', 'meancat111', 'meancat112', 'meancat113', 'meancat114',
       'meancat115', 'meancat116'],
      dtype='object')>

In [53]:
# Getting a dictionary based on training set encoding and mapping the same encoding to our test dataset


for i in range(len(catvarlist)):
    mydict = dict(zip(cat_encoded[catvarlist[i]], cat_encoded[mean_names[i]]))
    cat_encod_test[mean_names[i]] = cat_encod_test[catvarlist[i]].map(mydict)
    mydict2 = dict(zip(cat_encoded[catvarlist[i]], cat_encoded[stdev_names[i]]))
    cat_encod_test[stdev_names[i]] = cat_encod_test[catvarlist[i]].map(mydict2)


In [54]:
cat_encod_test.head(5)

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,meancat112,stdevcat112,meancat113,stdevcat113,meancat114,stdevcat114,meancat115,stdevcat115,meancat116,stdevcat116
0,T,H,G,A,G,E,I,L,K,BI,...,2832.797108,3145.599768,3095.98886,2758.65092,3259.916396,3005.511481,3091.162019,2866.499638,2709.464662,2334.144596
1,P,B,D,A,G,G,G,F,B,BI,...,3250.231455,2763.894153,2987.115282,2662.0811,3259.916396,3005.511481,3054.095207,2626.677779,3125.668896,3019.687107
2,D,G,Q,D,D,E,J,G,A,BI,...,2594.619455,2568.184866,2907.740038,2558.986877,3259.916396,3005.511481,3016.756872,2741.299127,3037.328947,2445.806699
3,T,G,A,D,E,E,I,K,K,BI,...,2787.467666,2389.969241,2923.313806,2619.0894,3259.916396,3005.511481,3025.422366,2875.953575,2911.900687,2562.341762
4,P,A,A,A,F,E,G,E,B,AB,...,3208.40928,3151.852258,2454.817139,2227.727317,2070.594386,2237.659507,3140.75002,3212.084563,1995.420482,1296.275266
