# Approaching Categorical Variables

In [1]:
import pandas as pd 
df = pd.read_csv("cat_train.csv")
df.sample(10)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
42861,42861,0.0,1.0,0.0,F,N,Blue,Circle,Hamster,Costa Rica,...,db98264ef,1.0,Novice,,e,X,MV,6.0,2.0,0
258103,258103,0.0,0.0,0.0,T,Y,Red,Trapezoid,Axolotl,Finland,...,0d5573e59,1.0,Novice,Boiling Hot,e,U,th,4.0,3.0,0
592454,592454,0.0,0.0,0.0,F,Y,Red,Circle,Lion,India,...,004b4670d,2.0,Expert,Warm,k,Q,oh,5.0,1.0,0
539892,539892,0.0,0.0,0.0,F,Y,Blue,Square,Lion,Russia,...,0d5573e59,3.0,,Cold,j,H,WW,5.0,8.0,1
313607,313607,0.0,0.0,1.0,F,N,Red,Polygon,Hamster,Costa Rica,...,a8ac5fac2,2.0,Contributor,Lava Hot,f,Y,DT,1.0,1.0,1
114748,114748,0.0,0.0,0.0,F,Y,Red,Circle,Axolotl,India,...,9fbf93590,2.0,Master,Warm,c,T,oh,,3.0,0
467565,467565,0.0,0.0,0.0,F,Y,Blue,Trapezoid,Lion,Costa Rica,...,836203022,1.0,Grandmaster,Freezing,e,Y,fF,7.0,7.0,0
555169,555169,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Finland,...,266a69792,1.0,Expert,Cold,a,K,RT,5.0,7.0,0
272394,272394,0.0,0.0,0.0,F,Y,Blue,Circle,Axolotl,Costa Rica,...,b29c5465a,3.0,Novice,Freezing,n,P,Hk,3.0,5.0,0
397322,397322,0.0,0.0,0.0,F,,Red,Triangle,Axolotl,,...,ea8b5bac0,1.0,Contributor,Cold,k,C,JT,1.0,4.0,0


In [2]:
# View unique values in column ord_2
df['ord_2'].unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [3]:
# View the value counts before mapping
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

### Most important ways to handle categorical variables
1. Label Encoding
2. Sparse Matrices
3. One Hot Encoding

### Label Encoding

In [4]:
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5
}

In [5]:
# Label Encoding - Convert categories to numbers
df.loc[:, "ord_2"] = df.ord_2.map(mapping)

# View the value counts after mapping
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [6]:
# Label Encoding using scikit-learn
import pandas as pd 
from sklearn import preprocessing

# read the data
df = pd.read_csv("cat_train.csv")

# fill the NaN values in ord_2 column (because LabelEncoder of scikit-learn does not handle NaN values)
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")

# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# fit label encoder and transform values on ord_2 column
# P.S.: do not use this directly, fit first and then transform
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)

df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,3,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,6,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,2,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,4,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,1,h,C,OZ,5.0,12.0,0


### Sparse Matrices

In [7]:
# size of a numpy array
import numpy as np

# create our example feature matrix
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# print size in bytes
print(example.nbytes)

36


In [8]:
# convert numpy array to sparse matrix
import numpy as np
from scipy import sparse

# create our example feature matrix
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# print size of this sparse matrix
print(sparse_example.data.nbytes)

16


In [9]:
# total size of sparse csr matrix
print(
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes + 
    sparse_example.indices.nbytes
)

48


In [10]:
# Comparison of sizes of dense and sparse arrays
import numpy as np 
from scipy import sparse 

# number of rows
n_rows = 10000

# number of columns
n_cols = 100000

# create random binary matrix with only 5% values as 1s
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))

# print size in bytes
print(f"Size of dense array: {example.nbytes}")

# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# print size of this sparse matrix
print(f"Size of sparse array: {sparse_example.data.nbytes}")

full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes + 
    sparse_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 4000000000
Size of sparse array: 200000148
Full size of sparse array: 400040300


### One Hot Encoding

In [11]:
# Comparison of sizes of dense and binarized arrays
import numpy as np 
from scipy import sparse 

# create binary matrix
example = np.array(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]
    ]
)

# print size in bytes
print(f"Size of dense array: {example.nbytes}")

# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# print size of this sparse matrix
print(f"Size of sparse array: {sparse_example.data.nbytes}")

full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes + 
    sparse_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
Size of sparse array: 12
Full size of sparse array: 40


In [12]:
# Use scikit-learn's OneHotEncoder to transform a much larger feature array with 1001 categories
import numpy as np 
from sklearn import preprocessing

# create random 1D array with 1001 different categories (int)
example = np.random.randint(1000, size=1000000)

# initialize OneHotEncoder from scikit-learn
# keep sparse = False to get dense array
ohe = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# keep sparse = True to get sparse array
ohe = preprocessing.OneHotEncoder(sparse=True)

# fit and transform data with sparse one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes for sparse array
print(f"Size of sparse array: {ohe_example.data.nbytes}")

full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004


### Other ways to handle categorical variables:

#### - Converting categorical variables to numerical variables

In [13]:
# shape of dataframe where 'ord_2' column has value 'Boiling Hot'
import pandas as pd 
df = pd.read_csv("cat_train.csv")
df[df.ord_2 == 'Boiling Hot'].shape

(84790, 25)

In [14]:
# calculate above with pandas groupby function
df.groupby("ord_2")["id"].count()

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
Warm           124239
Name: id, dtype: int64

In [15]:
# replace ord_2 column with its count values to convert it to numerical column (using transform function of pandas)
df.groupby("ord_2")["id"].transform("count")

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [16]:
# we can also group by multiple columns and their counts
df.groupby(
    [
        "ord_1",
        "ord_2"
    ]
)["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,Warm,22774
6,Expert,Boiling Hot,19477
7,Expert,Cold,22956
8,Expert,Freezing,33249
9,Expert,Hot,15792


#### - Create new categorical features from the existing ones

In [17]:
# concatenating names of ord_1 and ord_2 columns using an underscore
df["new_feature"] = (
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
)
df.new_feature

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

In [18]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,new_feature
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,Contributor_Hot
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,Grandmaster_Warm
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,3.0,,Freezing,n,P,eN,5.0,9.0,0,nan_Freezing
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,Novice_Lava Hot
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,Grandmaster_Cold


In [19]:
# we can also combine three or four or even more features
df["new_feature"] = (
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
    + "_"
    + df.ord_3.astype(str)
)
df.new_feature

0                 Contributor_Hot_c
1                Grandmaster_Warm_e
2                    nan_Freezing_n
3                 Novice_Lava Hot_a
4                Grandmaster_Cold_h
                    ...            
599995            Novice_Freezing_a
599996         Novice_Boiling Hot_n
599997       Contributor_Freezing_n
599998                Master_Warm_m
599999    Contributor_Boiling Hot_b
Name: new_feature, Length: 600000, dtype: object

### Handling NaN values

#### 1. Drop them

In [20]:
# Load the data
df = pd.read_csv("cat_train.csv")

# View the number of NaN values
df.isna().sum()

id            0
bin_0     17894
bin_1     18003
bin_2     17930
bin_3     18014
bin_4     18047
nom_0     18252
nom_1     18156
nom_2     18035
nom_3     18121
nom_4     18035
nom_5     17778
nom_6     18131
nom_7     18003
nom_8     17755
nom_9     18073
ord_0     18288
ord_1     18041
ord_2     18075
ord_3     17916
ord_4     17930
ord_5     17713
day       17952
month     17988
target        0
dtype: int64

In [21]:
# drop NaN values
df.dropna(inplace=True)

# View number of NaN values
df.isna().sum() 

id        0
bin_0     0
bin_1     0
bin_2     0
bin_3     0
bin_4     0
nom_0     0
nom_1     0
nom_2     0
nom_3     0
nom_4     0
nom_5     0
nom_6     0
nom_7     0
nom_8     0
nom_9     0
ord_0     0
ord_1     0
ord_2     0
ord_3     0
ord_4     0
ord_5     0
day       0
month     0
target    0
dtype: int64

#### 2. Treat NaN values as a completely new category

In [22]:
# Load the data
df = pd.read_csv("cat_train.csv")

# View the value counts of ord_2 column
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [23]:
# fill NaNs with None
df.ord_2.fillna("NONE").value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [24]:
# Label Encoding the train and test data
import pandas as pd 
from sklearn import preprocessing

# read training data
train = pd.read_csv("cat_train.csv")

# read test data
test = pd.read_csv("cat_test.csv")

# create a fake target column for test data
# since this column doesn't exist
test.loc[:, "target"] = -1

# concatenate both training and test data
data = pd.concat([train, test]).reset_index(drop=True)

# make a list of features we are interested in 
# id and target is something we should not encode
features = [x for x in train.columns if x not in ["id","target"]]

# loop over the feature list
for feat in features:
    # create a new instance of LabelEncoder for each feature
    lbl_enc = preprocessing.LabelEncoder()
    
    # note the trick here
    # since its categorical data, we fillna with a string
    # and we convert all the data to string type
    # so, no matter its int or float, its converted to string
    # int/float but categorical !!!
    temp_col = data[feat].fillna("NONE").astype(str).values
    
    # we can use fit_transform here as we do not have any extra test data that we need to transform on separately
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
    
# split the training and test data again
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [25]:
# View train data
train.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
528156,528156,0,0,0,0,2,0,6,5,6,...,1708,1,2,2,2,25,159,4,5,0
420837,420837,1,0,0,0,2,3,5,0,6,...,1998,3,5,2,11,2,46,7,3,0
113725,113725,0,0,0,2,2,3,5,0,4,...,1002,1,0,4,3,21,22,6,7,0
149574,149574,0,0,1,0,0,3,2,6,3,...,179,0,1,6,2,15,21,5,3,0
63070,63070,1,0,0,2,2,3,0,3,6,...,1216,0,1,2,14,23,135,6,5,0


In [26]:
# View test data
test.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
17163,617163,0,1,1,0,2,2,5,0,6,...,614,1,0,2,9,13,113,3,3,-1
276786,876786,0,0,1,2,0,0,0,2,2,...,705,0,1,4,14,10,182,4,10,-1
70615,670615,0,0,0,0,2,1,2,3,6,...,1965,2,5,2,13,4,76,0,0,-1
31183,631183,0,0,0,2,0,0,2,3,2,...,1802,0,1,3,8,8,155,1,3,-1
68979,668979,0,0,0,2,2,3,0,3,3,...,1008,1,1,6,13,25,72,6,5,-1


In [27]:
# View categories in ord_4 column
df.ord_4.fillna("NONE").value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

- We define a value `rare` if it is less than 2000

In [28]:
df.ord_4 = df.ord_4.fillna("NONE")

df.loc[
    df["ord_4"].value_counts()[df["ord_4"]].values < 2000, "ord_4"
] = "RARE"

df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

## Build a model

### `create_folds.py`

In [29]:
# create_folds.py
import pandas as pd 
from sklearn import model_selection

if __name__ == "__main__":
    
    # read training data
    df = pd.read_csv("cat_train.csv")
    
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1 
    
    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    # fetch labels
    y = df.target.values
    
    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # fill the new kfold column
    for f,(t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
        
    # save the new csv with kfold column
    df.to_csv("cat_train_folds.csv", index=False)

In [30]:
df.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
397030,377488,,1.0,1.0,F,N,Blue,Circle,Dog,Costa Rica,...,2.0,Expert,Freezing,i,F,DI,1.0,3.0,0,3
441118,64968,0.0,0.0,0.0,F,Y,Blue,Polygon,Snake,India,...,1.0,Novice,Cold,c,G,qN,1.0,12.0,0,3
483088,217612,0.0,0.0,0.0,T,N,Blue,Square,Hamster,Finland,...,2.0,,Hot,o,K,Hk,5.0,5.0,0,4
224420,351617,0.0,1.0,1.0,T,N,Blue,Triangle,Axolotl,India,...,1.0,Contributor,Freezing,h,C,Ty,5.0,6.0,0,1
35462,563802,0.0,1.0,0.0,T,N,Blue,Triangle,Dog,Finland,...,3.0,,Cold,m,P,,6.0,3.0,0,0


In [31]:
# Check our new folds csv to see the number of samples per fold
import pandas as pd 

df = pd.read_csv("cat_train_folds.csv")

df.kfold.value_counts()

4    120000
3    120000
2    120000
1    120000
0    120000
Name: kfold, dtype: int64

In [32]:
# Check target distribution per fold
df[df.kfold==0].target.value_counts()

0    97536
1    22464
Name: target, dtype: int64

In [33]:
df[df.kfold==1].target.value_counts()

0    97536
1    22464
Name: target, dtype: int64

In [34]:
df[df.kfold==2].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

In [35]:
df[df.kfold==3].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

In [36]:
df[df.kfold==4].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

### *Simple Model*: One Hot Encoding + Logistic Regression

### `ohe_logres.py`

In [37]:
# ohe_logres.py

import pandas as pd 

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    # print auc
    print(auc)

if __name__ == "__main__":
    # run function for fold = 0
    # we can just replace this number and run this for any fold
    run(0)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7876915259114206


In [38]:
# Change the code to run it for all folds
# ohe_logres.py

import warnings
warnings.filterwarnings("ignore")

import pandas as pd 

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7876915259114206
Fold = 1, AUC = 0.7854353428375456
Fold = 2, AUC = 0.7871695062046414
Fold = 3, AUC = 0.7849085782477077
Fold = 4, AUC = 0.7856094501096818


### *Another Model*: Label Encoding + Random Forest

### `lbl_rf.py`

In [39]:
# lbl_rf.py

import pandas as pd 

from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        # initialize LabelEncoder for each column
        lbl = preprocessing.LabelEncoder()
        
        # fit label encoder on all data
        lbl.fit(df[col])
        
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize Random Forest model
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # fit model on training data
    model.fit(x_train, df_train.target.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7165689631333727
Fold = 1, AUC = 0.7142999617684446
Fold = 2, AUC = 0.7180122804792258
Fold = 3, AUC = 0.7164325180579998
Fold = 4, AUC = 0.7161191295548788


### *Another model*: One Hot Encoding + fitting TruncatedSVD on sparse matrix with training + validation data

### `ohe_svd_rf.py`

In [40]:
# ohe_svd_rf.py

# lbl_rf.py

import pandas as pd 

from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data[features])

    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Truncated SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)
    
    # fit SVD on full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    # transform sparse training data
    x_train = svd.transform(x_train)
    
    # transform sparse validation data
    x_valid = svd.transform(x_valid)
    
    # initialize Random Forest model
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7086326418328672
Fold = 1, AUC = 0.7052239907671172
Fold = 2, AUC = 0.7081827246386388
Fold = 3, AUC = 0.7067629228294052
Fold = 4, AUC = 0.7095767554254209


### *Another Model*: Label Encoding + XGBoost

### `lbl_xgb.py`

In [41]:
# lbl_rf.py

import pandas as pd 

import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        # initialize LabelEncoder for each column
        lbl = preprocessing.LabelEncoder()
        
        # fit label encoder on all data
        lbl.fit(df[col])
        
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.target.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7623592364471694
Fold = 1, AUC = 0.7614065212034647
Fold = 2, AUC = 0.7627550141479342
Fold = 3, AUC = 0.760773772581606
Fold = 4, AUC = 0.7634291002113744


### Now, change the dataset and use US Adult Census Data

In [47]:
import pandas as pd 

df = pd.read_csv("adult.csv")

df.income.value_counts() 

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [48]:
df.income.value_counts(normalize=True) 

<=50K    0.75919
>50K     0.24081
Name: income, dtype: float64

- About 24% of the total number of samples have income greater than 50K USD.
- So, we will use AUC as our metric

In [49]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### Step 1: Create Folds

### `create_folds.py`

In [50]:
# create_folds.py
import pandas as pd 
from sklearn import model_selection

if __name__ == "__main__":
    
    # read training data
    df = pd.read_csv("adult.csv")
    
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1 
    
    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    # fetch labels
    y = df.income.values
    
    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # fill the new kfold column
    for f,(t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
        
    # save the new csv with kfold column
    df.to_csv("adult_folds.csv", index=False)

In [51]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,kfold
0,39,Private,237943,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,99999,0,70,United-States,>50K,0
1,46,Federal-gov,199925,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
2,17,Private,175465,10th,6,Never-married,Other-service,Own-child,White,Female,0,0,14,United-States,<=50K,0
3,21,Private,118712,Assoc-voc,11,Never-married,Craft-repair,Own-child,White,Male,0,1504,40,United-States,<=50K,0
4,35,Private,215323,Assoc-voc,11,Divorced,Other-service,Unmarried,White,Female,0,0,35,United-States,<=50K,0


### *Simple Model*: One Hot Encoding + Logistic Regression

### `ohe_logres.py`

In [56]:
# ohe_logres.py

import pandas as pd 

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "income", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data[features])
    
    # transform training data
    x_train = ohe.transform(df_train[features])
    
    # transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)   

Fold = 0, AUC = 0.9281433250622395
Fold = 1, AUC = 0.9274763189559805
Fold = 2, AUC = 0.9257037235403871
Fold = 3, AUC = 0.9215594763390793
Fold = 4, AUC = 0.9199870823632852


### *Another Model*: Label Encoding + XGBoost

### `lbl_xgb.py`

In [57]:
# lbl_xgb.py

import pandas as pd 
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # drop numerical columns
    df = df.drop(num_cols, axis=1)
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        # initialize LabelEncoder for each column
        lbl = preprocessing.LabelEncoder()
        
        # fit label encoder on all data
        lbl.fit(df[col])
        
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.8823870046883284
Fold = 1, AUC = 0.8735180965590119
Fold = 2, AUC = 0.8769075894921075
Fold = 3, AUC = 0.8682344296942077
Fold = 4, AUC = 0.8705387456863813


In [58]:
# Increase max_depth to 7 and n_estimators to 200
# lbl_xgb.py

import pandas as pd 
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # drop numerical columns
    df = df.drop(num_cols, axis=1)
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        # initialize LabelEncoder for each column
        lbl = preprocessing.LabelEncoder()
        
        # fit label encoder on all data
        lbl.fit(df[col])
        
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.8770177550064868
Fold = 1, AUC = 0.8658947043623276
Fold = 2, AUC = 0.8690370413942277
Fold = 3, AUC = 0.8602539771976753
Fold = 4, AUC = 0.8621421528259362


In [59]:
# Now, we include the numerical columns in the XGBoost model
# lbl_xgb_num.py

import pandas as pd 
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        if col not in num_cols:

            # initialize LabelEncoder for each column
            lbl = preprocessing.LabelEncoder()

            # fit label encoder on all data
            lbl.fit(df[col])

            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.9280679750877128
Fold = 1, AUC = 0.9262547289850407
Fold = 2, AUC = 0.926675319703124
Fold = 3, AUC = 0.9236557866471831
Fold = 4, AUC = 0.9222465723243182


### Now, we take all the categorical columns and create all combinations of degree two.

In [62]:
# lbl_xgb_num_feat.py
import itertools
import pandas as pd 
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def feature_engineering(df, cat_cols):
    """
    This function is used for feature engineering
    :param df: the pandas dataframe with train/test data
    :param cat-cols: list if categorical columns
    :return: dataframe with new features
    """
    # this will create all 2-combinations of values in this list
    # for example:
    # list(itertools.combinations([1,2,3], 2)) will return
    # [(1,2), (1,3), (2,3)]
    combi = list(itertools.combinations(cat_cols, 2))
    for c1, c2 in combi:
        df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)
    
    return df


def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # list of categorical columns for feature engineering 
    cat_cols = [c for c in df.columns if c not in num_cols and c not in ("kfold", "income")]
    
    # add new features
    df = feature_engineering(df, cat_cols)
    
    # all columns are features except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        if col not in num_cols:

            # initialize LabelEncoder for each column
            lbl = preprocessing.LabelEncoder()

            # fit label encoder on all data
            lbl.fit(df[col])

            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.9280909861577779
Fold = 1, AUC = 0.9258767068720691
Fold = 2, AUC = 0.9256503192903375
Fold = 3, AUC = 0.923145479368932
Fold = 4, AUC = 0.9215265179190607


- We get a little improvement

In [63]:
# Increase max_depth to 7 
# lbl_xgb_num_feat.py
import itertools
import pandas as pd 
import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def feature_engineering(df, cat_cols):
    """
    This function is used for feature engineering
    :param df: the pandas dataframe with train/test data
    :param cat-cols: list if categorical columns
    :return: dataframe with new features
    """
    # this will create all 2-combinations of values in this list
    # for example:
    # list(itertools.combinations([1,2,3], 2)) will return
    # [(1,2), (1,3), (2,3)]
    combi = list(itertools.combinations(cat_cols, 2))
    for c1, c2 in combi:
        df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)
    
    return df


def run(fold):
    # load the full training data with folds
    df = pd.read_csv("adult_folds.csv")
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # list of categorical columns for feature engineering 
    cat_cols = [c for c in df.columns if c not in num_cols and c not in ("kfold", "income")]
    
    # add new features
    df = feature_engineering(df, cat_cols)
    
    # all columns are features except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        if col not in num_cols:

            # initialize LabelEncoder for each column
            lbl = preprocessing.LabelEncoder()

            # fit label encoder on all data
            lbl.fit(df[col])

            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # get training data
    x_train = df_train[features].values
    
    # get validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.9237966821775458
Fold = 1, AUC = 0.9263750175434251
Fold = 2, AUC = 0.9232032049773793
Fold = 3, AUC = 0.9226309152301697
Fold = 4, AUC = 0.9182498962873985


- We have improved the model

### Target Encoding + XGBoost

### `target_encoding.py`

In [64]:
# target_encoding.py

import copy
import pandas as pd 

from sklearn import metrics
from sklearn import preprocessing
import xgboost as xgb 

def mean_target_encoding(data):
    
    # make a copy of dataframe
    df = copy.deepcopy(data)
    
    # list of numerical columns
    num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
    ]
    
    # map targets to 0s and 1s
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # all columns are features except kfold & income columns
    features = [f for f in df.columns if f not in ("kfold", "income") and f not in num_cols]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        # do not encode the numerical columns
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # now its time to label encode the features
    for col in features:
        if col not in num_cols:

            # initialize LabelEncoder for each column
            lbl = preprocessing.LabelEncoder()

            # fit label encoder on all data
            lbl.fit(df[col])

            # transform all the data
            df.loc[:, col] = lbl.transform(df[col])
            
    # a list to store 5 validation features
    encoded_dfs = []
    
    # go over all folds
    for fold in range(5):
        # fetch training and validation data
        df_train = df[df.kfold != fold].reset_index(drop=True)
        df_valid = df[df.kfold == fold].reset_index(drop=True)
        
        # for all feature columns i.e. categorical columns
        for column in features:
            # create dict of category: mean target
            mapping_dict = dict(
            df_train.groupby(column)["income"].mean()
            )
            # column_enc is the new column we have with mean encoding
            df_valid.loc[:, column + "_enc"] = df_valid[column].map(mapping_dict)
        # append to our list of encoded validation dataframes
        encoded_dfs.append(df_valid)
    # create full data frame again and return
    encoded_df = pd.concat(encoded_dfs, axis=0)
    return encoded_df

def run(df, fold):
    # note that folds are same as before        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # all columns ar features except income and kfold columns
    features = [f for f in df.columns if f not in ("kfold", "income")]
    
    # scale training data
    x_train = df_train[features].values
    
    # scale validation data
    x_valid = df_valid[features].values
    
    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7
    )
    
    # fit model on training data 
    model.fit(x_train, df_train.income.values)
    
    # prediction on validation data
    # we need probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
        
    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    # read data
    df = pd.read_csv("adult_folds.csv")
    
    # create mean target encoded categories and munge data
    df = mean_target_encoding(df)
    
    # run training and validation for 5 folds
    for fold_ in range(5):
        run(df, fold_)    

Fold = 0, AUC = 0.9256790779483561
Fold = 1, AUC = 0.9260629767683775
Fold = 2, AUC = 0.9250695803199921
Fold = 3, AUC = 0.9229308304025493
Fold = 4, AUC = 0.9196802659170464


- We have improved again

### Entity Embeddings

### `entity_embeddings.py`

In [69]:
# entity_embeddings.py
import os 
import gc 
import joblib
import pandas as pd 
import numpy as np 

from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

def create_model(data, catcols):
    """
    This function returns a compiled tf.keras model for entity embeddings
    :param data: this is pandas dataframe
    :param catcols: list of categorical column names
    :return: compiled tf.keras model
    """
    
    # init list of inputs for embeddings
    inputs = []
    
    # init list of outputs for embeddings
    outputs = []
    
    # loop over all categorical columns
    for c in catcols:
        # find the number of unique values in the column
        num_unique_values = int(data[c].nunique())
        
        # simple dimension of embedding calculator
        # min size is half of the number of unique values
        # max size is 50. max size depends on the number of unique categories too. 50 is quite sufficient most of the times
        # but if you have millions of unique values, you might need a larger dimension
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        
        # simple keras input layer with size 1
        inp = layers.Input(shape=(1,))
        
        # add embedding layer to raw input
        # embedding size is always 1 more than unique values in input
        out = layers.Embedding(
            num_unique_values + 1, embed_dim, name=c
        )(inp)
        
        # 1D spatial dropout is the standard for embedding layers you can use it in NLP tasks too
        out = layers.SpatialDropout1D(0.3)(out)
        
        # reshape the input to the dimension of embedding this becomes our output layer for current feature
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        
        # add input to input list
        inputs.append(inp)
        
        # add output to output list
        outputs.append(out)
        
    # concatenate all output layers
    x = layers.Concatenate()(outputs)
        
    # add a batchnorm layer.
    # from here, everything is up to you
    # you can try different architectures
    # this is the architecture I like to use
    # if you have numerical features, you should add them or in concatenate layer
    x = layers.BatchNormalization()(x)
        
    # a bunch of dense layers with dropout.
    # start with 1 or two layers only
    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    # using softmax and treating it as a two class problem 
    # you can also use sigmoid, then you need to use only one output class
    y = layers.Dense(2, activation="softmax")(x)

    # create final model
    model = Model(inputs=inputs, outputs=y)

    # compile the model
    # we use adam and binary cross entropy
    # feel free to use something else and see how model behaves
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model 

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")
    
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # fill all NaN values with NONE
    # note that all columns are converted to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # encode all features with label encoder individually
    # in a live setting you need to save all label encoders
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values)
        
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
      
    # create tf.keras model
    model = create_model(df, features)
    
    # our features are lists of lists
    xtrain = [
        df_train[features].values[:, k] for k in range(len(features))
    ]
    xvalid = [
        df_valid[features].values[:, k] for k in range(len(features))
    ]
    
    # fetch target columns
    ytrain = df_train.target.values
    yvalid = df_valid.target.values
    
    # convert target columns to categories 
    # this is just binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)
    
    # fit the model
    model.fit(xtrain,
              ytrain_cat,
              validation_data=(xvalid, yvalid_cat),
              verbose=1,
              batch_size=1024,
              epochs=3
             )
    
    # generate validation predictions
    valid_preds = model.predict(xvalid)[:, 1]
    
    # print roc auc score
    print(metrics.roc_auc_score(yvalid, valid_preds))
    
    # clear session to free up some GPU memory
    K.clear_session()

if __name__ == "__main__":
    run(0)
    run(1)
    run(2)
    run(3)
    run(4)

Epoch 1/3
Epoch 2/3
Epoch 3/3
0.7869503440759662
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.7841058954388263
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.7856915935750822
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.7840908747841048
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.7842464506597762
