# Approaching Categorical Variables

In [1]:
import pandas as pd 
df = pd.read_csv("cat_train.csv")
df.sample(10)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
372143,372143,1.0,0.0,0.0,F,N,Blue,Triangle,Dog,Canada,...,a12077f95,1.0,Master,Freezing,o,Q,pB,6.0,5.0,0
14439,14439,0.0,1.0,1.0,F,Y,Red,Polygon,Hamster,Finland,...,fdf9297f6,2.0,Contributor,Warm,m,T,Nh,5.0,11.0,0
68841,68841,0.0,0.0,0.0,T,N,Blue,Square,Hamster,China,...,30db15878,3.0,Expert,Warm,m,,ly,6.0,6.0,0
314450,314450,0.0,0.0,0.0,F,N,Blue,Polygon,Lion,China,...,e74d93bc8,2.0,Novice,Freezing,o,,xG,3.0,7.0,1
345795,345795,0.0,0.0,0.0,F,N,Blue,Circle,Hamster,Russia,...,ede9f9f4d,3.0,Novice,Hot,b,,mP,1.0,12.0,0
141243,141243,0.0,0.0,0.0,F,N,Red,Triangle,Lion,India,...,2a27c8fde,3.0,Contributor,Hot,b,T,HK,6.0,12.0,0
153986,153986,0.0,0.0,0.0,T,Y,Red,Circle,Hamster,Costa Rica,...,580e3caf1,1.0,Grandmaster,Warm,e,E,AI,3.0,3.0,0
66024,66024,1.0,0.0,0.0,F,Y,Red,Trapezoid,Hamster,Russia,...,de54260a1,1.0,Master,Warm,m,A,mD,5.0,8.0,0
568940,568940,1.0,0.0,,T,N,Red,Circle,Hamster,Finland,...,5cb7f1328,1.0,Contributor,Freezing,n,A,gL,4.0,12.0,0
361962,361962,0.0,0.0,0.0,T,N,Red,Square,Axolotl,Russia,...,7d9542273,2.0,Expert,Freezing,b,B,oU,6.0,12.0,0


In [2]:
# View unique values in column ord_2
df['ord_2'].unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [3]:
# View the value counts before mapping
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

### Most important ways to handle categorical variables
1. Label Encoding
2. Sparse Matrices
3. One Hot Encoding

### Label Encoding

In [4]:
# Integer code for every ord_2 category: the position in the list is the code.
# The order follows the value_counts() ranking shown above (most frequent
# first), not a coldest-to-hottest scale.
mapping = {
    label: code
    for code, label in enumerate(
        ["Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot"]
    )
}

In [5]:
# Label Encoding - convert categories to numbers
# values that are not keys of `mapping` (including NaN) come out as NaN
# plain column assignment replaces the column, so it takes the numeric dtype
# of the mapped values instead of being written into the old object column
df["ord_2"] = df["ord_2"].map(mapping)

# View the value counts after mapping
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [6]:
# Label Encoding using scikit-learn
import pandas as pd 
from sklearn import preprocessing

# read the data
df = pd.read_csv("cat_train.csv")

# LabelEncoder of scikit-learn does not handle NaN values, so missing ord_2
# entries are first turned into their own placeholder category
df["ord_2"] = df["ord_2"].fillna("NONE")

# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# fit the encoder and transform the ord_2 column in one step
# NOTE: fit_transform is fine here because nothing else has to be encoded
# later; when separate data must be encoded with the same codes, call fit()
# once and reuse transform() on that data.
# Plain column assignment replaces the column so it ends up with the integer
# dtype of the encoded values.
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,3,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,6,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,2,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,4,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,1,h,C,OZ,5.0,12.0,0


### Sparse Matrices

In [7]:
# size of a numpy array
import numpy as np

# 3x3 binary feature matrix used as the dense baseline
example = np.array([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# memory taken by the array's data buffer, in bytes
print(example.nbytes)

36


In [8]:
# convert numpy array to sparse matrix
import numpy as np
from scipy import sparse

# the same 3x3 binary feature matrix as in the dense example
example = np.array([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# CSR (compressed sparse row) representation of that matrix
sparse_example = sparse.csr_matrix(example)

# bytes used by the stored values only (.data); the index arrays are not included
print(sparse_example.data.nbytes)

16


In [9]:
# total size of the CSR matrix: stored values + row pointers + column indices
print(
    sum(
        part.nbytes
        for part in (
            sparse_example.data,
            sparse_example.indptr,
            sparse_example.indices,
        )
    )
)

48


In [10]:
# Comparison of sizes of dense and sparse arrays
import numpy as np 
from scipy import sparse 

# number of rows
n_rows = 10000

# number of columns
n_cols = 100000

# create random binary matrix with only 5% values as 1s
# NOTE: this allocates n_rows * n_cols = 1e9 elements as a dense array, which
# needs several GB of RAM (see the printed dense size); no seed is set, so the
# sparse sizes printed below vary slightly from run to run
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))

# print size in bytes
print(f"Size of dense array: {example.nbytes}")

# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)

# print size of the stored values only (.data); index arrays are not included
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full footprint = stored values + row pointers (indptr) + column indices
full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes + 
    sparse_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 4000000000
Size of sparse array: 200001588
Full size of sparse array: 400043180


### One Hot Encoding

In [11]:
# Comparison of sizes of dense and binarized arrays
import numpy as np 
from scipy import sparse 

# one-hot style matrix: exactly one 1 in every row
example = np.array([
    [0, 0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0],
])
print(f"Size of dense array: {example.nbytes}")

# CSR version of the same matrix; .data holds only the stored values
sparse_example = sparse.csr_matrix(example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full CSR footprint = stored values + row pointers + column indices
full_size = sum(
    buf.nbytes
    for buf in (sparse_example.data, sparse_example.indptr, sparse_example.indices)
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
Size of sparse array: 12
Full size of sparse array: 40


In [12]:
# Use scikit-learn's OneHotEncoder to transform a much larger feature array with 1000 categories
import numpy as np 
from sklearn import preprocessing

# create random 1D array of ints drawn from 0..999, i.e. at most 1000 different categories
example = np.random.randint(1000, size=1000000)

# initialize OneHotEncoder from scikit-learn
# keep sparse = False to get dense array
# NOTE(review): newer scikit-learn releases spell this argument `sparse_output`
# -- confirm against the installed version
ohe = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with dense one hot encoder
# reshape(-1, 1) turns the 1D array into a single-column 2D array
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# keep sparse = True to get sparse array
ohe = preprocessing.OneHotEncoder(sparse=True)

# fit and transform data with sparse one hot encoder
# (ohe_example is rebound here: from now on it is the sparse result)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes of the stored values only (.data)
print(f"Size of sparse array: {ohe_example.data.nbytes}")

# full footprint = stored values + row pointers (indptr) + column indices
full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004


### Other ways to handle categorical variables:

#### - Converting categorical variables to numerical variables

In [13]:
# shape of dataframe where 'ord_2' column has value 'Boiling Hot'
import pandas as pd 
df = pd.read_csv("cat_train.csv")
df[df.ord_2 == 'Boiling Hot'].shape

(84790, 25)

In [14]:
# calculate above with pandas groupby function
df.groupby("ord_2")["id"].count()

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
Warm           124239
Name: id, dtype: int64

In [15]:
# per-row count of the row's ord_2 category: transform("count") returns one
# value per original row instead of one value per group
# note: the result is only displayed here; assign it to a column to actually
# replace ord_2 with its count values
df.groupby("ord_2")["id"].transform("count")

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [16]:
# we can also group by multiple columns and their counts
df.groupby(
    [
        "ord_1",
        "ord_2"
    ]
)["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,Warm,22774
6,Expert,Boiling Hot,19477
7,Expert,Cold,22956
8,Expert,Freezing,33249
9,Expert,Hot,15792


#### - Create new categorical features from the existing ones

In [17]:
# concatenating names of ord_1 and ord_2 columns using an underscore
# note: astype(str) renders a missing value as the text "nan", so rows with a
# missing ord_1 or ord_2 get labels such as "nan_Freezing"
df["new_feature"] = (
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
)
df.new_feature

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

In [18]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,new_feature
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,Contributor_Hot
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,Grandmaster_Warm
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,3.0,,Freezing,n,P,eN,5.0,9.0,0,nan_Freezing
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,Novice_Lava Hot
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,Grandmaster_Cold


In [19]:
# we can also combine three or four or even more features
df["new_feature"] = (
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
    + "_"
    + df.ord_3.astype(str)
)
df.new_feature

0                 Contributor_Hot_c
1                Grandmaster_Warm_e
2                    nan_Freezing_n
3                 Novice_Lava Hot_a
4                Grandmaster_Cold_h
                    ...            
599995            Novice_Freezing_a
599996         Novice_Boiling Hot_n
599997       Contributor_Freezing_n
599998                Master_Warm_m
599999    Contributor_Boiling Hot_b
Name: new_feature, Length: 600000, dtype: object

### Handling NaN values

#### 1. Drop them

In [20]:
# Load the data
df = pd.read_csv("cat_train.csv")

# View the number of NaN values
df.isna().sum()

id            0
bin_0     17894
bin_1     18003
bin_2     17930
bin_3     18014
bin_4     18047
nom_0     18252
nom_1     18156
nom_2     18035
nom_3     18121
nom_4     18035
nom_5     17778
nom_6     18131
nom_7     18003
nom_8     17755
nom_9     18073
ord_0     18288
ord_1     18041
ord_2     18075
ord_3     17916
ord_4     17930
ord_5     17713
day       17952
month     17988
target        0
dtype: int64

In [21]:
# drop NaN values
df.dropna(inplace=True)

# View number of NaN values
df.isna().sum() 

id        0
bin_0     0
bin_1     0
bin_2     0
bin_3     0
bin_4     0
nom_0     0
nom_1     0
nom_2     0
nom_3     0
nom_4     0
nom_5     0
nom_6     0
nom_7     0
nom_8     0
nom_9     0
ord_0     0
ord_1     0
ord_2     0
ord_3     0
ord_4     0
ord_5     0
day       0
month     0
target    0
dtype: int64

#### 2. Treat NaN values as a completely new category

In [22]:
# Load the data
df = pd.read_csv("cat_train.csv")

# View the value counts of ord_2 column
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [23]:
# fill NaNs with the placeholder string "NONE" so they are counted as their own category
# (the filled column is only displayed; df itself is not modified)
df.ord_2.fillna("NONE").value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [24]:
# Label Encoding the train and test data
import pandas as pd 
from sklearn import preprocessing

# read training data
train = pd.read_csv("cat_train.csv")

# read test data
test = pd.read_csv("cat_test.csv")

# the test data has no target column, so add a placeholder value (-1) that
# lets us tell train and test rows apart after concatenation
test["target"] = -1

# stack train and test so every category seen in either set gets a code
data = pd.concat([train, test]).reset_index(drop=True)

# encode every column except id and target
features = [col for col in train.columns if col not in ("id", "target")]

for feat in features:
    # a fresh encoder per feature: codes are only meaningful within one column
    lbl_enc = preprocessing.LabelEncoder()

    # treat every column as categorical: missing values become the "NONE"
    # category and ints/floats are converted to strings
    temp_col = data[feat].fillna("NONE").astype(str).values

    # fit_transform is fine here: all rows that will ever be encoded are present.
    # Assign with data[feat] rather than data.loc[:, feat] so the column is
    # replaced and takes the integer dtype of the encoded values instead of
    # the codes being written into the old float/object column.
    data[feat] = lbl_enc.fit_transform(temp_col)

# split back into training and test data using the placeholder target
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [32]:
# View train data
train.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
43034,43034,0,0,0,0,0,0,0,3,3,...,307,0,5,1,11,16,3,1,4,0
431060,431060,0,1,0,2,2,0,2,3,4,...,1599,2,2,1,2,7,143,5,9,0
37982,37982,0,0,0,0,0,3,5,5,4,...,1660,1,1,6,14,6,169,5,10,0
276099,276099,0,0,1,0,0,0,5,3,6,...,2123,2,1,4,8,17,90,6,7,1
571987,571987,0,0,1,0,0,0,2,2,6,...,1460,2,1,6,14,5,62,4,9,0


In [33]:
# View test data
test.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
48489,648489,0,0,0,2,0,0,4,0,4,...,1400,0,2,6,2,21,41,0,2,-1
60082,660082,0,0,1,0,2,3,5,0,4,...,1439,2,1,1,15,2,21,6,5,-1
58524,658524,0,0,0,0,2,0,6,2,0,...,258,1,1,1,6,5,46,4,9,-1
10190,610190,0,1,1,2,0,3,5,0,4,...,1811,2,5,3,8,13,149,4,9,-1
228538,828538,0,0,0,0,0,3,5,2,2,...,407,2,1,3,0,16,115,2,10,-1


In [34]:
# View categories in ord_4 column
df.ord_4.fillna("NONE").value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

- We treat a category as `rare` if it occurs fewer than 2000 times in the column

In [37]:
# missing values become their own "NONE" category
df["ord_4"] = df["ord_4"].fillna("NONE")

# look up how often each row's category occurs and move every category seen
# fewer than 2000 times into a single "RARE" bucket
df.loc[df["ord_4"].map(df["ord_4"].value_counts()) < 2000, "ord_4"] = "RARE"

df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

## Build a model

### `create_folds.py`

In [38]:
# create_folds.py
import pandas as pd 
from sklearn import model_selection

if __name__ == "__main__":

    # read training data
    df = pd.read_csv("cat_train.csv")

    # kfold holds the fold index of every row; -1 means "not assigned yet"
    df["kfold"] = -1

    # shuffle the rows; the fixed seed makes the fold assignment (and every
    # score computed from the saved file) reproducible across runs
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # labels used for stratification
    y = df.target.values

    # stratified splitter: the target distribution is kept close to equal
    # in every fold
    kf = model_selection.StratifiedKFold(n_splits=5)

    # only the validation indices are needed: those rows get the fold number
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_idx, "kfold"] = fold

    # save the new csv with kfold column
    df.to_csv("cat_train_folds.csv", index=False)

In [40]:
df.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
209590,317929,0.0,0.0,1.0,F,N,Red,Trapezoid,Axolotl,India,...,1.0,Grandmaster,Cold,n,O,AI,1.0,12.0,1,1
503362,369652,0.0,0.0,1.0,F,Y,Red,Circle,Dog,Costa Rica,...,1.0,Novice,,i,K,ne,7.0,3.0,0,4
192757,262650,0.0,0.0,0.0,F,Y,Blue,Trapezoid,Lion,Finland,...,1.0,Grandmaster,Cold,m,K,pT,1.0,12.0,0,1
369159,594242,,0.0,1.0,T,Y,Red,Trapezoid,Hamster,Finland,...,3.0,Novice,Lava Hot,h,Q,oU,,12.0,1,3
419410,439367,0.0,0.0,0.0,T,N,Blue,Star,Axolotl,China,...,2.0,Novice,Warm,n,R,pT,6.0,2.0,0,3


In [41]:
# Check our new folds csv to see the number of samples per fold
import pandas as pd 

df = pd.read_csv("cat_train_folds.csv")

df.kfold.value_counts()

4    120000
3    120000
2    120000
1    120000
0    120000
Name: kfold, dtype: int64

In [42]:
# Check target distribution per fold
df[df.kfold==0].target.value_counts()

0    97536
1    22464
Name: target, dtype: int64

In [43]:
df[df.kfold==1].target.value_counts()

0    97536
1    22464
Name: target, dtype: int64

In [44]:
df[df.kfold==2].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

In [45]:
df[df.kfold==3].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

In [46]:
df[df.kfold==4].target.value_counts()

0    97535
1    22465
Name: target, dtype: int64

### *Simple Model*: One Hot Encoding + Logistic Regression

### `ohe_logres.py`

In [49]:
# ohe_logres.py

import pandas as pd 

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

def run(fold):
    """Train and validate one-hot encoding + logistic regression on one fold.

    Reads "cat_train_folds.csv", holds out the rows whose ``kfold`` equals
    ``fold`` for validation, trains on all other rows and prints the
    validation ROC AUC.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]

    # every feature is treated as a category: missing values become the
    # "NONE" category and all values are converted to strings.
    # fillna has to run before astype(str): once NaN has been turned into the
    # string "nan" there is nothing left for fillna to replace.
    # df[col] = ... replaces the column instead of writing strings into the
    # existing (possibly float) column.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features so that categories which only
    # occur in the validation rows are known to the encoder as well
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data)

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Logistic Regression model with default settings
    # NOTE(review): the recorded output shows the solver stopping at its
    # iteration limit with these defaults -- consider raising max_iter
    model = linear_model.LogisticRegression()

    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)

    # prediction on validation data
    # AUC needs scores, not hard labels, so take the probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(auc)

if __name__ == "__main__":
    # run function for fold = 0
    # we can just replace this number and run this for any fold
    run(0)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.786737590932164


In [56]:
# Change the code to run it for all folds
# ohe_logres.py

import warnings
warnings.filterwarnings("ignore")

import pandas as pd 

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

def run(fold):
    """Train and validate one-hot encoding + logistic regression on one fold.

    Reads "cat_train_folds.csv", holds out the rows whose ``kfold`` equals
    ``fold`` for validation, trains on all other rows and prints the fold
    number together with the validation ROC AUC.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]

    # every feature is treated as a category: missing values become the
    # "NONE" category and all values are converted to strings.
    # fillna has to run before astype(str): once NaN has been turned into the
    # string "nan" there is nothing left for fillna to replace.
    # df[col] = ... replaces the column instead of writing strings into the
    # existing (possibly float) column.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features so that categories which only
    # occur in the validation rows are known to the encoder as well
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data)

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()

    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)

    # prediction on validation data
    # AUC needs scores, not hard labels, so take the probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.786737590932164
Fold = 1, AUC = 0.7864134324601486
Fold = 2, AUC = 0.7891949604718245
Fold = 3, AUC = 0.7865868586086606
Fold = 4, AUC = 0.7841822012086013


### *Another Model*: Label Encoding + Random Forest

### `lbl_rf.py`

In [57]:
# lbl_rf.py

import pandas as pd 

from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Train and validate label encoding + random forest on one fold.

    Reads "cat_train_folds.csv", label-encodes every feature column, holds
    out the rows whose ``kfold`` equals ``fold`` for validation, trains on
    all other rows and prints the fold number with the validation ROC AUC.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]

    # every feature is treated as a category: missing values become the
    # "NONE" category and all values are converted to strings.
    # fillna has to run before astype(str): once NaN has been turned into the
    # string "nan" there is nothing left for fillna to replace.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # now its time to label encode the features
    for col in features:
        # a separate LabelEncoder per column: codes are per-column
        lbl = preprocessing.LabelEncoder()

        # fit on all rows (training + validation) and encode them in one step;
        # df[col] = ... replaces the string column, so the result is a proper
        # integer column rather than integers stored in an object column
        df[col] = lbl.fit_transform(df[col])

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # get training data
    x_train = df_train[features].values

    # get validation data
    x_valid = df_valid[features].values

    # initialize Random Forest model (n_jobs=-1: use all available cores)
    model = ensemble.RandomForestClassifier(n_jobs=-1)

    # fit model on training data
    model.fit(x_train, df_train.target.values)

    # prediction on validation data
    # AUC needs scores, not hard labels, so take the probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7163084239682881
Fold = 1, AUC = 0.7151674406595026
Fold = 2, AUC = 0.7194365594431105
Fold = 3, AUC = 0.7156918033989204
Fold = 4, AUC = 0.7129110294100113


### *Another model*: One Hot Encoding + fitting TruncatedSVD on sparse matrix with training + validation data

### `ohe_svd_rf.py`

In [59]:
# ohe_svd_rf.py

import pandas as pd 

from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Train and validate one-hot encoding + TruncatedSVD + random forest.

    Reads "cat_train_folds.csv", one-hot encodes the features, reduces the
    sparse matrix to 120 SVD components, holds out the rows whose ``kfold``
    equals ``fold`` for validation, trains on all other rows and prints the
    fold number with the validation ROC AUC.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]

    # every feature is treated as a category: missing values become the
    # "NONE" category and all values are converted to strings.
    # fillna has to run before astype(str): once NaN has been turned into the
    # string "nan" there is nothing left for fillna to replace.
    # df[col] = ... replaces the column instead of writing strings into the
    # existing (possibly float) column.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features so that categories which only
    # occur in the validation rows are known to the encoder as well
    full_data = pd.concat(
        [df_train[features], df_valid[features]], axis=0
    )
    ohe.fit(full_data)

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Truncated SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)

    # fit SVD on the stacked sparse training + validation matrices
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    # project the sparse training data onto the SVD components
    x_train = svd.transform(x_train)

    # project the sparse validation data onto the SVD components
    x_valid = svd.transform(x_valid)

    # initialize Random Forest model (n_jobs=-1: use all available cores)
    model = ensemble.RandomForestClassifier(n_jobs=-1)

    # fit model on the SVD-reduced training data
    model.fit(x_train, df_train.target.values)

    # prediction on validation data
    # AUC needs scores, not hard labels, so take the probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.707345889057882
Fold = 1, AUC = 0.7056061509164883
Fold = 2, AUC = 0.7093872850701919


KeyboardInterrupt: 

### *Another Model*: Label Encoding + XGBoost

### `lbl_xgb.py`

In [60]:
# lbl_xgb.py

import pandas as pd 

import xgboost as xgb

from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    """Train and validate label encoding + XGBoost on one fold.

    Reads "cat_train_folds.csv", label-encodes every feature column, holds
    out the rows whose ``kfold`` equals ``fold`` for validation, trains on
    all other rows and prints the fold number with the validation ROC AUC.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to use as the validation set.
    """
    # load the full training data with folds
    df = pd.read_csv("cat_train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [x for x in df.columns if x not in ("id", "target", "kfold")]

    # every feature is treated as a category: missing values become the
    # "NONE" category and all values are converted to strings.
    # fillna has to run before astype(str): once NaN has been turned into the
    # string "nan" there is nothing left for fillna to replace.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # now its time to label encode the features
    for col in features:
        # a separate LabelEncoder per column: codes are per-column
        lbl = preprocessing.LabelEncoder()

        # fit on all rows (training + validation) and encode them in one step;
        # df[col] = ... replaces the string column, so the result is a proper
        # integer column rather than integers stored in an object column
        df[col] = lbl.fit_transform(df[col])

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # get training data
    x_train = df_train[features].values

    # get validation data
    x_valid = df_valid[features].values

    # initialize XGBoost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
    )

    # fit model on training data
    model.fit(x_train, df_train.target.values)

    # prediction on validation data
    # AUC needs scores, not hard labels, so take the probability of class 1
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc_auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)    

Fold = 0, AUC = 0.7627633684951625
Fold = 1, AUC = 0.7625637056582746
Fold = 2, AUC = 0.7636508512623847
Fold = 3, AUC = 0.7625421350284056
Fold = 4, AUC = 0.7606497734250544
