# Approaching Categorical Variables

In [5]:
import pandas as pd 
# Load the training data and preview ten randomly drawn rows.
df = pd.read_csv(filepath_or_buffer="cat_train.csv")
df.sample(n=10)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
475886,475886,0.0,0.0,0.0,T,Y,,Triangle,Hamster,India,Piano,1d0f532e9,b77c9d835,8d857a0a1,46a005ddf,14d2855ae,2.0,Expert,Warm,f,K,be,1.0,9.0,0
235188,235188,0.0,0.0,0.0,T,Y,Blue,Trapezoid,Lion,India,Bassoon,572d48b94,309404a26,bc9cc2a94,0f94eb834,7432851db,,Expert,Warm,b,U,,5.0,1.0,0
269607,269607,0.0,0.0,1.0,F,N,Red,Polygon,Hamster,Russia,Theremin,e3100c248,336f3d094,f14f5e4a5,2061350b9,,3.0,Expert,Lava Hot,n,Y,nn,7.0,11.0,1
568723,568723,0.0,0.0,1.0,F,Y,Blue,Polygon,Axolotl,India,Bassoon,7662ca2a1,3d7adf24d,cd9feb5c6,abce980f5,4fba0e75d,3.0,Grandmaster,Cold,f,M,Mg,1.0,3.0,0
200487,200487,0.0,1.0,0.0,F,Y,Red,Star,Axolotl,Costa Rica,Piano,39879cc3e,23dbf9179,1bd74ace9,63dde2492,9cb07e3d3,1.0,Expert,Boiling Hot,m,,fe,3.0,12.0,0
475376,475376,0.0,,1.0,F,N,Red,Triangle,Lion,Finland,Theremin,0cd1f95ca,a4bd85b39,41901460a,dcccc6c19,1dc6039d3,3.0,Master,Warm,e,V,dh,5.0,5.0,0
457936,457936,0.0,0.0,0.0,T,N,Blue,Polygon,Hamster,Costa Rica,Theremin,b383d9e61,68d938090,e61c53286,ad5be5be4,a2980be3d,2.0,Novice,Cold,b,T,DN,4.0,5.0,0
237648,237648,0.0,0.0,0.0,T,Y,Red,Circle,Hamster,Canada,Theremin,af940d20b,f6c95db72,dd4d69481,ca9ad1d4b,efb3831f1,3.0,Master,Cold,e,A,iS,3.0,8.0,0
572275,572275,0.0,1.0,1.0,F,N,Blue,Triangle,Axolotl,Finland,Bassoon,ed905bc0f,980aa6ede,7e1c4cff1,,9658e20fe,2.0,Novice,Freezing,d,P,AU,3.0,12.0,0
577330,577330,0.0,0.0,0.0,,N,Blue,Polygon,Hamster,Russia,Theremin,c678329c4,d3ce68737,6bc753dd6,690411ac0,bb9d121e7,2.0,Contributor,Lava Hot,n,P,cy,6.0,8.0,0


In [6]:
# Distinct values present in the ord_2 column (missing values show up as nan).
df.ord_2.unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [7]:
# Frequency of each ord_2 category before any encoding is applied
df["ord_2"].value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

### Most important ways to handle categorical variables
1. Label Encoding
2. Sparse Matrices
3. One Hot Encoding

### Label Encoding

In [8]:
# Integer code assigned to every ord_2 category, numbered in the order listed.
# NOTE(review): this order matches the descending value counts shown above,
# not a temperature ranking - presumably ord_2 is meant to be ordinal, so
# confirm whether the codes should follow Freezing < Cold < Warm < ... instead.
mapping = {
    level: code
    for code, level in enumerate(
        ("Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot")
    )
}

In [9]:
# Label Encoding - convert categories to numbers.
# Assign the whole column with df[...] instead of df.loc[:, ...]: on recent
# pandas the .loc form writes into the existing object-dtype column in place,
# so the codes would stay stored as `object` rather than a numeric dtype.
# Values that are not keys of `mapping` (including missing values) become NaN,
# so run this cell only once per freshly loaded df - a second run would map
# the already-encoded numbers to NaN.
df["ord_2"] = df["ord_2"].map(mapping)

# View the value counts after mapping
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [10]:
# Label Encoding using scikit-learn
import pandas as pd 
from sklearn import preprocessing

# read the data
df = pd.read_csv("cat_train.csv")

# fill the NaN values in ord_2 column (because LabelEncoder of scikit-learn does not handle NaN values);
# missing values therefore become their own "NONE" category
df["ord_2"] = df["ord_2"].fillna("NONE")

# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# fit label encoder and transform values on ord_2 column.
# P.S.: fit_transform on the full column is a shortcut for this demo; when
# separate train/test data are involved, fit the encoder first and then call
# transform on each split so that both share the same codes.
# Whole-column assignment (df[...] = ...) is used instead of df.loc[:, ...] so
# the column takes the integer dtype of the encoded values; on recent pandas
# the .loc form writes into the existing object column and keeps it `object`.
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

df.head()



Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,3,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,6,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,2,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,4,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,1,h,C,OZ,5.0,12.0,0


### Sparse Matrices

In [11]:
# Memory footprint of a small dense numpy array
import numpy as np

# the example feature matrix: three rows, three binary features
example = np.asarray([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# bytes taken by the array's elements (element count x bytes per element)
print(example.nbytes)

36


In [12]:
# Store the same example matrix in sparse (CSR) form
import numpy as np
from scipy import sparse

# the example feature matrix: three rows, three binary features
example = np.asarray([[0, 0, 1], [1, 0, 0], [1, 0, 1]])

# CSR keeps only the non-zero entries plus the index arrays needed to locate them
sparse_example = sparse.csr_matrix(example)

# bytes used by the stored non-zero values alone (the .data array);
# the index arrays are not included in this number
print(sparse_example.data.nbytes)

16


In [14]:
# Total footprint of the CSR matrix: the stored values (.data) plus its two
# index arrays (.indptr and .indices)
print(
    sum(
        part.nbytes
        for part in (
            sparse_example.data,
            sparse_example.indptr,
            sparse_example.indices,
        )
    )
)

48


In [15]:
# Dense vs. sparse memory footprint for a large, mostly-zero binary matrix
import numpy as np
from scipy import sparse

# matrix dimensions (rows, then columns)
n_rows = 10000
n_cols = 100000

# random 0/1 matrix in which every entry is 1 with probability 0.05
# (no seed is set, so the sparse sizes vary slightly from run to run)
example = np.random.binomial(n=1, p=0.05, size=(n_rows, n_cols))

# bytes taken by the dense array
print(f"Size of dense array: {example.nbytes}")

# the same matrix in CSR form
sparse_example = sparse.csr_matrix(example)

# bytes used by the stored non-zero values only (the .data array)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full CSR footprint: stored values plus both index arrays
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 4000000000
Size of sparse array: 200025748
Full size of sparse array: 400091500


### One Hot Encoding

In [16]:
# Dense vs. sparse footprint of a small one-hot (binarized) matrix
import numpy as np
from scipy import sparse

# one row per sample, exactly one 1 per row
example = np.asarray(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
    ]
)

# bytes taken by the dense array
print(f"Size of dense array: {example.nbytes}")

# the same matrix in CSR form
sparse_example = sparse.csr_matrix(example)

# bytes used by the stored non-zero values only (the .data array)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# full CSR footprint: stored values plus both index arrays
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
Size of sparse array: 12
Full size of sparse array: 40


In [17]:
# Use scikit-learn's OneHotEncoder to transform a much larger feature array with 1000 categories
import inspect

import numpy as np 
from sklearn import preprocessing

# create random 1D array with 1000 different categories (ints drawn from 0..999)
example = np.random.randint(1000, size=1000000)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases dropped the old name, so pick whichever keyword the
# installed version accepts instead of hard-coding one of them
ohe_params = inspect.signature(preprocessing.OneHotEncoder).parameters
sparse_kwarg = "sparse_output" if "sparse_output" in ohe_params else "sparse"

# initialize OneHotEncoder from scikit-learn
# sparse output switched off to get a dense array
# (the recorded output below shows this dense result taking 8 GB)
ohe = preprocessing.OneHotEncoder(**{sparse_kwarg: False})

# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# sparse output switched on to get a sparse matrix
ohe = preprocessing.OneHotEncoder(**{sparse_kwarg: True})

# fit and transform data with sparse one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes of the stored non-zero values only (the .data array)
print(f"Size of sparse array: {ohe_example.data.nbytes}")

# full footprint: stored values plus both index arrays
full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004


### Other ways to handle categorical variables:

#### - Converting categorical variables to numerical variables

In [23]:
# (rows, columns) of the subset whose 'ord_2' value is 'Boiling Hot'
import pandas as pd
df = pd.read_csv(filepath_or_buffer="cat_train.csv")
df.loc[df["ord_2"] == "Boiling Hot"].shape

(84790, 25)

In [24]:
# the same row count, computed for every ord_2 category at once with groupby
df.groupby(by="ord_2")["id"].count()

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
Warm           124239
Name: id, dtype: int64

In [25]:
# per-row count of the row's ord_2 category (pandas transform keeps one value per row);
# this is the numerical stand-in for ord_2, but the result is only displayed here -
# it is not assigned back to df, so the ord_2 column itself stays unchanged
df.groupby(by="ord_2")["id"].transform("count")

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [27]:
# counts for every (ord_1, ord_2) combination, returned as a flat table
# with the count stored in a column named "count"
df.groupby(by=["ord_1", "ord_2"])["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,Warm,22774
6,Expert,Boiling Hot,19477
7,Expert,Cold,22956
8,Expert,Freezing,33249
9,Expert,Hot,15792


#### - Create new categorical features from the existing ones

In [30]:
# build a combined categorical feature: "<ord_1>_<ord_2>"
# (both columns are cast to str first, so a missing value shows up as the text "nan")
df["new_feature"] = df.ord_1.astype(str).str.cat(df.ord_2.astype(str), sep="_")
df.new_feature

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

In [31]:
# first five rows of df
df.head(n=5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,new_feature
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,Contributor_Hot
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,Grandmaster_Warm
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0,nan_Freezing
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,Novice_Lava Hot
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,Grandmaster_Cold


In [32]:
# the same idea extends to three (or more) columns: "<ord_1>_<ord_2>_<ord_3>"
# (each column is cast to str first, so a missing value shows up as the text "nan")
df["new_feature"] = df.ord_1.astype(str).str.cat(
    [df.ord_2.astype(str), df.ord_3.astype(str)],
    sep="_",
)
df.new_feature

0                 Contributor_Hot_c
1                Grandmaster_Warm_e
2                    nan_Freezing_n
3                 Novice_Lava Hot_a
4                Grandmaster_Cold_h
                    ...            
599995            Novice_Freezing_a
599996         Novice_Boiling Hot_n
599997       Contributor_Freezing_n
599998                Master_Warm_m
599999    Contributor_Boiling Hot_b
Name: new_feature, Length: 600000, dtype: object