# Category Encoding

In [26]:
# Dependency: this notebook needs the third-party `category_encoders` package.
# If it is not installed in the kernel's environment, run:
#   %pip install category_encoders

In [27]:
import numpy as np
import pandas as pd

In [28]:
import category_encoders as ce

In [29]:
# Toy dataset: ten rows, each with a randomly drawn clothing size and brand.
# NOTE(review): no seed is set in the visible cells, so the rows presumably
# differ on every run -- confirm whether a fixed seed is wanted.
df = pd.DataFrame(
    dict(
        Size=np.random.choice(['XS', 'S', 'M', 'L', 'XL', 'XXL'], size=10),
        Brand=np.random.choice(['Nike', 'Puma', 'Adidas', 'Le Coq', 'Reebok'], size=10),
    )
)

In [30]:
# Peek at four randomly chosen rows of the generated frame.
df.sample(4)

Unnamed: 0,Size,Brand
5,L,Puma
7,XS,Adidas
3,XL,Puma
0,L,Reebok


# Splitting the Dataset

In [31]:
from sklearn.model_selection import train_test_split
# Split the frame into two halves (test_size=0.5). No random_state is passed,
# so unless NumPy's global RNG is seeded elsewhere the rows that land in each
# half change from run to run.
df_train, df_test = train_test_split(df, test_size=0.5)

In [32]:
# Distinct sizes present in each half (most frequent first), as a (train, test) pair.
tuple(part['Size'].value_counts().index.tolist() for part in (df_train, df_test))

(['XS', 'S', 'XXL', 'L'], ['XL', 'XS', 'S', 'L'])

In [33]:
# Distinct brands present in each half (most frequent first), as a (train, test) pair.
tuple(part['Brand'].value_counts().index.tolist() for part in (df_train, df_test))

(['Puma', 'Nike', 'Adidas'], ['Adidas', 'Le Coq', 'Puma', 'Reebok'])

# One-Hot Encoding

In [34]:
from category_encoders.one_hot import OneHotEncoder

# Learn the one-hot columns from the training half only, then apply the same
# columns to the test half. Indicator columns are named after the category
# values (use_cat_names=True); handle_unknown='return_nan' selects how values
# that were not seen during fitting are reported.
encoder = OneHotEncoder(
    handle_unknown='return_nan',
    use_cat_names=True,
)
x_train, x_test = encoder.fit_transform(df_train), encoder.transform(df_test)

In [35]:
# Show the training half, i.e. the rows the encoder above was fitted on.
df_train

Unnamed: 0,Size,Brand
8,XXL,Adidas
4,XS,Nike
5,L,Puma
2,XS,Nike
1,S,Puma


In [36]:
# Show the held-out half, for comparison with the training rows.
df_test

Unnamed: 0,Size,Brand
6,S,Le Coq
9,XL,Adidas
3,XL,Puma
0,L,Reebok
7,XS,Adidas


In [37]:
# Inspect the encoded test rows. The encoder was built with
# handle_unknown='return_nan': where a test value did not occur in the training
# half, every indicator derived from that column is NaN.
x_test

Unnamed: 0,Size_XXL,Size_XS,Size_L,Size_S,Brand_Adidas,Brand_Nike,Brand_Puma
6,0.0,0.0,0.0,1.0,,,
9,,,,,1.0,0.0,0.0
3,,,,,0.0,0.0,1.0
0,0.0,0.0,1.0,0.0,,,
7,0.0,1.0,0.0,0.0,1.0,0.0,0.0


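Setting the unknown values to zero: with `handle_unknown='value'`, a category that was not seen during fitting is encoded as zeros in every indicator of its column instead of NaN.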

In [38]:
# Same fit-on-train / transform-test flow as before, but with
# handle_unknown='value'. This rebinds `encoder`, `x_train` and `x_test`.
encoder = ce.one_hot.OneHotEncoder(
    handle_unknown='value',
    use_cat_names=True,
)
x_train, x_test = encoder.fit_transform(df_train), encoder.transform(df_test)

In [39]:
# Spot-check three random rows of the re-encoded test set.
x_test.sample(3)

Unnamed: 0,Size_XXL,Size_XS,Size_L,Size_S,Brand_Adidas,Brand_Nike,Brand_Puma
0,0,0,1,0,0,0,0
7,0,1,0,0,1,0,0
6,0,0,0,1,0,0,0


# Ordinal Encoding

In [40]:
from category_encoders.ordinal import OrdinalEncoder

# Size-only copy of the data and its own 50/50 split.
# NOTE(review): these three names are not used by the encoding below in the
# visible cells; they are kept in case later cells rely on them.
df_size = df[['Size']].copy()
df_size_train, df_size_test = train_test_split(df_size, test_size=0.5)

# Explicit rank for every size the dataset can contain, smallest to largest.
# 'XXL' must be listed: a value missing from a user-supplied mapping is treated
# as unknown by the encoder rather than ranked above 'XL'.
oencoder = OrdinalEncoder(
    mapping=[
        {
            'col': 'Size',
            'mapping': {'XS': 1, 'S': 2, 'M': 3, 'L': 4, 'XL': 5, 'XXL': 6},
        }
    ]
)

# Attach the encoded sizes as an extra column. `assign` builds a new frame and
# rebinds the name, instead of writing into the objects returned by
# train_test_split (which can raise SettingWithCopyWarning on pandas versions
# without copy-on-write). Re-running the cell simply overwrites the column.
df_train = df_train.assign(
    **{'Size [Ordinal Encoded]': oencoder.fit_transform(df_train[['Size']])['Size'].values}
)
df_test = df_test.assign(
    **{'Size [Ordinal Encoded]': oencoder.transform(df_test[['Size']])['Size'].values}
)

In [41]:
# First five rows of the test frame, now carrying the ordinal-encoded size column.
df_test.iloc[:5]

Unnamed: 0,Size,Brand,Size [Ordinal Encoded]
6,S,Le Coq,2
9,XL,Adidas,5
3,XL,Puma,5
0,L,Reebok,4
7,XS,Adidas,1
