# Category Encoding

In [59]:
# pip install category_encoders

In [94]:
import numpy as np
import pandas as pd

In [95]:
import category_encoders as ce

In [96]:
df = pd.DataFrame(
    {
        'Size': np.random.choice(['XS', 'S', 'M', 'L', 'XL', 'XXL'], 10),
        'Brand': np.random.choice(['Nike', 'Puma', 'Adidas', 'Le Coq', 'Reebok'], 10),
    }
)

In [97]:
df.sample(n=4)

Unnamed: 0,Size,Brand
8,XXL,Adidas
2,XS,Le Coq
3,XL,Reebok
7,L,Le Coq


In [98]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.5)

In [99]:
df_train['Size'].value_counts().index.values.tolist(), df_test['Size'].value_counts().index.values.tolist(), 

(['XL', 'XXL', 'L', 'M'], ['S', 'XL', 'XS', 'XXL'])

In [100]:
df_train['Brand'].value_counts().index.values.tolist(), df_test['Brand'].value_counts().index.values.tolist(), 

(['Puma', 'Le Coq', 'Nike', 'Reebok'], ['Reebok', 'Le Coq', 'Adidas', 'Nike'])

In [107]:
from category_encoders.one_hot import OneHotEncoder
encoder = OneHotEncoder(use_cat_names=True, handle_unknown='return_nan')
x_train = encoder.fit_transform(df_train)
x_test = encoder.transform(df_test)

In [108]:
df_train

Unnamed: 0,Size,Brand
3,XL,Reebok
7,L,Le Coq
0,XL,Puma
6,XXL,Puma
9,M,Nike


In [109]:
df_test

Unnamed: 0,Size,Brand
4,XL,Nike
5,S,Reebok
1,S,Reebok
2,XS,Le Coq
8,XXL,Adidas


In [110]:
x_test

Unnamed: 0,Size_XL,Size_L,Size_XXL,Size_M,Brand_Reebok,Brand_Le Coq,Brand_Puma,Brand_Nike
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,,,,,1.0,0.0,0.0,0.0
1,,,,,1.0,0.0,0.0,0.0
2,,,,,0.0,1.0,0.0,0.0
8,0.0,0.0,1.0,0.0,,,,


In [103]:

encoder = ce.one_hot.OneHotEncoder(use_cat_names=True, handle_unknown='value')
x_train = encoder.fit_transform(df_train)
x_test = encoder.transform(df_test)

In [58]:
x_test.sample(n=3)

Unnamed: 0,Size_XS,Size_M,Size_L,Size_S,Size_XL,Brand_Le Coq,Brand_Puma,Brand_Reebok
3,0,1,0,0,0,0,0,1
6,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,1,0


In [113]:
df_size = df[['Size']].copy()
df_size_train, df_size_test = train_test_split(df_size, test_size=0.5)

from category_encoders.ordinal import OrdinalEncoder

oencoder = OrdinalEncoder(
    mapping= [
        {
            'col': 'Size', 
            'mapping': {'XS': 1, 'S': 2, 'M': 3, 'L': 4, 'XL': 5}
        }
    ]
)

df_train.loc[:, 'Size [Ordinal Encoded]'] = oencoder.fit_transform(df_train['Size'])['Size'].values
df_test.loc[:, 'Size [Ordinal Encoded]'] = oencoder.transform(df_test['Size'])['Size'].values

In [114]:
df_test.head(5)

Unnamed: 0,Size,Brand,Size [Ordinal Encoded]
4,XL,Nike,5.0
5,S,Reebok,2.0
1,S,Reebok,2.0
2,XS,Le Coq,1.0
8,XXL,Adidas,-1.0
