In [1]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[K     |████████████████████████████████| 276 kB 7.6 MB/s 
Installing collected packages: feature-engine
Successfully installed feature-engine-1.4.0


In [13]:
import pandas as pd
import numpy as np
from feature_engine.encoding import OneHotEncoder


In [14]:
# Load only the six categorical columns used in this demo
ds = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [15]:
# Preview the first rows of the loaded columns
ds.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [16]:
# Report the cardinality of every column: high-cardinality columns are the
# reason we later restrict one-hot encoding to the top categories.
for col_name in ds.columns:
    # dropna=False counts a missing value as its own category, like len(unique())
    n_categories = ds[col_name].nunique(dropna=False)
    print(f"Column {col_name} has {n_categories} categories")

Column X1 has 27 categories
Column X2 has 44 categories
Column X3 has 7 categories
Column X4 has 4 categories
Column X5 has 29 categories
Column X6 has 12 categories


In [17]:
# Set up feature-engine's one-hot encoder and learn the categories from `ds`
ohe_enc = OneHotEncoder(
    # number of categories per variable that get a dummy column;
    # raise or lower this value to keep more or fewer categories
    top_categories=10,
    # columns to encode
    variables=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
    drop_last=False,
)

# keep fit() as the final expression so the cell displays the fitted encoder
ohe_enc.fit(ds)

OneHotEncoder(top_categories=10, variables=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

In [18]:
# Let's inspect the fitted mapping: for each encoded variable, the list of
# selected categories (at most `top_categories` of them per variable).

ohe_enc.encoder_dict_

{'X1': ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'],
 'X2': ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'],
 'X3': ['c', 'f', 'a', 'd', 'g', 'e', 'b'],
 'X4': ['d', 'a', 'b', 'c'],
 'X5': ['w', 'v', 'q', 'r', 's', 'd', 'n', 'p', 'm', 'i'],
 'X6': ['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']}

In [19]:
# List the variables that the fitted encoder will transform
ohe_enc.variables_

['X1', 'X2', 'X3', 'X4', 'X5', 'X6']

In [20]:
# Store the encoded frame under a new name instead of overwriting `ds`.
# The transform replaces X1..X6 with dummy columns, so re-assigning to `ds`
# would make this cell non-repeatable (a second run would feed already-encoded
# data to the encoder) and would leave the earlier `ds.head()` output stale.
ds_encoded = ohe_enc.transform(ds)

# let's explore the result
ds_encoded.head()

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Advantages
- quick
- creates the same number of features in train and test set

### Note

If the `variables` argument is left as `None`, the encoder will automatically identify all categorical variables.

The encoder will not encode numerical variables. If some of your numerical variables are in fact categories, you will need to re-cast them as `object` (e.g. `df[col] = df[col].astype('object')`) before using the encoder.