In [29]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[K     |████████████████████████████████| 276 kB 27.3 MB/s 
Installing collected packages: feature-engine
Successfully installed feature-engine-1.4.0


In [30]:
import pandas as pd
import numpy as np
from feature_engine.encoding import OrdinalEncoder

In [31]:
# Load only the six columns X1..X6 from the CSV; every other column is skipped.
ds = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [32]:
# Preview the first five rows of the loaded columns.
ds.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [35]:
# Report how many distinct values each column holds.
# dropna=False keeps a missing value counted as its own entry.
for column in ds.columns:
    n_categories = ds[column].nunique(dropna=False)
    print(f"Column {column} has {n_categories} categories")

Column X1 has 27 categories
Column X2 has 44 categories
Column X3 has 7 categories
Column X4 has 4 categories
Column X5 has 29 categories
Column X6 has 12 categories


In [37]:
# Column labels as a plain Python list (the same list later passed to the encoder).
list(ds.columns)

['X1', 'X2', 'X3', 'X4', 'X5', 'X6']

In [38]:
# Configure an ordinal encoder over every loaded column, using the
# 'arbitrary' encoding method, then learn the category -> integer mappings.
ordinal_enc = OrdinalEncoder(
    variables=list(ds.columns),
    encoding_method='arbitrary',
)

# Fitting is the last expression so the fitted encoder's repr is displayed.
ordinal_enc.fit(X=ds)

OrdinalEncoder(encoding_method='arbitrary',
               variables=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

In [39]:
# Inspect the fitted mappings: a dict keyed by column name, where each value
# maps a category label to the integer code assigned to it.
ordinal_enc.encoder_dict_

{'X1': {'v': 0,
  't': 1,
  'w': 2,
  'b': 3,
  'r': 4,
  'l': 5,
  's': 6,
  'aa': 7,
  'c': 8,
  'a': 9,
  'e': 10,
  'h': 11,
  'z': 12,
  'j': 13,
  'o': 14,
  'u': 15,
  'p': 16,
  'n': 17,
  'i': 18,
  'y': 19,
  'd': 20,
  'f': 21,
  'm': 22,
  'k': 23,
  'g': 24,
  'q': 25,
  'ab': 26},
 'X2': {'at': 0,
  'av': 1,
  'n': 2,
  'e': 3,
  'as': 4,
  'aq': 5,
  'r': 6,
  'ai': 7,
  'ak': 8,
  'm': 9,
  'a': 10,
  'k': 11,
  'ae': 12,
  's': 13,
  'f': 14,
  'd': 15,
  'ag': 16,
  'ay': 17,
  'ac': 18,
  'ap': 19,
  'g': 20,
  'i': 21,
  'aw': 22,
  'y': 23,
  'b': 24,
  'ao': 25,
  'al': 26,
  'h': 27,
  'x': 28,
  'au': 29,
  't': 30,
  'an': 31,
  'z': 32,
  'ah': 33,
  'p': 34,
  'am': 35,
  'j': 36,
  'q': 37,
  'af': 38,
  'l': 39,
  'aa': 40,
  'c': 41,
  'o': 42,
  'ar': 43},
 'X3': {'a': 0, 'e': 1, 'c': 2, 'f': 3, 'd': 4, 'b': 5, 'g': 6},
 'X4': {'d': 0, 'b': 1, 'c': 2, 'a': 3},
 'X5': {'u': 0,
  'y': 1,
  'x': 2,
  'h': 3,
  'g': 4,
  'f': 5,
  'j': 6,
  'i': 7,
  'd': 8,


In [40]:
# Let's list the features that the fitted encoder will transform.
ordinal_enc.variables_

['X1', 'X2', 'X3', 'X4', 'X5', 'X6']

In [41]:
# Replace each category label with its learned integer code.
# NOTE(review): `ds` is rebound to the encoded frame, so the raw labels are no
# longer available under this name and re-running this cell would feed
# already-encoded data back into the encoder. Reload `ds` before re-running.
ds = ordinal_enc.transform(X=ds)

# Preview the first five encoded rows.
ds.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,0,0,0,0,0,0
1,1,1,1,0,1,1
2,2,2,2,0,2,0
3,1,2,3,0,2,1
4,0,2,3,0,3,2


### Note:
If the `variables` argument is left as `None` (i.e. not provided), the encoder will automatically identify all categorical variables.

The encoder will not encode numerical variables. So if some of your numerical variables are in fact categories, you will need to re-cast them as object before using the encoder.

If there is a category in the test set for which the encoder doesn't have a number to assign (i.e. the category was not seen in the train set), the encoder will raise an error.