# Ordinal encoding

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# let's load the data set

data = pd.read_csv("credit_approval_uci.csv")

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=["target"], axis=1),  # predictors
    data["target"],  # target
    test_size=0.3,  # percentage of observations in test set
    random_state=0,  # seed to ensure reproducibility
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

# Ordinal encoding with pandas

In [4]:
# let's create a dictionary with the mappings of categories to numbers for A7

ordinal_mapping = {k: i for i, k in enumerate(X_train["A7"].unique(), 0)}

ordinal_mapping

{'v': 0,
 'ff': 1,
 'h': 2,
 'dd': 3,
 'z': 4,
 'bb': 5,
 'j': 6,
 'Missing': 7,
 'n': 8,
 'o': 9}

In [5]:
# replace the labels with the integers

X_train["A7"] = X_train["A7"].map(ordinal_mapping)
X_test["A7"] = X_test["A7"].map(ordinal_mapping)

In [6]:
# let's explore the result

X_train["A7"].head(10)

596    0
303    0
204    0
351    1
118    0
247    2
652    0
513    3
230    0
250    4
Name: A7, dtype: int64

# Ordinal encoding with Scikit-learn

In [7]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [8]:
# Let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=["target"], axis=1),  # predictors
    data["target"],  # target
    test_size=0.3,  # percentage of observations in test set
    random_state=0,  # seed to ensure reproducibility
)

In [9]:
# let's set up the encoder

enc = OrdinalEncoder()

In [10]:
# let's select the categorical variables

vars_categorical = X_train.select_dtypes(include="O").columns.to_list()

vars_categorical

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [11]:
# Capture remaining variables in a list

vars_remainder = X_train.select_dtypes(exclude="O").columns.to_list()

vars_remainder

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [12]:
# Indicate which variables to encode:

ct = ColumnTransformer(
    [("encoder", enc, vars_categorical)],
    remainder="passthrough",
)

# Create category to integer mapping:
ct.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OrdinalEncoder(),
                                 ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10',
                                  'A12', 'A13'])])

In [13]:
# we can see the unique categories learned by
# the encoder

ct.named_transformers_["encoder"].categories_

[array(['Missing', 'a', 'b'], dtype=object),
 array(['Missing', 'l', 'u', 'y'], dtype=object),
 array(['Missing', 'g', 'gg', 'p'], dtype=object),
 array(['Missing', 'aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm',
        'q', 'r', 'w', 'x'], dtype=object),
 array(['Missing', 'bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'],
       dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [14]:
# let's transform train and test sets

X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [15]:
# let's convert the arrays to pandas dataframes

X_train_enc = pd.DataFrame(X_train_enc, columns=vars_categorical + vars_remainder)

X_test_enc = pd.DataFrame(X_test_enc, columns=vars_categorical + vars_remainder)

In [16]:
# let's inspect the result

X_train_enc.head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A2,A3,A8,A11,A14,A15
0,1.0,2.0,1.0,2.0,8.0,1.0,1.0,1.0,0.0,46.08,3.0,2.375,8.0,396.0,4159.0
1,1.0,2.0,1.0,11.0,8.0,0.0,0.0,0.0,0.0,15.92,2.875,0.085,0.0,120.0,0.0
2,2.0,3.0,3.0,13.0,8.0,1.0,1.0,0.0,0.0,36.33,2.125,0.085,1.0,50.0,1187.0
3,2.0,3.0,3.0,6.0,3.0,0.0,0.0,0.0,0.0,22.17,0.585,0.0,0.0,100.0,0.0
4,2.0,2.0,1.0,10.0,8.0,1.0,1.0,1.0,0.0,57.83,7.04,14.0,6.0,360.0,1332.0


In [17]:
X_test_enc.head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13,A2,A3,A8,A11,A14,A15
0,1.0,2.0,1.0,11.0,8.0,1.0,1.0,1.0,0.0,45.83,10.5,5.0,7.0,0.0,0.0
1,2.0,2.0,1.0,14.0,4.0,1.0,1.0,1.0,0.0,64.08,20.0,17.5,9.0,0.0,1000.0
2,1.0,2.0,1.0,3.0,4.0,1.0,1.0,1.0,0.0,31.25,3.75,0.625,9.0,181.0,0.0
3,2.0,2.0,1.0,10.0,8.0,1.0,1.0,0.0,0.0,39.25,9.5,6.5,14.0,240.0,4607.0
4,1.0,2.0,1.0,8.0,5.0,0.0,0.0,1.0,0.0,26.17,2.0,0.0,0.0,276.0,1.0


# Ordinal encoding with Feature-engine

In [18]:
from feature_engine.encoding import OrdinalEncoder

In [19]:
# let's create the encoder

enc = OrdinalEncoder(
    encoding_method="arbitrary",
    variables=vars_categorical,
)

In [20]:
# let's fit the encoder to the train set

enc.fit(X_train)

OrdinalEncoder(encoding_method='arbitrary',
               variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12',
                          'A13'])

In [21]:
# in the encoder_dict_ we can observe the numbers
# assigned to each category for all the variables

enc.encoder_dict_

{'A1': {'a': 0, 'b': 1, 'Missing': 2},
 'A4': {'u': 0, 'y': 1, 'Missing': 2, 'l': 3},
 'A5': {'g': 0, 'p': 1, 'Missing': 2, 'gg': 3},
 'A6': {'c': 0,
  'q': 1,
  'w': 2,
  'ff': 3,
  'm': 4,
  'i': 5,
  'e': 6,
  'cc': 7,
  'x': 8,
  'd': 9,
  'k': 10,
  'j': 11,
  'Missing': 12,
  'aa': 13,
  'r': 14},
 'A7': {'v': 0,
  'ff': 1,
  'h': 2,
  'dd': 3,
  'z': 4,
  'bb': 5,
  'j': 6,
  'Missing': 7,
  'n': 8,
  'o': 9},
 'A9': {'t': 0, 'f': 1},
 'A10': {'t': 0, 'f': 1},
 'A12': {'t': 0, 'f': 1},
 'A13': {'g': 0, 's': 1, 'p': 2}}

In [22]:
# let's transform the train and test sets

X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [23]:
# let's explore the result

X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0,46.08,3.0,0,0,0,0,2.375,0,0,8,0,0,396.0,4159
303,0,15.92,2.875,0,0,1,0,0.085,1,1,0,1,0,120.0,0
204,1,36.33,2.125,1,1,2,0,0.085,0,0,1,1,0,50.0,1187
351,1,22.17,0.585,1,1,3,1,0.0,1,1,0,1,0,100.0,0
118,1,57.83,7.04,0,0,4,0,14.0,0,0,6,0,0,360.0,1332


In [24]:
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,0,45.83,10.5,0,0,1,0,5.0,0,0,7,0,0,0.0,0
586,1,64.08,20.0,0,0,8,2,17.5,0,0,9,0,0,0.0,1000
140,0,31.25,3.75,0,0,7,2,0.625,0,0,9,0,0,181.0,0
492,1,39.25,9.5,0,0,4,0,6.5,0,0,14,1,0,240.0,4607
350,0,26.17,2.0,0,0,11,6,0.0,1,1,0,0,0,276.0,1


# Ordinal encoding with Category Encoders

In [25]:
from category_encoders.ordinal import OrdinalEncoder

In [26]:
# set up the encoder

enc = OrdinalEncoder(cols=vars_categorical)

In [27]:
# let's fit the encoder to the train set

enc.fit(X_train)

OrdinalEncoder(cols=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'],
               mapping=[{'col': 'A1', 'data_type': dtype('O'),
                         'mapping': a          1
b          2
Missing    3
NaN       -2
dtype: int64},
                        {'col': 'A4', 'data_type': dtype('O'),
                         'mapping': u          1
y          2
Missing    3
l          4
NaN       -2
dtype: int64},
                        {'col': 'A5', 'data_type': dtype('O'),
                         'mapping': g          1
p          2
Missing    3
gg         4
NaN       -2
dtype: int64},
                        {'col': 'A6', 'data_type': dtype('O'),
                         'mappi...
                        {'col': 'A7', 'data_type': dtype('O'),
                         'mapping': v           1
ff          2
h           3
dd          4
z           5
bb          6
j           7
Missing     8
n           9
o          10
NaN        -2
dtype: int64},
                        {'col':

In [28]:
# the replacement values are stored in the attribute
# mapping

enc.mapping

[{'col': 'A1',
  'mapping': a          1
  b          2
  Missing    3
  NaN       -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'A4',
  'mapping': u          1
  y          2
  Missing    3
  l          4
  NaN       -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'A5',
  'mapping': g          1
  p          2
  Missing    3
  gg         4
  NaN       -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'A6',
  'mapping': c           1
  q           2
  w           3
  ff          4
  m           5
  i           6
  e           7
  cc          8
  x           9
  d          10
  k          11
  j          12
  Missing    13
  aa         14
  r          15
  NaN        -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'A7',
  'mapping': v           1
  ff          2
  h           3
  dd          4
  z           5
  bb          6
  j           7
  Missing     8
  n           9
  o          10
  NaN        -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col':

In [29]:
# let's transform the train and test sets

X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [30]:
# let's explore the result

X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,1,46.08,3.0,1,1,1,1,2.375,1,1,8,1,1,396.0,4159
303,1,15.92,2.875,1,1,2,1,0.085,2,2,0,2,1,120.0,0
204,2,36.33,2.125,2,2,3,1,0.085,1,1,1,2,1,50.0,1187
351,2,22.17,0.585,2,2,4,2,0.0,2,2,0,2,1,100.0,0
118,2,57.83,7.04,1,1,5,1,14.0,1,1,6,1,1,360.0,1332


In [31]:
# let's explore the result

X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


In [32]:
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,1,45.83,10.5,1,1,2,1,5.0,1,1,7,1,1,0.0,0
586,2,64.08,20.0,1,1,9,3,17.5,1,1,9,1,1,0.0,1000
140,1,31.25,3.75,1,1,8,3,0.625,1,1,9,1,1,181.0,0
492,2,39.25,9.5,1,1,5,1,6.5,1,1,14,2,1,240.0,4607
350,1,26.17,2.0,1,1,12,7,0.0,2,2,0,1,1,276.0,1
