In [1]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for integer encoding using sklearn
from sklearn.preprocessing import OrdinalEncoder

# for integer encoding using feature-engine
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [2]:
# load the credit approval dataset; the relative path means the CSV
# is looked up in the current working directory
data = pd.read_csv('creditApprovalUCI.csv')

# preview the first rows to see which columns hold labels and which hold numbers
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# make a list with the categorical variables: the columns that hold
# string labels rather than numbers, and therefore need to be encoded

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [4]:
# hold out 30% of the rows as a test set; A16 is the target
# and every other column is a predictor

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['A16']),
    data['A16'],
    test_size=0.3,
    random_state=0,  # fixed seed so that the split is reproducible
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

## Ordinal encoding with pandas

In [5]:
# build the category -> integer mapping for A7 from the train set:
# the distinct labels are numbered 0, 1, 2, ... in the order returned by unique()

ordinal_mapping = {
    category: code
    for code, category in enumerate(X_train['A7'].unique())
}

ordinal_mapping

{'v': 0,
 'ff': 1,
 'h': 2,
 'dd': 3,
 'z': 4,
 'bb': 5,
 'j': 6,
 'Missing': 7,
 'n': 8,
 'o': 9}

In [6]:
# replace the labels with the integers; the mapping learned from the
# train set is applied to both the train and the test set
# NOTE(review): Series.map returns NaN for values that are not keys of the
# mapping, so a label present only in the test set would become NaN - worth checking

X_train['A7'] = X_train['A7'].map(ordinal_mapping)
X_test['A7'] = X_test['A7'].map(ordinal_mapping)

In [7]:
# let's explore the result: A7 now holds integer codes instead of labels

X_train['A7'].head(10)

596    0
303    0
204    0
351    1
118    0
247    2
652    0
513    3
230    0
250    4
Name: A7, dtype: int64

### Putting the code in a function

In [8]:
# we can turn the previous commands into 2 functions

def find_category_mappings(df, variable):
    """Return a dict that maps each distinct value of ``df[variable]`` to an integer.

    The integers start at 0 and follow the order in which ``unique()``
    returns the values of the column.
    """
    categories = df[variable].unique()
    return dict(zip(categories, range(len(categories))))


def integer_encode(train, test, variable, ordinal_mapping):
    """Replace the labels of ``variable`` with integers in both dataframes.

    Both dataframes are modified in place; nothing is returned.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Dataframes that contain the column ``variable``.
    variable : str
        Name of the column to encode.
    ordinal_mapping : dict
        Mapping of label -> integer, e.g. as returned by
        ``find_category_mappings``.
    """
    # operate on the dataframes that were passed in, not on the notebook
    # globals X_train / X_test, so the function works with any pair of frames
    train[variable] = train[variable].map(ordinal_mapping)
    test[variable] = test[variable].map(ordinal_mapping)

In [9]:
# encode the remaining categorical variables one at a time:
# learn the mapping on the train set, then apply it to both sets

for variable in vars_categorical:

    # A7 was already encoded above, so there is nothing left to do for it
    if variable == 'A7':
        continue

    mappings = find_category_mappings(X_train, variable)
    integer_encode(X_train, X_test, variable, mappings)

In [10]:
# let's inspect the results: every categorical column now holds integers

X_train[vars_categorical].head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
596,0,0,0,0,0,0,0,0,0
303,0,0,0,1,0,1,1,1,0
204,1,1,1,2,0,0,0,1,0
351,1,1,1,3,1,1,1,1,0
118,1,0,0,4,0,0,0,0,0


## Ordinal encoding with Scikit-learn

In [11]:
# split again from the raw data: the previous section overwrote the
# categorical columns of X_train and X_test with integers, and this
# section needs the original labels back

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['A16']),
    data['A16'],
    test_size=0.3,
    random_state=0,  # same seed as before, so that the split is reproducible
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [12]:
# let's create an encoder, leaving all of its parameters at their defaults

le = OrdinalEncoder()

In [13]:
# let's fit the encoder to the train set, restricted to the categorical variables
le.fit(X_train[vars_categorical])

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [14]:
# we can see the unique classes found for each variable, one array per
# variable in the order of vars_categorical; judging by the outputs in this
# notebook, the position of a label in its array is the number it is encoded as

le.categories_

[array(['Missing', 'a', 'b'], dtype=object),
 array(['Missing', 'l', 'u', 'y'], dtype=object),
 array(['Missing', 'g', 'gg', 'p'], dtype=object),
 array(['Missing', 'aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm',
        'q', 'r', 'w', 'x'], dtype=object),
 array(['Missing', 'bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'],
       dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [15]:
# let's transform train and test sets with the encoder fitted on the train set;
# the results are stored under new names, so X_train and X_test are left untouched

X_train_enc = le.transform(X_train[vars_categorical])
X_test_enc = le.transform(X_test[vars_categorical])

In [16]:
# let's inspect the result; the encoder output is wrapped in a DataFrame
# to put the column names back
# NOTE: the rows are numbered from 0 in this display, the index labels
# of X_train (596, 303, ...) are not carried over


pd.DataFrame(X_train_enc, columns=vars_categorical).head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,1.0,2.0,1.0,2.0,8.0,1.0,1.0,1.0,0.0
1,1.0,2.0,1.0,11.0,8.0,0.0,0.0,0.0,0.0
2,2.0,3.0,3.0,13.0,8.0,1.0,1.0,0.0,0.0
3,2.0,3.0,3.0,6.0,3.0,0.0,0.0,0.0,0.0
4,2.0,2.0,1.0,10.0,8.0,1.0,1.0,1.0,0.0


## Ordinal encoding with Feature-Engine

In [17]:
# let's create the encoder for the categorical variables
# NOTE(review): with encoding_method='arbitrary' the numbers appear to follow
# the order in which the labels show up in the train set (see the
# encoder_dict_ output further down) - confirm against the feature-engine docs

ordinal_enc = OrdinalCategoricalEncoder(
    encoding_method='arbitrary',
    variables=vars_categorical)

In [18]:
# let's fit the encoder to the train set; the whole dataframe is passed,
# the variables to encode were given when the encoder was created

ordinal_enc.fit(X_train)

OrdinalCategoricalEncoder(encoding_method='arbitrary',
                          variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10',
                                     'A12', 'A13'])

In [19]:
# let's inspect which variables the encoder will encode;
# this is the list that was passed in when the encoder was created

ordinal_enc.variables

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [20]:
# in the encoder dict we can observe the numbers assigned to each
# category for all the indicated variables: one label -> integer
# dictionary per variable

ordinal_enc.encoder_dict_

{'A1': {'a': 0, 'b': 1, 'Missing': 2},
 'A4': {'u': 0, 'y': 1, 'Missing': 2, 'l': 3},
 'A5': {'g': 0, 'p': 1, 'Missing': 2, 'gg': 3},
 'A6': {'c': 0,
  'q': 1,
  'w': 2,
  'ff': 3,
  'm': 4,
  'i': 5,
  'e': 6,
  'cc': 7,
  'x': 8,
  'd': 9,
  'k': 10,
  'j': 11,
  'Missing': 12,
  'aa': 13,
  'r': 14},
 'A7': {'v': 0,
  'ff': 1,
  'h': 2,
  'dd': 3,
  'z': 4,
  'bb': 5,
  'j': 6,
  'Missing': 7,
  'n': 8,
  'o': 9},
 'A9': {'t': 0, 'f': 1},
 'A10': {'t': 0, 'f': 1},
 'A12': {'t': 0, 'f': 1},
 'A13': {'g': 0, 's': 1, 'p': 2}}

In [21]:
# let's transform the train and test sets
# NOTE(review): X_train and X_test are overwritten with their encoded
# versions, so running this cell a second time would pass already-encoded
# data to the encoder - re-run the split cell first if you need to repeat it

X_train = ordinal_enc.transform(X_train)
X_test = ordinal_enc.transform(X_test)



In [22]:
# let's explore the result: the full dataframe is displayed, with the
# categorical variables holding integers and the index labels of X_train kept

X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0,46.08,3.0,0,0,0,0,2.375,0,0,8,0,0,396.0,4159
303,0,15.92,2.875,0,0,1,0,0.085,1,1,0,1,0,120.0,0
204,1,36.33,2.125,1,1,2,0,0.085,0,0,1,1,0,50.0,1187
351,1,22.17,0.585,1,1,3,1,0.0,1,1,0,1,0,100.0,0
118,1,57.83,7.04,0,0,4,0,14.0,0,0,6,0,0,360.0,1332
