In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read only the six feature columns X1..X6 from the CSV
ds = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [3]:
# Preview the first five rows of the loaded frame
ds.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# Report how many distinct categories each column holds.
# dropna=False keeps the count identical to len(unique()): a missing value,
# if present, is counted as its own category.
for column in ds.columns:
    print(f"Column {column} has {ds[column].nunique(dropna=False)} categories")

Column X1 has 27 categories
Column X2 has 44 categories
Column X3 has 7 categories
Column X4 has 4 categories
Column X5 has 29 categories
Column X6 has 12 categories


In [6]:
# Build a label -> integer lookup for X2; each label gets the position at
# which unique() yields it, starting from 0
ordinal_mapping = {
    label: code for code, label in enumerate(ds['X2'].unique())
}

# display the mapping for inspection
ordinal_mapping

{'at': 0,
 'av': 1,
 'n': 2,
 'e': 3,
 'as': 4,
 'aq': 5,
 'r': 6,
 'ai': 7,
 'ak': 8,
 'm': 9,
 'a': 10,
 'k': 11,
 'ae': 12,
 's': 13,
 'f': 14,
 'd': 15,
 'ag': 16,
 'ay': 17,
 'ac': 18,
 'ap': 19,
 'g': 20,
 'i': 21,
 'aw': 22,
 'y': 23,
 'b': 24,
 'ao': 25,
 'al': 26,
 'h': 27,
 'x': 28,
 'au': 29,
 't': 30,
 'an': 31,
 'z': 32,
 'ah': 33,
 'p': 34,
 'am': 35,
 'j': 36,
 'q': 37,
 'af': 38,
 'l': 39,
 'aa': 40,
 'c': 41,
 'o': 42,
 'ar': 43}

In [7]:
# Replace each X2 label with its integer code from `ordinal_mapping`.
# NOTE: this overwrites ds['X2'] in place, so the cell is not safe to re-run:
# Series.map converts values that are missing from the dict to NaN, and after
# the first run the column holds integers rather than the mapping's string keys.

ds['X2'] = ds['X2'].map(ordinal_mapping)


In [9]:
# Inspect the last ten values of X2 after the encoding

ds['X2'].tail(n=10)

4199    17
4200     3
4201    30
4202    13
4203     4
4204     4
4205    30
4206     6
4207     3
4208    12
Name: X2, dtype: int64

In [11]:
# Let's create some functions so that we can apply the encoding to all the features

def func_find_category_mappings(ds, col_name):
    """Build a category -> integer lookup for one column.

    Parameters
    ----------
    ds : DataFrame-like object supporting ``ds[col_name].unique()``.
    col_name : label of the column whose categories are enumerated.

    Returns
    -------
    dict
        Maps every distinct value of ``ds[col_name]`` to an integer code.
        Codes start at 0 and follow the order in which ``unique()`` yields
        the values.
    """
    categories = ds[col_name].unique()
    return dict(zip(categories, range(len(categories))))


def func_integer_encode(ds, col_name, ordinal_mapping):
    """Replace the values of one column with their mapped codes, in place.

    Parameters
    ----------
    ds : DataFrame whose column ``col_name`` is overwritten.
    col_name : label of the column to encode.
    ordinal_mapping : dict of category -> code, e.g. the output of
        ``func_find_category_mappings``.

    Returns
    -------
    None
        ``ds`` itself is modified; nothing is returned.
    """
    # Series.map yields NaN for any value that has no key in the mapping
    encoded_column = ds[col_name].map(ordinal_mapping)
    ds[col_name] = encoded_column

In [12]:
# Integer-encode every column, this time keeping each column's mapping.
# The mappings are collected in `category_mappings`
# (column name -> {category: code}) so the encoding can be inspected,
# inverted, or re-applied to new data later; previously each mapping was
# overwritten by the next loop iteration and lost.
# NOTE: ds is overwritten in place. A column that is already integer-encoded
# when this cell runs (X2 here, or any column on a re-run) gets a mapping
# built from its integer codes, not from the original labels.
category_mappings = {}
for column in ds.columns:
    mapping = func_find_category_mappings(ds, column)
    category_mappings[column] = mapping
    func_integer_encode(ds, column, mapping)

In [13]:
# Show the last ten rows of the frame after integer encoding
ds.tail(n=10)

Unnamed: 0,X1,X2,X3,X4,X5,X6
4199,7,17,2,0,28,1
4200,7,3,2,0,28,4
4201,0,30,4,0,28,3
4202,5,13,2,0,28,2
4203,6,4,2,0,28,5
4204,6,4,2,0,28,2
4205,14,30,4,0,28,3
4206,0,6,0,0,28,6
4207,4,3,3,0,28,1
4208,4,12,2,0,28,6


### Advantages
- quick
- returns pandas dataframe

### Limitations of pandas:
- It does not preserve information from the train data to propagate to the test data.
- We need to capture and save the mappings one by one, manually, if we are planning to use them in production.