# One-hot encoding of frequent categories

In [1]:
import pandas as pd
import numpy as np

# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder

In [2]:
# Read the credit approval data set from the CSV file and preview
# its first rows

csv_path = "credit_approval_uci.csv"
data = pd.read_csv(csv_path)

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Hold out 30% of the observations as a test set. Every column except
# "target" is a predictor; the fixed seed makes the split reproducible.

predictors = data.drop(columns=["target"])
target = data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

## One-hot encoding of top categories with pandas

In [4]:
# let's inspect the unique categories of A6
# (looked up in the train set only)

X_train["A6"].unique()

array(['c', 'q', 'w', 'ff', 'm', 'i', 'e', 'cc', 'x', 'd', 'k', 'j',
       'Missing', 'aa', 'r'], dtype=object)

In [5]:
# Count how often each A6 category occurs in the train set, order the
# counts from largest to smallest and keep the first 5

a6_counts = X_train["A6"].value_counts()
a6_counts.sort_values(ascending=False).head(5)

c     93
q     56
w     48
i     41
ff    38
Name: A6, dtype: int64

In [6]:
# Capture the labels of the 5 most frequent A6 categories in a list,
# ordered from most to least frequent

a6_ranked = X_train["A6"].value_counts().sort_values(ascending=False)
top_5 = list(a6_ranked.head(5).index)

top_5

['c', 'q', 'w', 'i', 'ff']

In [7]:
# Add one binary column per category in top_5 to both the train and the
# test set: 1 where A6 equals that category, 0 otherwise. Observations
# whose category is not in top_5 get 0 in all of the new columns.
# The columns are added to X_train and X_test in place.

for frame in (X_train, X_test):
    for label in top_5:
        frame[f"A6_{label}"] = np.where(frame["A6"] == label, 1, 0)

In [8]:
# Show A6 side by side with its binary columns for the first 10 rows
# of the train set

a6_columns = ["A6"] + [f"A6_{label}" for label in top_5]
print(X_train[a6_columns].head(10))

     A6  A6_c  A6_q  A6_w  A6_i  A6_ff
596   c     1     0     0     0      0
303   q     0     1     0     0      0
204   w     0     0     1     0      0
351  ff     0     0     0     0      1
118   m     0     0     0     0      0
247   q     0     1     0     0      0
652   i     0     0     0     1      0
513   e     0     0     0     0      0
230  cc     0     0     0     0      0
250   e     0     0     0     0      0


## One-hot encoding of top categories with Feature-engine

In [9]:
# Split again with the same settings. The pandas section above added
# A6_* columns to X_train and X_test in place, so we rebuild both sets
# from the original data before using Feature-engine.

predictors = data.drop(columns=["target"])
target = data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

In [10]:
# Set up the encoder to create binary variables for the 5 most
# frequent categories of A6 and of A7
ohe_enc = OneHotEncoder(top_categories=5, variables=["A6", "A7"])

# Find the most frequent categories using the train set
ohe_enc.fit(X_train)

OneHotEncoder(top_categories=5, variables=['A6', 'A7'])

In [11]:
# the encoder stores the variables it will encode
# (here, the two names passed in through `variables`)

ohe_enc.variables_

['A6', 'A7']

In [12]:
# the encoder stores the most frequent labels per variable,
# as a dictionary of variable name -> list of labels

ohe_enc.encoder_dict_

{'A6': ['c', 'q', 'w', 'i', 'ff'], 'A7': ['v', 'h', 'ff', 'bb', 'z']}

In [13]:
# Apply the fitted encoder to the train set and then to the test set

X_train_enc, X_test_enc = ohe_enc.transform(X_train), ohe_enc.transform(X_test)

In [14]:
# let's inspect the result: the encoded train set shows one binary
# column per frequent category (A6_c ... A7_z), and the original
# A6 and A7 columns no longer appear

X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A8,A9,A10,A11,A12,...,A6_c,A6_q,A6_w,A6_i,A6_ff,A7_v,A7_h,A7_ff,A7_bb,A7_z
596,a,46.08,3.0,u,g,2.375,t,t,8,t,...,1,0,0,0,0,1,0,0,0,0
303,a,15.92,2.875,u,g,0.085,f,f,0,f,...,0,1,0,0,0,1,0,0,0,0
204,b,36.33,2.125,y,p,0.085,t,t,1,f,...,0,0,1,0,0,1,0,0,0,0
351,b,22.17,0.585,y,p,0.0,f,f,0,f,...,0,0,0,0,1,0,0,1,0,0
118,b,57.83,7.04,u,g,14.0,t,t,6,t,...,0,0,0,0,0,1,0,0,0,0


In [15]:
# the test set gets the same binary columns, built from the
# categories found in the train set

X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A8,A9,A10,A11,A12,...,A6_c,A6_q,A6_w,A6_i,A6_ff,A7_v,A7_h,A7_ff,A7_bb,A7_z
14,a,45.83,10.5,u,g,5.0,t,t,7,t,...,0,1,0,0,0,1,0,0,0,0
586,b,64.08,20.0,u,g,17.5,t,t,9,t,...,0,0,0,0,0,0,1,0,0,0
140,a,31.25,3.75,u,g,0.625,t,t,9,t,...,0,0,0,0,0,0,1,0,0,0
492,b,39.25,9.5,u,g,6.5,t,t,14,f,...,0,0,0,0,0,1,0,0,0,0
350,a,26.17,2.0,u,g,0.0,f,f,0,t,...,0,0,0,0,0,0,0,0,0,0
