# Rare category grouping

In [1]:
import numpy as np
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for encoding using feature-engine
from feature_engine.encoding import RareLabelEncoder

In [2]:
# load the credit approval data set from a CSV file in the working
# directory and preview its first rows

data = pd.read_csv("credit_approval_uci.csv")

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# Hold out 30% of the rows as a test set. The predictors are every column
# except "target"; random_state pins the shuffle so the split is the same
# on every run.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

## Grouping categories with pandas

In [4]:
# let's inspect the relative frequency of each label in A7, i.e. the
# fraction of train-set rows that carry it (normalize=True)

freqs = X_train["A7"].value_counts(normalize=True)

freqs

v          0.573499
h          0.209110
ff         0.084886
bb         0.080745
z          0.014493
dd         0.010352
j          0.010352
Missing    0.008282
n          0.006211
o          0.002070
Name: A7, dtype: float64

In [5]:
# A7 labels found in more than 5% of the train-set rows

frequent_cat = freqs[freqs > 0.05].index.tolist()

frequent_cat

['v', 'h', 'ff', 'bb']

In [6]:
# A7 labels present in the train set that are not in frequent_cat,
# listed in order of first appearance

X_train.loc[~X_train["A7"].isin(frequent_cat), "A7"].unique().tolist()

['dd', 'z', 'j', 'Missing', 'n', 'o']

In [7]:
# Group the rare labels of A7 under a single new category called "Rare".
# frequent_cat was derived from the train set only and is reused for the
# test set, so both sets are grouped with the same list of labels.
# Series.where keeps a value where the condition holds and substitutes
# "Rare" elsewhere; it stays inside pandas instead of round-tripping the
# column through a NumPy array, and one loop replaces the two copy-pasted
# statements.

for subset in (X_train, X_test):
    subset["A7"] = subset["A7"].where(subset["A7"].isin(frequent_cat), "Rare")

In [8]:
# let's inspect the results: relative frequency of each A7 label
# in the train set after the grouping

X_train["A7"].value_counts(normalize=True)

v       0.573499
h       0.209110
ff      0.084886
bb      0.080745
Rare    0.051760
Name: A7, dtype: float64

## Grouping categories with Feature-engine

In [9]:
# Split again from the untouched `data` frame: the pandas section above
# overwrote A7 in X_train and X_test. Same test size (30%) and same seed
# as the first split.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]), data["target"], test_size=0.3, random_state=0
)

In [10]:
# Set up the encoder to group categories present
# in less than 5% of the observations (tol=0.05).

# Group only categories in variables with more
# than 4 unique categories (n_categories=4).

rare_encoder = RareLabelEncoder(tol=0.05, n_categories=4)

In [11]:
# let's fit the encoder to the train set only, so the categories
# it keeps are learned without looking at the test set

rare_encoder.fit(X_train)



RareLabelEncoder(n_categories=4)

In [12]:
# the variables the fitted encoder will process
# (no `variables` argument was passed, so presumably these are the
# categorical columns it detected on its own -- confirm in the docs)
rare_encoder.variables_

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [13]:
# encoder_dict_ maps each variable to the categories the encoder
# keeps as they are; it holds the labels, not their counts

rare_encoder.encoder_dict_

{'A1': array(['a', 'b', 'Missing'], dtype=object),
 'A4': array(['u', 'y', 'Missing', 'l'], dtype=object),
 'A5': array(['g', 'p', 'Missing', 'gg'], dtype=object),
 'A6': Index(['c', 'q', 'w', 'i', 'ff', 'k', 'aa', 'cc', 'm'], dtype='object'),
 'A7': Index(['v', 'h', 'ff', 'bb'], dtype='object'),
 'A9': array(['t', 'f'], dtype=object),
 'A10': array(['t', 'f'], dtype=object),
 'A12': array(['t', 'f'], dtype=object),
 'A13': array(['g', 's', 'p'], dtype=object)}

In [14]:
# apply the grouping learned from the train set to both sets,
# train first and then test

X_train_enc, X_test_enc = (
    rare_encoder.transform(subset) for subset in (X_train, X_test)
)

In [15]:
# preview the first rows of the encoded train set
X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


In [16]:
# preview the first rows of the encoded test set; grouped labels
# show up as "Rare"
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,a,45.83,10.5,u,g,q,v,5.0,t,t,7,t,g,0.0,0
586,b,64.08,20.0,u,g,Rare,h,17.5,t,t,9,t,g,0.0,1000
140,a,31.25,3.75,u,g,cc,h,0.625,t,t,9,t,g,181.0,0
492,b,39.25,9.5,u,g,m,v,6.5,t,t,14,f,g,240.0,4607
350,a,26.17,2.0,u,g,Rare,Rare,0.0,f,f,0,t,g,276.0,1
