In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os 
# plt.style.use('seaborn-v0_8-colorblind')
# %matplotlib inline
from feature_cleaning import rare_values as ra

## Load dataset

In [2]:
# Restrict the CSV read to these six columns.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

data = pd.read_csv('./data/titanic.csv', usecols=use_cols)

# Show each label's share of all rows for Pclass and SibSp.
#   - SibSp: labels 3, 8 and 5 are rare (each below 2% of rows).
#   - Pclass: three labels, none of them below 20%.
for i in ['Pclass','SibSp']:
    print('Variable',i,'label proportion:')
    print(data[i].value_counts().div(len(data)))

Variable Pclass label proportion:
Pclass
3    0.551066
1    0.242424
2    0.206510
Name: count, dtype: float64
Variable SibSp label proportion:
SibSp
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: count, dtype: float64


## Grouping into one new category
Group all observations whose label is rare into a single new category (`'rare'`).

In [3]:
# Build a rare-value grouping encoder for Pclass and SibSp (threshold 0.01)
# and fit it on the loaded data.
enc = ra.GroupingRareValues(
    cols=['Pclass', 'SibSp'],
    threshold=0.01,
).fit(data)

In [5]:
# Inspect the per-column mapping learned by the fitted encoder.
# SibSp: labels 5 and 8 are mapped to 'rare' because each occurs in fewer
#        than 1% of rows; every other label maps to itself.
# Pclass: all three labels map to themselves (nothing is grouped).
print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0       0
1       1
2       2
4       4
3       3
8    rare
5    rare
dtype: object, 'data_type': dtype('int64')}]


In [6]:
# Apply the fitted grouping encoder to `data` and keep the result as `data2`.
data2 = enc.transform(data)

In [7]:
# check the result
print(data2.SibSp.value_counts())

SibSp
0       608
1       209
2        28
4        18
3        16
rare     12
Name: count, dtype: int64


## Mode Imputation
Replace each rare label with the most frequent label (the mode) of its column.

In [8]:
# Build a mode-imputation encoder for Pclass and SibSp (threshold 0.01)
# and fit it on the loaded data.
enc = ra.ModeImputation(
    cols=['Pclass', 'SibSp'],
    threshold=0.01,
).fit(data)

In [9]:
# Inspect the per-column mapping learned by the fitted encoder.
# SibSp: labels 5 and 8 are mapped to 0, the most frequent SibSp label;
#        every other label maps to itself.
# Pclass: all three labels map to themselves (nothing is replaced).
print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0    0
1    1
2    2
4    4
3    3
8    0
5    0
dtype: int64, 'data_type': dtype('int64')}]


In [12]:
# Apply the fitted mode-imputation encoder to `data` and keep the result as `data3`.
data3 = enc.transform(data)

In [13]:
# check the result
print(data3.SibSp.value_counts())

SibSp
0    620
1    209
2     28
4     18
3     16
Name: count, dtype: int64
