## Count or Frequency Encoding

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read just the two categorical columns that this walkthrough encodes.
data = pd.read_csv(
    'train.csv',
    usecols=['X1', 'X2'],
)

In [10]:
# Summary statistics; for these object columns it reports count, unique, top and freq.
data.describe()

Unnamed: 0,X1,X2
count,4209,4209
unique,27,44
top,aa,as
freq,833,1659


In [9]:
# Check column dtypes and non-null counts before encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X1      4209 non-null   object
 1   X2      4209 non-null   object
dtypes: object(2)
memory usage: 65.9+ KB


## One Hot Encoding

In [18]:
# Number of columns produced by one-hot encoding both features.
pd.get_dummies(data).shape[1]

71

### One-hot encoding generates 71 columns (27 + 44 labels), which inflates the feature space.
### Count encoding is a more compact alternative.

In [23]:
# Report how many distinct labels each column holds.
for col, n_labels in data.nunique().items():
    print(col, " : ", n_labels, " labels")

X1  :  27  labels
X2  :  44  labels


In [24]:
# Peek at the most frequent X2 labels and their occurrence counts.
data['X2'].value_counts().head()

as    1659
ae     496
ai     415
m      367
ak     265
Name: X2, dtype: int64

In [25]:
# Build the label -> occurrence-count lookup used for count encoding.
X2_frequency_map = data['X2'].value_counts().to_dict()

In [26]:
# Inspect the full label -> count mapping.
X2_frequency_map

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'i': 25,
 'k': 25,
 'b': 21,
 'ao': 20,
 'z': 19,
 'ag': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'y': 11,
 'ap': 11,
 'x': 10,
 'aw': 8,
 'h': 6,
 'at': 6,
 'an': 5,
 'q': 5,
 'al': 5,
 'ah': 4,
 'p': 4,
 'av': 4,
 'au': 3,
 'af': 1,
 'j': 1,
 'aa': 1,
 'c': 1,
 'am': 1,
 'o': 1,
 'l': 1,
 'ar': 1}

In [29]:
# Count-encode X2: replace each label with the number of times it occurs.
# Series.map yields NaN for any value that is not a key of the dict, so running
# this cell a second time on the already-encoded (numeric) column would turn the
# whole column into NaN. The dtype guard makes the cell safe to re-execute.
if not pd.api.types.is_numeric_dtype(data['X2']):
    data['X2'] = data['X2'].map(X2_frequency_map)

In [30]:
# Display the encoded column: each label has been replaced by its occurrence count.
data['X2']

0          6
1          4
2        137
3        137
4        137
        ... 
4204    1659
4205      29
4206     153
4207      81
4208     496
Name: X2, Length: 4209, dtype: int64

## Advantages
#### Very simple to implement
####  Does not increase feature dimension space

## Disadvantages
#### If two labels have the same count, both are replaced by the same value, so they become indistinguishable and valuable information is lost
#### Assigns somewhat arbitrary numeric weights to labels, which may be unrelated to their predictive power