In [54]:
import pandas as pd

data = pd.read_csv('../data/bank+marketing/bank/bank.csv', sep=';', index_col=0)

In [55]:
data

Unnamed: 0_level_0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


### Encoding

#### Label Encoder
Assigns numeric number(label) to each category in the column, eg: yes = 1, no = 0
Usually best for columns where categories are ordinal(has some incremental succession. Like:
hot/hotter/hottest
cold/warm/hot
red/orange/yellow
Eg:
```python
data['is_default'] = data['default'].apply(lambda x: 1 if x == 'yes' else 0)
```

In [56]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

columns_to_convert = ['default', 'housing', 'loan', 'y']
for col in columns_to_convert:
    data[f'is_{col}'] = label_encoder.fit_transform(data[col])

#### One-Hot Encoder (Dummy)
Creates new **binary** columns per category. Eg: marital column with categories, married / single / divorced
 creates 3 new columns with all but one column being "1" at all times; hence "One-Hot"
Suited for non-ordinal

In [57]:
data['marital'].value_counts()

marital
married     2797
single      1196
divorced     528
Name: count, dtype: int64

In [65]:
marital_dummies = pd.get_dummies(data['marital'], prefix='marital', dtype=int, drop_first=True)
data = pd.concat([data, marital_dummies], axis=1)

pd.concat([data['marital'], marital_dummies], axis=1) # For preview only

Unnamed: 0_level_0,marital,marital_married,marital_single
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,married,1,0
33,married,1,0
35,single,0,1
30,married,1,0
59,married,1,0
...,...,...,...
33,married,1,0
57,married,1,0
57,married,1,0
28,married,1,0


In [None]:
data['job'].value_counts()

#### Binary Encoder

This encoder first assigns numeric labels to **non-ordinal** categories like in LabelEncoder.
Then to avoid the training from treating the labels as ordinal, it finds the binary representation
of those numeric labels. Which then gives us new columns with a column count equal to the number
of bits in the highest label.

A combination of **Label** and **One-Hot** encoders for non-ordinal data

In [64]:
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['job'])
encoded = encoder.fit_transform(data['job'])
pd.concat([data['job'], encoded], axis=1)

Unnamed: 0_level_0,job,job_0,job_1,job_2,job_3
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,unemployed,0,0,0,1
33,services,0,0,1,0
35,management,0,0,1,1
30,management,0,0,1,1
59,blue-collar,0,1,0,0
...,...,...,...,...,...
33,services,0,0,1,0
57,self-employed,0,1,0,1
57,technician,0,1,1,0
28,blue-collar,0,1,0,0


#### Ordinal Encoder
Similar to Label Encoding but with a user-defined specific order to the categories.
```python
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[['cold', 'warm', 'hot']])
data['temperature'] = ordinal_encoder.fit_transform(data[['temperature']])
```