In [6]:
import pandas as pd

# Load the bank-marketing sample; fields in this file are separated by semicolons.
data = pd.read_csv(
    '../data/bank+marketing/bank/bank.csv',
    sep=";",
)

In [24]:
# Column names of the loaded frame, as a plain Python list.
data.columns.tolist()

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [8]:
# Count how many rows fall into each distinct value of the `y` column.
data['y'].value_counts()

# Author's note on coding `y`: no = 1, yes = 2.
# NOTE(review): the custom mapping applied to `y` elsewhere in this notebook uses
# no = 0, yes = 1 — confirm which coding is intended.

y
no     4000
yes     521
Name: count, dtype: int64

### Label Encoding

In [12]:
import category_encoders as ce

# No explicit mapping is passed, so the encoder chooses the integer codes for
# `education` itself; the result is only displayed, `data` is not reassigned.
education_encoder = ce.OrdinalEncoder(cols=['education'])
education_encoder.fit_transform(data)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,1,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,2,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,3,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,3,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,2,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,2,0,-333,1,0,cellular,30,jul,329,5,-1,0,unknown,0
4517,57,self-employed,married,3,1,-3313,1,1,unknown,9,may,153,1,-1,0,unknown,0
4518,57,technician,married,2,0,295,0,0,cellular,19,aug,151,11,-1,0,unknown,0
4519,28,blue-collar,married,2,0,1137,0,0,cellular,6,feb,129,4,211,3,other,0


### Label Encoding ( Custom Mapping )

In [10]:
import category_encoders as ce

# Binary yes/no columns that all share the same value-to-code mapping.
yesNoCols = ['default', 'housing', 'loan', 'y']

# One {'col', 'mapping'} spec per column, the list-of-dicts shape passed to
# OrdinalEncoder's `mapping` argument. A comprehension replaces the former
# [*list(map(lambda ...))] wrapper; each column still gets its own dict object.
# None (missing) is coded as 0, the same as 'no'.
mapping = [
    {'col': col, 'mapping': {None: 0, 'no': 0, 'yes': 1}}
    for col in yesNoCols
]

# Apply the shared yes/no mapping and rebind `data` to the encoded frame, so
# every later cell sees integer codes in these columns.
# NOTE(review): because `data` is overwritten, re-running this cell passes the
# already-encoded values through the mapping again — confirm that is acceptable.
yes_no_encoder = ce.OrdinalEncoder(mapping=mapping, cols=yesNoCols)
data = yes_no_encoder.fit_transform(data)
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,0,-333,1,0,cellular,30,jul,329,5,-1,0,unknown,0
4517,57,self-employed,married,tertiary,1,-3313,1,1,unknown,9,may,153,1,-1,0,unknown,0
4518,57,technician,married,secondary,0,295,0,0,cellular,19,aug,151,11,-1,0,unknown,0
4519,28,blue-collar,married,secondary,0,1137,0,0,cellular,6,feb,129,4,211,3,other,0


### Ordinal Encoding (Label Encoding)

In [13]:
# Frequency of each distinct value in the `month` column.
data['month'].value_counts()

month
may    1398
jul     706
aug     633
jun     531
nov     389
apr     293
feb     222
jan     148
oct      80
sep      52
mar      49
dec      20
Name: count, dtype: int64

In [None]:
import category_encoders as ce

# Month abbreviations in calendar order; each one's 1-based position is its code.
month_order = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec')

# Single-column spec in the list-of-dicts shape passed to OrdinalEncoder's `mapping`.
mapping = [
    {
        'col': 'month',
        'mapping': {abbr: position for position, abbr in enumerate(month_order, start=1)},
    },
]

# Apply the explicit month mapping and rebind `data` to the encoded frame.
month_encoder = ce.OrdinalEncoder(mapping=mapping, cols=['month'])
data = month_encoder.fit_transform(data)
data

In [None]:
# Frequency of each distinct value in the `education` column.
data['education'].value_counts()
# Proposed label/ordinal coding for these values:
# unknown = 0, primary = 1, secondary = 2, tertiary = 3.

### One-Hot Encoding

In [14]:
# Frequency of each distinct value in the `job` column.
data['job'].value_counts()

job
management       969
blue-collar      946
technician       768
admin.           478
services         417
retired          230
self-employed    183
entrepreneur     168
unemployed       128
housemaid        112
student           84
unknown           38
Name: count, dtype: int64

In [26]:
import category_encoders as ce

# One-hot encode `job`: one 0/1 indicator column per category, named
# `job_<category>` because use_cat_names=True.
# handle_missing is set to 'value', the documented category_encoders default
# (a missing value becomes 0 in every indicator column). The previous 'ignore'
# is not one of the documented choices ('error', 'return_nan', 'value',
# 'indicator').
result = ce.OneHotEncoder(
    cols=['job'],
    use_cat_names=True,
    drop_invariant=True,  # drop any generated column with zero variance
    handle_missing='value'
).fit_transform(data[['job']])

# Show the original labels next to their indicator columns for comparison.
pd.concat([data[['job']], result], axis=1)

Unnamed: 0,job,job_unemployed,job_services,job_management,job_blue-collar,job_self-employed,job_technician,job_entrepreneur,job_admin.,job_student,job_housemaid,job_retired,job_unknown
0,unemployed,1,0,0,0,0,0,0,0,0,0,0,0
1,services,0,1,0,0,0,0,0,0,0,0,0,0
2,management,0,0,1,0,0,0,0,0,0,0,0,0
3,management,0,0,1,0,0,0,0,0,0,0,0,0
4,blue-collar,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,services,0,1,0,0,0,0,0,0,0,0,0,0
4517,self-employed,0,0,0,0,1,0,0,0,0,0,0,0
4518,technician,0,0,0,0,0,1,0,0,0,0,0,0
4519,blue-collar,0,0,0,1,0,0,0,0,0,0,0,0


### Binary Encoding (Label + One-Hot)

In [25]:
import category_encoders as ce

# Binary-encode `job`: each category's integer code is spread across a few
# 0/1 columns named `job_<n>`.
# handle_missing is set to 'value', the documented category_encoders default.
# The previous 'ignore' is not one of the documented choices ('error',
# 'return_nan', 'value', 'indicator').
result = ce.BinaryEncoder(
    cols=['job'],
    drop_invariant=True,  # drop any generated column with zero variance
    handle_missing='value'
).fit_transform(data[['job']])

# Show the original labels next to their binary-digit columns for comparison.
pd.concat([data[['job']], result], axis=1)

Unnamed: 0,job,job_0,job_1,job_2,job_3
0,unemployed,0,0,0,1
1,services,0,0,1,0
2,management,0,0,1,1
3,management,0,0,1,1
4,blue-collar,0,1,0,0
...,...,...,...,...,...
4516,services,0,0,1,0
4517,self-employed,0,1,0,1
4518,technician,0,1,1,0
4519,blue-collar,0,1,0,0
