# Feature Encoding

Feature encoding is the process of transforming `categorical features` into
`numeric features`. This is necessary because most machine learning algorithms can only
work with numeric inputs. There are many different ways to encode categorical features,
and each method has its own advantages and disadvantages. In this notebook, we will
explore some of the most popular methods for encoding categorical features:

- Label encoding
- Ordinal encoding
- One-hot encoding
- Binary encoding

In [5]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Load seaborn's example "tips" dataset into a DataFrame and preview the first rows.
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Count how many rows fall into each category of the `time` column.
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [8]:
# Encode the `time` column with sklearn's LabelEncoder.
# Note: sklearn documents LabelEncoder as a transformer for target labels (y);
# it is used on a feature column here purely to demonstrate label encoding.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

le = LabelEncoder()
# LabelEncoder expects a 1-D array-like, so pass the Series df["time"] rather than
# the one-column DataFrame df[["time"]]. The 2-D form only works through an implicit
# flatten and makes sklearn emit a DataConversionWarning.
df["encoded_time"] = le.fit_transform(df["time"])
# Show a random 5% of the rows to compare `time` with `encoded_time`.
df.sample(frac=0.05)



  y = column_or_1d(y, warn=True)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
201,12.74,2.01,Female,Yes,Thur,Lunch,2,1
161,12.66,2.5,Male,No,Sun,Dinner,2,0
147,11.87,1.63,Female,No,Thur,Lunch,2,1
20,17.92,4.08,Male,No,Sat,Dinner,2,0
239,29.03,5.92,Male,No,Sat,Dinner,3,0
85,34.83,5.17,Female,No,Thur,Lunch,4,1
132,11.17,1.5,Female,No,Thur,Lunch,2,1
159,16.49,2.0,Male,No,Sun,Dinner,4,0
72,26.86,3.14,Female,Yes,Sat,Dinner,2,0
16,10.33,1.67,Female,No,Sun,Dinner,3,0


In [9]:
# Ordinal-encode the `day` column with sklearn's OrdinalEncoder.
# The explicit category list fixes the order, so each day is encoded as its
# position in that list, stored as an integer.
oe = OrdinalEncoder(
    categories=[['Thur', 'Fri', 'Sat', 'Sun']],
    dtype=int,
)

# Category counts of the source column before encoding.
print(df['day'].value_counts())

# Learn the mapping, then write the codes into a new column.
oe.fit(df[["day"]])
df["encoded_day"] = oe.transform(df[["day"]])

# Same counts again after the new column has been added.
print(df['day'].value_counts())

# Random 5% of the rows to compare `day` with `encoded_day`.
df.sample(frac=0.05)

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
45,18.29,3.0,Male,No,Sun,Dinner,2,0,3
192,28.44,2.56,Male,Yes,Thur,Lunch,2,1,0
44,30.4,5.6,Male,No,Sun,Dinner,4,0,3
41,17.46,2.54,Male,No,Sun,Dinner,2,0,3
132,11.17,1.5,Female,No,Thur,Lunch,2,1,0
86,13.03,2.0,Male,No,Thur,Lunch,2,1,0
16,10.33,1.67,Female,No,Sun,Dinner,3,0,3
90,28.97,3.0,Male,Yes,Fri,Dinner,2,0,1
107,25.21,4.29,Male,Yes,Sat,Dinner,2,0,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2,0,2


In [10]:
# let's encode the smoker column using sklearn's OneHotEncoder
ohe = OneHotEncoder()
ohe.fit_transform(df[["smoker"]]).toarray()


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [11]:
# The next cell imports the third-party `category_encoders` package.
# If it is not installed in this environment, uncomment and run the line below.
#!pip install category_encoders

In [13]:
from category_encoders import BinaryEncoder

# Binary-encode the `day` column: fit the encoder on the column first, then
# transform that same column. The last expression is displayed as the cell output.
be = BinaryEncoder().fit(df['day'])
be.transform(df['day'])





Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0
