
### Feature Encoding
Feature encoding is the process of converting categorical or textual data into numerical data that can be used by machine learning algorithms. This is important because most machine learning algorithms can only operate on numerical data.


In [1]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns


In [3]:
df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [8]:
# lets encode the time in label encoder
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
le=LabelEncoder()
df['encoded_time']=le.fit_transform(df['time'])

In [9]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


In [11]:
df['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [12]:
# now encoding the days of week using ordinal encoder
oe=OrdinalEncoder(categories=[['Thur','Fri','Sat','Sun']])
df['encoded_day']=oe.fit_transform(df[['day']])

In [13]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0


In [14]:
#example of one hot encoding on titanic data set
titanic=sns.load_dataset('titanic')

In [17]:
onehot_encoder=OneHotEncoder(sparse_output=False)
embarked_onehot=onehot_encoder.fit_transform(titanic[['embarked']])
embarked_onehot_df=pd.DataFrame(embarked_onehot,columns=onehot_encoder.get_feature_names_out(['embarked']))
titanic=pd.concat([titanic.reset_index(drop=True),embarked_onehot_df.reset_index(drop=True)])

In [19]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embarked_C,embarked_Q,embarked_S,embarked_nan
0,0.0,3.0,male,22.0,1.0,0.0,7.2500,S,Third,man,True,,Southampton,no,False,,,,
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,,,,
2,1.0,3.0,female,26.0,0.0,0.0,7.9250,S,Third,woman,False,,Southampton,yes,True,,,,
3,1.0,1.0,female,35.0,1.0,0.0,53.1000,S,First,woman,False,C,Southampton,yes,False,,,,
4,0.0,3.0,male,35.0,0.0,0.0,8.0500,S,Third,man,True,,Southampton,no,True,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0
1778,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0
1779,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0
1780,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0


In [20]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [21]:
from category_encoders import BinaryEncoder


In [22]:
df=sns.load_dataset('tips')


In [26]:
binary_encoder=BinaryEncoder()
df_binary =binary_encoder.fit_transform(df['day'])
df_binary

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


The choice of feature encoding method depends on the characteristics of the data and the requirements of the machine learning model being used. Different encoding methods may be more suitable for different types of data and models.