### Feature Encoding

#### What is Encoding?
        - Representing categorical data into numeric data - Label Encoding
        - e.g Sardar = 1, Shahzad = 2, Shahzeb = 3, Shahzain = 4
        - nominal, ordinal encoding etc

    Types of Encoding:
        - Label Encoding: unordered
        - One Hot Encoding: one hot value in each column, creates identity matrix
        - Ordinal Encoding: ordered
        - Binary Encoding: Categories are converted into numerical labels, then those are covered into binary codes
        - Frequency/Count Encoding: assigns the values their frequency counts

#### ---------------------------------
#### Why do we need to Encode the data?
        - mostly algorithms take/prefer numerical data
        - computers prefers numbers over data
        - text is lengthy, numbers are small, saves computational power
        
        - Algorithm Compatibility
            - mostly algorithms take/prefer numerical data
        - Efficiency & Performance:
            - Faster computation of numeric data, storage efficiency
        - Feature Representation
            - Same numeric code for a variable in multiple languages -> universal represntation
        - Support Unseen Categories
        - Better Memory Usage
#### ----------------------------------

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# data load
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [5]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [6]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [7]:
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [8]:
# Encoding time variable in label encoder

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

le = LabelEncoder()

df['encoded_time'] = le.fit_transform(df['time'])

In [9]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


In [10]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [11]:
df['encoded_time'].value_counts()

encoded_time
0    176
1     68
Name: count, dtype: int64

In [12]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [13]:
# ORdinal encoding the days using specific order

oe = OrdinalEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']])
### oe = OrdinalEncoder() also works and assigns on descending order

df['encoded_days'] = oe.fit_transform(df[['day']])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_days
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0


In [14]:
df['encoded_days'].value_counts()

encoded_days
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

In [18]:
# One Hot Encoding
ohe = OneHotEncoder()
ohe.fit_transform(df[['sex']]).toarray()

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [None]:
# !pip install category_encoders

In [20]:
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [21]:
from category_encoders import BinaryEncoder

binary_encoder = BinaryEncoder()
tips_binary = binary_encoder.fit_transform(tips['day'])
tips_binary

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


### Feature Encoding with Pandas

In [22]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
pd.get_dummies(df['day'])

Unnamed: 0,Thur,Fri,Sat,Sun
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
...,...,...,...,...
239,False,False,True,False
240,False,False,True,False
241,False,False,True,False
242,False,False,True,False


In [26]:
pd.get_dummies(df['sex'])

Unnamed: 0,Male,Female
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
239,True,False
240,False,True
241,True,False
242,True,False
