# <font color="magenta"><h3 align="center">Topic - Encoding Part 1</h3></font>

<h2 style = "color:brown" >Import Libraries</h2>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

<h2 style = "color:brown" >Data Reading</h2>

In [2]:
# Load the supershops dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('supershops.csv')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [4]:
# Independent working copies, so each encoding demo can modify its own frame
# without touching the original `df`.
df1, df2, df3, df4 = (df.copy() for _ in range(4))

In [5]:
# Distinct area labels, in order of first appearance.
df['Area'].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [6]:
# Number of rows per area.
df['Area'].value_counts()

Dhaka      17
Ctg        17
Rangpur    16
Name: Area, dtype: int64

<h1 style = "color:indigo">Encoding Techniques in ML</h1>

1. Label Encoder<br>
2. One Hot Encoder <br>
3. Ordinal Encoder

<h1 style = "color:green">1. Label Encoder</h1>

In [7]:
# LabelEncoder is already imported in the first cell; re-importing it here is harmless.
from sklearn.preprocessing import LabelEncoder
# One encoder instance, shared by the fit / inspect / inverse-transform cells of this section.
le = LabelEncoder()

In [8]:
# LabelEncoder works on a 1-D array of labels, so pass the 'Area' Series rather than
# the one-column DataFrame df[['Area']]. The 2-D form only works through an implicit
# ravel and emits a DataConversionWarning, which the global warnings filter hides.
# Fitting on the untouched `df` keeps this cell safe to re-run.
df3['Area'] = le.fit_transform(df['Area'])
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [9]:
# Labels learned during fit; a label's position in this array is its integer code.
le.classes_

array(['Ctg', 'Dhaka', 'Rangpur'], dtype=object)

In [10]:
# Encode the learned labels themselves to display the code assigned to each one.
le.transform(le.classes_)

array([0, 1, 2])

## Decode the encoded values back to original classes

In [11]:
# Map the integer codes in df3 back to their labels without modifying df3.
# (Assign the result to df3['Area'] instead to restore the column in place.)
decoded_classes = le.inverse_transform(df3['Area'])
decoded_classes

array(['Dhaka', 'Ctg', 'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg', 'Rangpur', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Dhaka', 'Rangpur', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Dhaka',
       'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg'],
      dtype=object)

<h2 style = "color:purple">Label Encoder with Label Mapping</h2>

In [12]:
# df4 is still an unmodified copy of df at this point ('Area' holds the text labels).
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [13]:
# Hand-picked codes in the order Dhaka, Ctg, Rangpur (not the sorted order LabelEncoder uses).
label_mapping = {area: code for code, area in enumerate(['Dhaka', 'Ctg', 'Rangpur'])}

In [14]:
# The area names that the mapping can translate.
label_mapping.keys()

dict_keys(['Dhaka', 'Ctg', 'Rangpur'])

In [15]:
# The integer codes the area names are translated to.
label_mapping.values()

dict_values([0, 1, 2])

<h2 style = "color:purple">Method 1: Using Map Function</h2>

In [16]:
# Map each area name to its code. Reading from the untouched `df` (rather than from
# df4 itself) keeps the cell idempotent: re-running df4.Area.map(...) on the already
# encoded integers would find no matching keys and turn the whole column into NaN.
df4['Area'] = df['Area'].map(label_mapping)

In [17]:
# 'Area' now holds the integer codes from label_mapping.
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


<h2 style = "color:purple">Method 2: Using List Comprehension</h2>

In [18]:
df5 = df.copy()
# Look every area name up in label_mapping with a list comprehension. Unlike
# Series.map, a name missing from the mapping raises KeyError instead of becoming NaN.
df5['Area'] = [label_mapping[name] for name in df5['Area']]

In [19]:
# First three rows are enough to confirm the encoded 'Area' column.
df5.head(3)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39


<h2 style = "color:purple">Method 3: Replace Method</h2>

In [20]:
# Series.replace swaps every value found among the mapping's keys for its code and
# leaves other values as they are; assign() returns a new frame, so `df` is not modified.
df6 = df.assign(Area=df['Area'].replace(label_mapping))
df6.head(3)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39


<h1 style = "color:green">2. One Hot Encoder</h1>

<h3 style = "color:coral">2.1 Dummy Variables</h3>

In [21]:
df7 = df.copy()
# One indicator column per distinct area, named after the area itself.
dummy = pd.get_dummies(df7['Area'])
dummy.head()

Unnamed: 0,Ctg,Dhaka,Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [22]:
# prefix_sep already defaults to '_'; it is spelled out here only to make the
# resulting 'Area_<value>' column names explicit.
dummy = pd.get_dummies(df7['Area'], prefix='Area', prefix_sep='_')
dummy.head()

Unnamed: 0,Area_Ctg,Area_Dhaka,Area_Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [23]:
# Append the indicator columns to the right of the original frame (rows align on the index).
new_df7 = pd.concat([df7, dummy], axis='columns')
new_df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0,1,0
1,162597.7,151377.59,443898.53,Ctg,191792.06,1,0,0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0,0,1
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0,1,0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0,0,1


In [24]:
# drop_first=True omits the first category's column (Area_Ctg). A row with zeros in
# the remaining columns then stands for that category, which avoids perfectly
# collinear dummy columns.
dummy2 = pd.get_dummies(df7['Area'], prefix='Area', drop_first=True)
dummy2.head()

Unnamed: 0,Area_Dhaka,Area_Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [25]:
# Swap the original text column for its reduced set of dummy columns.
new2_df7 = pd.concat([df7.drop(columns='Area'), dummy2], axis='columns')
new2_df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


<h3 style = "color:coral">Easiest Way</h3>

In [26]:
# Passing the whole frame: the text column 'Area' is replaced by 'Area_<value>'
# indicator columns, while the numeric columns pass through unchanged.
new_dummy_encoded = pd.get_dummies(df7)
new_dummy_encoded.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


In [27]:
# An empty prefix and separator leave the indicator columns named after the bare category values.
new_dummy_encoded2 = pd.get_dummies(df7,prefix='', prefix_sep = '')
new_dummy_encoded2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


<h3 style = "color:coral">2.2 OneHotEncoder()</h3>

In [28]:
from sklearn.preprocessing import OneHotEncoder

<h4 style = "color:indigo">1st Way (sparse=False)</h4>

In [30]:
# Ask the encoder for a dense NumPy array directly. The keyword is `sparse_output`
# since scikit-learn 1.2; its old spelling `sparse=False` was deprecated in 1.2 and
# removed in 1.4, where it raises a TypeError.
hot = OneHotEncoder(sparse_output=False)

hot_data = hot.fit_transform(df7[['Area']])
# Reuse df7's index so the concat below lines up row-for-row even when df7 is not
# on a default RangeIndex (e.g. after filtering).
hot_df = pd.DataFrame(
    hot_data,
    columns=hot.get_feature_names_out(['Area']),
    index=df7.index,
)
df_encoded = pd.concat([df7, hot_df], axis=1)
df_encoded.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.0,1.0,0.0
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.0,0.0,1.0
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.0,1.0,0.0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.0,0.0,1.0


<h4 style = "color:indigo">2nd Way (default sparse output + toarray())</h4>

In [31]:
# With default settings the encoder returns a sparse matrix, so it is converted
# with .toarray() before building the DataFrame.
hot2 = OneHotEncoder()

hot2_data = hot2.fit_transform(df7[['Area']])
# Reuse df7's index so the concat below lines up row-for-row even when df7 is not
# on a default RangeIndex (e.g. after filtering).
hot2_df = pd.DataFrame(
    hot2_data.toarray(),
    columns=hot2.get_feature_names_out(['Area']),
    index=df7.index,
)
df_encoded2 = pd.concat([df7, hot2_df], axis=1)
df_encoded2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.0,1.0,0.0
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.0,0.0,1.0
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.0,1.0,0.0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.0,0.0,1.0


<h4 style = "color:indigo">Rename columns (remove unwanted prefix)</h4>

In [32]:
hot2 = OneHotEncoder()

hot2_data = hot2.fit_transform(df7[['Area']])
# Reuse df7's index so the concat below lines up row-for-row even when df7 is not
# on a default RangeIndex (e.g. after filtering).
hot2_df = pd.DataFrame(
    hot2_data.toarray(),
    columns=hot2.get_feature_names_out(['Area']),
    index=df7.index,
)

# Rename columns (remove unwanted prefix): strip only the leading 'Area_'.
# Splitting on '_' and keeping element 1 would truncate any category whose own
# name contains an underscore.
hot2_df.columns = hot2_df.columns.str.replace(r'^Area_', '', regex=True)
df_encoded2 = pd.concat([df7, hot2_df], axis=1)
df_encoded2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.0,1.0,0.0
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.0,0.0,1.0
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.0,1.0,0.0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.0,0.0,1.0


<h1 style = "color:green">3. Ordinal Encoder</h1>

In [33]:
# OrdinalEncoder is already imported in the first cell; re-importing it here is harmless.
from sklearn.preprocessing import OrdinalEncoder
# No explicit categories are given, so the encoder derives them from the data it is fitted on.
ordinal = OrdinalEncoder()

In [34]:
df8 = df.copy()
# OrdinalEncoder takes a 2-D input (hence the double brackets) and returns float codes.
df8['Area'] = ordinal.fit_transform(df8[['Area']])
df8.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1.0,192261.83
1,162597.7,151377.59,443898.53,0.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,1.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


In [35]:
# Explicit category order: a label's position in this list becomes its code.
# 'Bogra' does not occur in the data but still occupies code 0, so the areas
# that do occur are encoded as 1, 2 and 3.
area = [
    'Bogra',
    'Ctg',
    'Dhaka',
    'Rangpur',
]
ordinal2 = OrdinalEncoder(categories=[area])

In [36]:
df9 = df.copy()
# Encode with the user-defined category order held by ordinal2.
df9['Area'] = ordinal2.fit_transform(df9[['Area']])
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,2.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,3.0,191050.39
3,144372.41,118671.85,383199.62,2.0,182901.99
4,142107.34,91391.77,366168.42,3.0,166187.94
