# Data Encoding ---> Converting Categorical data to Numerical data.

* Nominal/One Hot Encoding(OHE)
* Label and Ordinal encoding
* Target Guided Ordinal Encoding

## 1 OHE

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.DataFrame({"Color" : ["red", "green", "blue", "green", "blue", "green", "red", "red", "blue"]})
df

Unnamed: 0,Color
0,red
1,green
2,blue
3,green
4,blue
5,green
6,red
7,red
8,blue


### Converting Categorical data to Numerical Data.

In [3]:
encoder = OneHotEncoder()
temp = encoder.fit_transform(df).toarray()
temp

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [4]:
temp = pd.DataFrame(temp, columns=encoder.get_feature_names_out())
temp

Unnamed: 0,Color_blue,Color_green,Color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0
6,0.0,0.0,1.0
7,0.0,0.0,1.0
8,1.0,0.0,0.0


### Disadvantage :
* If there are too many categories, we will end up with many columns.

In [5]:
res_df = pd.concat([df,temp], axis=1)
res_df

Unnamed: 0,Color,Color_blue,Color_green,Color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,blue,1.0,0.0,0.0
3,green,0.0,1.0,0.0
4,blue,1.0,0.0,0.0
5,green,0.0,1.0,0.0
6,red,0.0,0.0,1.0
7,red,0.0,0.0,1.0
8,blue,1.0,0.0,0.0


## 2.1 Label Encoding

In [6]:
df = pd.DataFrame({"Color" : ["red", "green", "blue", "green", "blue", "green", "red", "red", "blue", "Orange", "Purple"]})
df

Unnamed: 0,Color
0,red
1,green
2,blue
3,green
4,blue
5,green
6,red
7,red
8,blue
9,Orange


In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
encoder = LabelEncoder()
temp = encoder.fit_transform(df)
temp

  y = column_or_1d(y, warn=True)


array([4, 3, 2, 3, 2, 3, 4, 4, 2, 0, 1])

In [9]:
df1 = pd.DataFrame(temp, columns=["Label"])
df1

Unnamed: 0,Label
0,4
1,3
2,2
3,3
4,2
5,3
6,4
7,4
8,2
9,0


### Disadvantage :
* red = 4, green = 3, blue = 2, Purple = 1, Orange = 0.
* When we are training the model, It will think that red > green > blue > Purple > Orange.

In [10]:
res_df = pd.concat([df,df1], axis=1)
res_df

Unnamed: 0,Color,Label
0,red,4
1,green,3
2,blue,2
3,green,3
4,blue,2
5,green,3
6,red,4
7,red,4
8,blue,2
9,Orange,0


## 2.2 Ordinal Encoding
* Based on Ranking

    * High School = 1
    * College = 2
    * Graduate = 3
    * Post-Graduate = 4

In [11]:
df = pd.DataFrame({"Size" : ["small", "medium", "large", "extra-large", "medium", "large", "small", "extra-large"]})
df

Unnamed: 0,Size
0,small
1,medium
2,large
3,extra-large
4,medium
5,large
6,small
7,extra-large


In [12]:
from sklearn.preprocessing import OrdinalEncoder

In [13]:
encoder = OrdinalEncoder(categories=[["small", "medium", "large", "extra-large"]])
temp = encoder.fit_transform(df)
temp

array([[0.],
       [1.],
       [2.],
       [3.],
       [1.],
       [2.],
       [0.],
       [3.]])

In [14]:
df1 = pd.DataFrame(temp)
df1

Unnamed: 0,0
0,0.0
1,1.0
2,2.0
3,3.0
4,1.0
5,2.0
6,0.0
7,3.0


In [15]:
res_df = pd.concat([df, df1], axis=1)
res_df

Unnamed: 0,Size,0
0,small,0.0
1,medium,1.0
2,large,2.0
3,extra-large,3.0
4,medium,1.0
5,large,2.0
6,small,0.0
7,extra-large,3.0


## 3 Target Guided Ordinal Encoding

In [16]:
df = pd.DataFrame({"City" : ["New York", "London", "Paris", "Tokyo", "New York", "Paris"],
                   "Price" : [200, 150, 300, 250, 180, 320]})
df

Unnamed: 0,City,Price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [17]:
temp = df.groupby(df["City"])["Price"].mean().to_dict()
temp

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [18]:
df["Mean_Price"] = df["City"].map(temp)
df

Unnamed: 0,City,Price,Mean_Price
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0
