<a href="https://colab.research.google.com/github/Shivani-781/Machine-Learning-and-its-Applications/blob/master/Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Encoding categorical values**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the country dataset from the mounted Drive folder; the first CSV
# column is used as the row index instead of becoming a data column.
df = pd.read_csv(
    '/content/drive/My Drive/ML_Data/Country.csv',
    index_col=0,
)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63900.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.5,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Show the dtype pandas inferred for each column of df.
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

In [4]:
# Distinct values present in the Country column.
df.loc[:, 'Country'].unique()

array(['France', 'Spain', 'Germany', 'India'], dtype=object)

In [5]:
# Distinct values present in the Purchased column.
df.loc[:, 'Purchased'].unique()

array(['No', 'Yes'], dtype=object)

## **Label Encoding** - Converts each value in a column to a number

**Using sklearn library**

In [6]:
# Working copy for the sklearn-based examples, so df itself is not modified.
df1 = df.copy(deep=True)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Purchased column and replace the column with the
# integer codes it produces.
le = LabelEncoder()
purchased_encoded = le.fit_transform(df1['Purchased'])
df1['Purchased'] = purchased_encoded
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63900.0,1
5,France,35.0,58000.0,1
6,Spain,38.5,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Using Mapping Technique**

In [8]:
# Second working copy of df, used for the pandas-only (mapping / get_dummies)
# examples.
df2 = df.copy(deep=True)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63900.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.5,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [9]:
# Encode the binary target by hand: 'No' -> 0, 'Yes' -> 1.
# The values are mapped from the untouched `df` column (df2 is a copy of df,
# so the row index is identical) rather than from df2['Purchased'] itself.
# That keeps the cell idempotent: mapping an already-encoded 0/1 column
# through this dict on a re-run would turn every entry into NaN.
purchased_codes = {'No': 0, 'Yes': 1}
df2['Purchased'] = df['Purchased'].map(purchased_codes)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63900.0,1
5,France,35.0,58000.0,1
6,Spain,38.5,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


## **One-Hot Encoding** - Each category is converted to a new binary (0/1) column

**Using sklearn library**

In [10]:
# df1 at this point: Purchased is already label-encoded, Country is still text.
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63900.0,1
5,France,35.0,58000.0,1
6,Spain,38.5,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


In [11]:
# Select Country with a list so the result stays two-dimensional (one column),
# then take the underlying NumPy array to feed to the encoder.
X = df1.loc[:, ['Country']].values
X

array([['France'],
       ['Spain'],
       ['Germany'],
       ['Spain'],
       ['Germany'],
       ['France'],
       ['Spain'],
       ['France'],
       ['Germany'],
       ['France'],
       ['India']], dtype=object)

In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the single Country feature held in X.
oe = OneHotEncoder()
country_onehot = oe.fit_transform(X).toarray()  # sparse result -> dense ndarray

# Take the column labels from the fitted encoder instead of hard-coding them,
# so they always match the categories (and their order) that were learned.
# Reuse df1's index so the later df1.join(X) lines the rows up by label even
# if df1's index is not a plain 0..n-1 range.
X = pd.DataFrame(country_onehot, columns=oe.categories_[0], index=df1.index)
X

Unnamed: 0,France,Germany,India,Spain
0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0
7,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,1.0,0.0,0.0,0.0


In [13]:
# Append the one-hot columns in X to df1, matching rows on the index.
df1 = df1.join(X, how='left')

In [14]:
# The text Country column is now redundant with its one-hot columns.
# Reassign instead of using inplace=True so the step reads as an explicit
# "new frame from old frame" transformation.
df1 = df1.drop(columns=['Country'])

In [15]:
# Result of the sklearn path: Country has been replaced by its one-hot columns.
df1

Unnamed: 0,Age,Salary,Purchased,France,Germany,India,Spain
0,44.0,72000.0,0,1.0,0.0,0.0,0.0
1,27.0,48000.0,1,0.0,0.0,0.0,1.0
2,30.0,54000.0,0,0.0,1.0,0.0,0.0
3,38.0,61000.0,0,0.0,0.0,0.0,1.0
4,40.0,63900.0,1,0.0,1.0,0.0,0.0
5,35.0,58000.0,1,1.0,0.0,0.0,0.0
6,38.5,52000.0,0,0.0,0.0,0.0,1.0
7,48.0,79000.0,1,1.0,0.0,0.0,0.0
8,50.0,83000.0,0,0.0,1.0,0.0,0.0
9,37.0,67000.0,1,1.0,0.0,0.0,0.0


**Using get_dummies**

In [16]:
# One indicator column per distinct Country value.
# - `columns=` only applies when a DataFrame is passed and `prefix_sep` is only
#   used together with a prefix, so neither has any effect on a Series and
#   both are omitted.
# - dtype=int keeps the 0/1 output shown below; pandas >= 2.0 would otherwise
#   return boolean True/False columns.
Y = pd.get_dummies(df2['Country'], dtype=int)
Y

Unnamed: 0,France,Germany,India,Spain
0,1,0,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,1,0,0
5,1,0,0,0
6,0,0,0,1
7,1,0,0,0
8,0,1,0,0
9,1,0,0,0


In [17]:
# Append the dummy columns in Y to df2, matching rows on the index.
df2 = df2.join(Y, how='left')

In [18]:
# df2 with the dummy columns appended; the text Country column is still present.
df2

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,India,Spain
0,France,44.0,72000.0,0,1,0,0,0
1,Spain,27.0,48000.0,1,0,0,0,1
2,Germany,30.0,54000.0,0,0,1,0,0
3,Spain,38.0,61000.0,0,0,0,0,1
4,Germany,40.0,63900.0,1,0,1,0,0
5,France,35.0,58000.0,1,1,0,0,0
6,Spain,38.5,52000.0,0,0,0,0,1
7,France,48.0,79000.0,1,1,0,0,0
8,Germany,50.0,83000.0,0,0,1,0,0
9,France,37.0,67000.0,1,1,0,0,0


In [19]:
# Remove the text Country column now that its dummy columns are in place.
# Dropping by name rather than by position (iloc[:, 1:]) means a re-run of
# this cell cannot silently discard the next column (Age); it raises a
# KeyError instead.
df2 = df2.drop(columns=['Country'])

In [20]:
# Result of the pandas path: df2 after its leading Country column was removed.
df2

Unnamed: 0,Age,Salary,Purchased,France,Germany,India,Spain
0,44.0,72000.0,0,1,0,0,0
1,27.0,48000.0,1,0,0,0,1
2,30.0,54000.0,0,0,1,0,0
3,38.0,61000.0,0,0,0,0,1
4,40.0,63900.0,1,0,1,0,0
5,35.0,58000.0,1,1,0,0,0
6,38.5,52000.0,0,0,0,0,1
7,48.0,79000.0,1,1,0,0,0
8,50.0,83000.0,0,0,1,0,0
9,37.0,67000.0,1,1,0,0,0
