<a href="https://colab.research.google.com/github/Stonetenth2005/Machine-Learning/blob/main/8-OneHotEncoding/OHE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the cars dataset from CSV and preview its first five rows.
df = pd.read_csv('/content/cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


# Using pandas to perform One-Hot-Encoding

In [None]:
# One-hot encode only the 'fuel' and 'owner' columns; every other column is
# passed through unchanged and one indicator column is created per category.
pd.get_dummies(df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# Using pandas to perform One-Hot-Encoding (with k-1 columns per feature, dropping the first category)

In [None]:
# drop_first=True omits the first category of each encoded column, so a
# feature with k categories yields k-1 indicator columns (a row of all False
# then denotes the dropped category).
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


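We generally avoid using pandas for One-Hot Encoding because `get_dummies` does not remember the categories it saw in the training data. If the test data contains a new category (or lacks one that was present during training), the encoded columns no longer match the encoding that was built from the training data.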

# Using sklearn to perform One-Hot-Encoding

In [None]:
from sklearn.model_selection import train_test_split
# Features are the first four columns, the target is the last column.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order,
# so unpack in the same order: with test_size=0.2 the training split gets 80%
# of the rows and the held-out test split gets 20%.
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:, 0:4], df.iloc[:, -1], test_size=0.2, random_state=0)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# drop='first' removes the first category of each feature (k-1 columns).
# NOTE(review): handle_unknown is left at its default ('error'), so transform()
# raises if it meets a category that was not present when the encoder was fit.
OHE = OneHotEncoder(drop='first')

In [None]:
# Learn the categories from the training split only, then reuse the fitted
# encoder on the test split so both share the same column layout.
# .toarray() converts the encoder's sparse output into a dense ndarray.
X_train_new = OHE.fit_transform(X_train[['fuel', 'owner']]).toarray()
X_test_new = OHE.transform(X_test[['fuel', 'owner']]).toarray()

In [None]:
# Raw (un-encoded) categorical values of the training split, for comparison
# with the encoded array.
X_train[['fuel','owner']].values

array([['Diesel', 'First Owner'],
       ['Diesel', 'First Owner'],
       ['Petrol', 'First Owner'],
       ...,
       ['Diesel', 'First Owner'],
       ['Petrol', 'First Owner'],
       ['Diesel', 'Second Owner']], dtype=object)

In [None]:
# Horizontally stack the untouched columns (brand, km_driven) with the new
# one-hot encoded columns; the result is an object-dtype array because it
# mixes strings and numbers.
np.hstack((X_train[['brand', 'km_driven']].values, X_train_new))

array([['Hyundai', 40000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Mahindra', 70000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 5000, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Maruti', 40000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 2350, 0.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 80000, 1.0, ..., 1.0, 0.0, 0.0]], dtype=object)

# One-Hot Encoding for top brands, with rare brands replaced by "Uncommon"

In [None]:
# Number of rows per brand (indexed by brand name).
counts = df['brand'].value_counts()
# Frequency cut-off used below to select the brands to be replaced.
threshold = 100

In [None]:
# Names of the brands that occur at most `threshold` times. The brand names
# live in the index of the value_counts() result; `.values` would return the
# integer counts instead, which never match the strings in df['brand'], so
# nothing would be replaced.
repl = counts[counts <= threshold].index.tolist()

In [None]:
# Replace every brand value listed in `repl` with the single label 'Uncommon',
# then one-hot encode the resulting brand column.
pd.get_dummies (df['brand'].replace(repl, 'Uncommon'))

Unnamed: 0,Ambassador,Ashok,Audi,BMW,Chevrolet,Daewoo,Datsun,Fiat,Force,Ford,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8124,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
