# ONE HOT ENCODING

- used on nominal data
- dummy variable technique is used
- one column is dropped after encoding to avoid multicollinearity
- if there are too many categories, the most frequent categories are encoded separately
   and the less frequent categories are combined into a single 'other' column to reduce dimensionality

In [2]:
# IMPORTS
import numpy as np
import pandas as pd

In [3]:
# Load the used-cars dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where that exact file exists — adjust the path for your machine.
df = pd.read_csv("C:\\Users\\siddh\\OneDrive\\Desktop\\Data science data\\Preprocessing\\cars.csv")
# Preview the first five rows.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


## One-Hot encoding using pandas

In [4]:
# APPLYING ON FUEL AND OWNER COLUMNS

In [6]:
# Encode 'fuel' and 'owner' as dummy columns; drop_first removes the first
# level of each so the remaining dummies are not perfectly collinear.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## ONE HOT ENCODING USING sklearn

In [8]:
# Re-display the first five rows; the frame itself has not been reassigned.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [18]:
# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the selling price.
X = df.drop(columns=['selling_price'])
y = df['selling_price']

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# (and therefore every downstream output) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# IMPORTING ONE HOT ENCODER
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes one dummy per feature; a dense int32 array is requested.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back to the old one for older installations.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

In [20]:
# Learn the category levels of 'fuel' and 'owner' from the training rows only,
# then encode those same rows.
X_train_encoded = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [21]:
# Encode the test rows with the encoder fitted earlier
# (transform only — the encoder is not refitted here).
X_test_encoded = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [27]:
# Put the not-yet-encoded columns and the one-hot columns side by side.
# Mixing brand strings with integers yields an object-dtype array.
np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_encoded],
    axis=1,
)

array([['Mahindra', 110000, 1, ..., 1, 0, 0],
       ['Maruti', 40000, 0, ..., 0, 0, 0],
       ['Volkswagen', 110000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 80000, 1, ..., 1, 0, 0],
       ['Maruti', 40000, 0, ..., 1, 0, 0],
       ['Maruti', 80000, 0, ..., 0, 0, 0]], dtype=object)

## OHE with top categories

In [28]:
# BRAND COLUMN


In [30]:
# Show the first five rows again before working on the 'brand' column.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [32]:
# VALUE COUNTS
# Number of rows per brand (value_counts sorts most frequent first by default).
counts = df['brand'].value_counts()

In [34]:
# Show how many distinct brands there are; printed explicitly because a bare
# expression that is not the last line of a cell is silently discarded.
print(df['brand'].nunique())

# Brands with at most this many cars will be pooled into a single category.
threshold = 100

In [42]:
# BRANDS HAVING AT MOST `threshold` CARS (the comparison is <=, so a brand with
# exactly `threshold` cars is also treated as uncommon)
repl = counts[counts <= threshold].index

In [43]:
# Collapse every rare brand into the single label 'uncommon', then one-hot
# encode the resulting column.
pd.get_dummies(
    df['brand'].replace(to_replace=repl, value='uncommon')
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
