In [119]:
import numpy as np
import pandas as pd

In [120]:
df = pd.read_csv('cars.csv')

In [121]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


# One Hot Encoding using Pandas

In [122]:
pd.get_dummies(df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# K-1 OneHotEncoding  (removing dummy column to remove Multicollinearity)

In [123]:
# Here notice that the first column i.e [fuel_CNG] and [owner_First Owner] of encoded variables i.e 'fuel', 'owner'  gets removed to avoid multicollinearity
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first = True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [124]:
# But For meachine learning projects OneHotEncoding with pandas is not a good method
# SO, we will use scikit-learn for OneHotEncoding

# OneHotEncoding using scikit-learn

In [125]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


## Train_test Split

In [126]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,-1], test_size=0.2, random_state=0)

In [127]:
X_train

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner
...,...,...,...,...
4931,Tata,70000,Diesel,Third Owner
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner


In [128]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output = False, dtype=int)
# drop='first'   Drops the first category of each categorical feature to prevent Multicollinearity
# sparse_output=False   Returns a dense NumPy array instead of a sparse matrix.
# dtype=int     Output data type (int instead of float)

# Since, we are applying one hot encoding to only 'Fuel' and 'owner' column so
ohe.fit(X_train[['fuel', 'owner']])

X_train_new = ohe.transform(X_train[['fuel', 'owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [129]:
# Extracting column names for later use
encoded_cols = ohe.get_feature_names_out()

In [130]:
encoded_cols

array(['fuel_Diesel', 'fuel_LPG', 'fuel_Petrol',
       'owner_Fourth & Above Owner', 'owner_Second Owner',
       'owner_Test Drive Car', 'owner_Third Owner'], dtype=object)

In [131]:
X_train_new

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], shape=(6502, 7))

In [132]:
# Converting this array into dataframe with proper column names
X_train_new_df = pd.DataFrame(X_train_new, columns=[encoded_cols])

In [133]:
X_train_new_df

Unnamed: 0,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,1
2,1,0,0,0,1,0,0
3,1,0,0,0,1,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
6497,1,0,0,0,0,0,1
6498,1,0,0,0,1,0,0
6499,0,0,1,0,1,0,0
6500,1,0,0,0,0,0,0


In [134]:
# Now append these with the brand and km_driven  column

In [135]:
X_train[['brand', 'km_driven']].reset_index(drop = True)

Unnamed: 0,brand,km_driven
0,Hyundai,60000
1,Tata,150000
2,Hyundai,110000
3,Mahindra,28000
4,Maruti,15000
...,...,...
6497,Tata,70000
6498,Ford,100000
6499,Hyundai,90000
6500,Volkswagen,90000


In [136]:
final_df = pd.concat([X_train[['brand', 'km_driven']].reset_index(drop = True), X_train_new_df], axis=1)

In [137]:
final_df

Unnamed: 0,brand,km_driven,"(fuel_Diesel,)","(fuel_LPG,)","(fuel_Petrol,)","(owner_Fourth & Above Owner,)","(owner_Second Owner,)","(owner_Test Drive Car,)","(owner_Third Owner,)"
0,Hyundai,60000,0,1,0,0,0,0,0
1,Tata,150000,1,0,0,0,0,0,1
2,Hyundai,110000,1,0,0,0,1,0,0
3,Mahindra,28000,1,0,0,0,1,0,0
4,Maruti,15000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6497,Tata,70000,1,0,0,0,0,0,1
6498,Ford,100000,1,0,0,0,1,0,0
6499,Hyundai,90000,0,0,1,0,1,0,0
6500,Volkswagen,90000,1,0,0,0,0,0,0


# OneHotEncoding for categorical data with many categories

In [138]:
# Now lets perform nominal encoding on brands column

In [139]:
#The core problem with Brands column is ---
# Your brand column probably has:
# 15-20+ unique brands

# One-Hot Encoding all of them would:
# Create too many columns
# Increase model complexity
# Risk overfitting
# Slow training

# So the solution to this is 
# Keep only important / frequent brands, group the rest as "Other"

In [140]:
# These are top 5 brands.. So we will only keep these top brands and replace other brands as 'Other'
df['brand'].value_counts().head(5)

brand
Maruti      2448
Hyundai     1415
Mahindra     772
Tata         734
Toyota       488
Name: count, dtype: int64

In [141]:
top_brands = df['brand'].value_counts().head(5).index

In [142]:
df['brands_reduced'] = df['brand'].apply(lambda x: x if x in top_brands else 'Other')

In [143]:
df['brands_reduced'].value_counts()

brands_reduced
Maruti      2448
Other       2271
Hyundai     1415
Mahindra     772
Tata         734
Toyota       488
Name: count, dtype: int64

In [144]:
from sklearn.preprocessing import OneHotEncoder

In [145]:
ohe_brands = OneHotEncoder(drop='first', sparse_output=False, dtype=int)

In [146]:
brands_encoded = ohe_brands.fit_transform(df[['brands_reduced']])

In [147]:
brands_encoded

array([[0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]], shape=(8128, 5))

In [148]:
# Extracting column names for later use
brands_cols = ohe_brands.get_feature_names_out()

In [149]:
brands_cols

array(['brands_reduced_Mahindra', 'brands_reduced_Maruti',
       'brands_reduced_Other', 'brands_reduced_Tata',
       'brands_reduced_Toyota'], dtype=object)

In [150]:
brand_dataset = pd.DataFrame(brands_encoded, columns=['brand_Mahindra', 'brand_Maruti', 'Other_brands', 'brand_Tata', 'brand_Toyata'])

In [151]:
brand_dataset

Unnamed: 0,brand_Mahindra,brand_Maruti,Other_brands,brand_Tata,brand_Toyata
0,0,1,0,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
8123,0,0,0,0,0
8124,0,0,0,0,0
8125,0,1,0,0,0
8126,0,0,0,1,0
