In [134]:
import numpy as np
import pandas as pd

In [135]:
df = pd.read_csv('cars.csv')

In [136]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [137]:
# total number of brands
df["brand"].nunique()

32

In [138]:
# total values in fuel
df["fuel"].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [139]:
# total values in owner
df["owner"].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# 1. OneHotEncoder Using Pandas

In [140]:
pd.get_dummies(df, columns=["fuel", "owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K - 1 One Hot Encoding

In [141]:
# to Solve the problem of multicolinearity, we can drop the first column of each dummy column. 
# for example, there were 4 fuel types. after dropping one, there is only 3 dummy fuel columns.

pd.get_dummies(df, columns=["fuel", "owner"], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. OneHotEncoding using Sklearn

### While creating machine learning projects, we generally don't use pandas for OneHotEncoding
### Instead we use sklearn's library called "OneHotEncoder"

In [142]:
""" So using OneHotEncoder from sklearn is bit of pain in the ass cuz well, first you'll
have to 
    1. separate the categorical column
    2. apply OneHotEncoding 
    3. Join the DataFrames together
"""

" So using OneHotEncoder from sklearn is bit of pain in the ass cuz well, first you'll\nhave to \n    1. separate the categorical column\n    2. apply OneHotEncoding \n    3. Join the DataFrames together\n"

In [143]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [144]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:4],df.iloc[:,-1],test_size=0.2,random_state=2)

In [145]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [146]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first',sparse_output=False)

In [147]:
# categorical columns
categorical_columns = ['fuel', 'owner']

# separate two of the categorical columns and transform it.
# returns numpy array
X_train_new = ohe.fit_transform(X_train[categorical_columns])

# Getting the new column names
encoded_col_names = ohe.get_feature_names_out(categorical_columns)

In [148]:
encoded_col_names

array(['fuel_Diesel', 'fuel_LPG', 'fuel_Petrol',
       'owner_Fourth & Above Owner', 'owner_Second Owner',
       'owner_Test Drive Car', 'owner_Third Owner'], dtype=object)

In [149]:
# convert X_train_new into a dataframe
X_train_new = pd.DataFrame(X_train_new, columns=encoded_col_names)
X_train_new.head()

Unnamed: 0,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
# joining with the original dataFrame
# 1. drop the already existing "fuel" and "owner" column.
# 2. join the new encoded dataframe "X_train_new"

df = df.drop(columns=["fuel", "owner"]).join(X_train_new)

In [151]:
df.head()

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Skoda,120000,370000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Honda,140000,158000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Hyundai,127000,225000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Maruti,120000,130000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
