When the categorical column has no order (Nominal), we use "one hot encoding"

In [153]:
import numpy as np
import pandas as pd

In [154]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [155]:
df['brand'].nunique()

32

In [156]:
df['brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [157]:
df['fuel'].nunique()

4

In [158]:
df['fuel'].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [159]:
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


In [160]:
df['owner'].nunique()

5

In [161]:
pd.get_dummies(df, columns=['fuel','owner']) #multicollinearity

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [162]:
pd.get_dummies(df, columns=['fuel','owner'], drop_first=True) #solved

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False




> pd.get_dummies is good for initial exploration and quick one-hot encoding, while OneHotEncoder is preferred in machine learning pipelines, especially when deploying models, because it can handle new categories and maintain consistency.


In [163]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,-1], test_size=0.2, random_state=2)

In [164]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first', dtype=np.int32)

In [165]:
X_train_n = ohe.fit_transform(X_train[['fuel','owner']]).toarray() #no need for toarray() if sparse=False in ohe
X_test_n = ohe.transform(X_test[['fuel','owner']]).toarray()

In [166]:
np.hstack((X_train[['brand','km_driven']].values, X_train_n))

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

In [167]:
encoded_df = pd.DataFrame(X_train_n, columns=ohe.get_feature_names_out())
df = pd.concat([df, encoded_df], axis=1)
df.drop(['fuel','owner'], axis=1, inplace=True)

In [168]:
df.head()

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Skoda,120000,370000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Honda,140000,158000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Hyundai,127000,225000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Maruti,120000,130000,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
counts = df['brand'].value_counts()
df['brand'] = df['brand'].apply(lambda x: 'uncommon' if counts[x] < 100 else x)

In [174]:
pd.get_dummies(df['brand']).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
635,False,False,True,False,False,False,False,False,False,False,False,False,False
6231,False,False,False,False,False,False,True,False,False,False,False,False,False
6514,False,False,False,False,False,False,True,False,False,False,False,False,False
6963,False,False,False,False,False,False,False,False,False,False,False,False,True
4083,True,False,False,False,False,False,False,False,False,False,False,False,False
