In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the used-car listings from the local CSV file and preview the first
# five rows (head() defaults to 5).
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [3]:
# Too many distinct brands: one-hot encoding each of them would add too many extra columns.
df.brand.value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

Let's handle it by keeping the frequent brands and merging the uncommon brands into one category.

In [4]:
# Number of rows per brand; used below to decide which brands are rare.
counts = df.brand.value_counts()

In [5]:
# Row-count cutoff used to decide which brands are rare enough to be merged.
threshold = 100

# Number of distinct brands before any merging. Kept as the cell's last
# expression so the notebook displays it instead of discarding the value.
df['brand'].nunique()

In [9]:
# Brands whose row count is at or below the threshold (i.e. the rare ones).
brands = counts.loc[counts <= threshold].index

In [10]:
# Collapse every rare brand into the single placeholder category 'Uncommon'.
df['brand'] = df['brand'].replace(to_replace=brands, value='Uncommon')

In [11]:
# Re-check the brand distribution after merging the rare brands.
df.brand.value_counts()

brand
Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
Uncommon       538
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Name: count, dtype: int64

As the counts above show, the rare brands have been merged into a new category named `Uncommon`.

In [12]:
from sklearn.model_selection import train_test_split

# Features are the first four columns; the target is the last column.
# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    random_state=42,
    test_size=0.2,
)

In [14]:
# Preview the first ten rows of the training features.
X_train.iloc[:10]

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner
5080,Maruti,80000,Petrol,Third Owner
7718,Mahindra,15000,Petrol,First Owner
4526,Volkswagen,60000,Diesel,First Owner
7522,Toyota,50000,Diesel,First Owner
2210,Hyundai,40000,Petrol,First Owner


In [15]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Dense int32 one-hot encoder. drop='first' omits the first category of each
# feature to reduce multicollinearity among the generated columns.
ohe = OneHotEncoder(
    sparse_output=False,
    dtype=np.int32,
    drop='first',
)

In [59]:
# Fit the encoder on the training split only, then reuse the fitted encoder
# to transform the test split.
X_train_new = ohe.fit_transform(X_train.loc[:, ['brand', 'fuel', 'owner']])
X_test_new = ohe.transform(X_test.loc[:, ['brand', 'fuel', 'owner']])

In [60]:
# Display the encoded training array produced by ohe.fit_transform on the
# 'brand', 'fuel' and 'owner' columns.
X_train_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

##### The encoded array above contains only the three encoded columns; a `ColumnTransformer` is needed to apply the encoding to specific columns while keeping the others intact

In [62]:
# Show the categories the fitted encoder found for each input column
# (one array per column, in the order brand, fuel, owner).
ohe.categories_

[array(['BMW', 'Chevrolet', 'Ford', 'Honda', 'Hyundai', 'Mahindra',
        'Maruti', 'Renault', 'Skoda', 'Tata', 'Toyota', 'Uncommon',
        'Volkswagen'], dtype=object),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object),
 array(['First Owner', 'Fourth & Above Owner', 'Second Owner',
        'Test Drive Car', 'Third Owner'], dtype=object)]