In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the used-car listings and preview a few random rows.
df = pd.read_csv('./cars.csv')
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
7464,Renault,136000,Diesel,First Owner,600000
882,Chevrolet,70000,Diesel,Second Owner,200000
259,Hyundai,92500,Diesel,First Owner,530000
4777,Mahindra,11500,Diesel,First Owner,819999
7874,Honda,7032,Petrol,First Owner,779000


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(8128, 5)

In [4]:
# One-hot encoder that returns dense arrays (sparse_output=False) and
# drops the first category of every feature (drop='first').
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
)

In [5]:
# Learn the category levels of the fuel and owner columns
# (column positions 2 and 3 of df).
ohe.fit(df[['fuel', 'owner']])

In [6]:
# Names of the indicator columns produced by the fitted encoder, as a plain list.
arr = ohe.get_feature_names_out().tolist()
arr

['fuel_Diesel',
 'fuel_LPG',
 'fuel_Petrol',
 'owner_Fourth & Above Owner',
 'owner_Second Owner',
 'owner_Test Drive Car',
 'owner_Third Owner']

In [7]:
# Labels of the columns at positions 0, 1 and 4 (the ones that are not encoded).
[df.columns[pos] for pos in (0, 1, 4)]

['brand', 'km_driven', 'selling_price']

In [8]:
# Header for the combined frame: the three non-encoded column labels
# followed by the one-hot feature names.
cols = np.array([*df.columns[[0, 1, 4]], *arr])

In [9]:
# Combine the non-encoded columns with the one-hot encoded fuel/owner indicators.
# pd.concat is used instead of np.hstack: stacking strings, ints and floats into a
# single NumPy array forces dtype=object, so every column of the resulting frame
# (including km_driven, selling_price and the indicators) would lose its numeric dtype.
# Requires `ohe` to be already fitted on the fuel/owner columns.
encoded = pd.DataFrame(
    ohe.transform(df.iloc[:, 2:4]),
    columns=ohe.get_feature_names_out(),
    index=df.index,  # keep rows aligned with df for the concat below
)
df_new = pd.concat([df.iloc[:, [0, 1, 4]], encoded], axis=1)
df_new

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Skoda,120000,370000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Honda,140000,158000,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,Hyundai,127000,225000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Maruti,120000,130000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8124,Hyundai,119000,135000,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8125,Maruti,120000,382000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8126,Tata,25000,290000,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Original frame as loaded, before any encoding.
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [11]:
# pandas counterpart of the encoder above: expand fuel and owner into
# indicator columns, omitting the first level of each.
pd.get_dummies(
    df,
    drop_first=True,
    columns=['fuel', 'owner'],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [12]:
# One-hot encode only the most frequent brands; rare brands are grouped as 'uncommon'

In [13]:
# Number of rows per brand.
counts = df.loc[:, 'brand'].value_counts()

In [14]:
# Brands that appear in fewer than 100 rows.
least_freq = counts.loc[counts.lt(100)].index
least_freq

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [15]:
# Collapse every brand listed in least_freq into a single 'uncommon' level,
# then promote the Series to a one-column DataFrame (2-D input for the encoder).
# .to_frame() replaces the former reset_index().set_index('index') round trip:
# same column and same row labels, without the leftover 'index' axis name.
brand = df['brand'].replace(least_freq, 'uncommon').to_frame()
brand

Unnamed: 0_level_0,brand
index,Unnamed: 1_level_1
0,Maruti
1,Skoda
2,Honda
3,Hyundai
4,Maruti
...,...
8123,Hyundai
8124,Hyundai
8125,Maruti
8126,Tata


In [16]:
# One-hot encode the grouped brand column with its own encoder.
# Re-fitting the shared `ohe` here would overwrite its fuel/owner fit, so the
# earlier cell that calls ohe.transform on fuel/owner would fail if re-run.
brand_ohe = OneHotEncoder(sparse_output=False, drop='first')
brand = pd.DataFrame(
    brand_ohe.fit_transform(brand),
    columns=brand_ohe.get_feature_names_out(),
)
# NOTE: `brand` now holds the encoded frame; re-run the previous cell before
# re-running this one.
brand

Unnamed: 0,brand_Chevrolet,brand_Ford,brand_Honda,brand_Hyundai,brand_Mahindra,brand_Maruti,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_uncommon
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8124,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8125,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
# (rows, indicator columns) of the encoded brand frame.
brand.shape

(8128, 12)

In [18]:
# For comparison: one indicator column per brand, with no rare-brand grouping
# and no dropped level.
pd.get_dummies(data=df['brand'])

Unnamed: 0,Ambassador,Ashok,Audi,BMW,Chevrolet,Daewoo,Datsun,Fiat,Force,Ford,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8124,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
