In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [111]:
import random

# Build a synthetic used-car dataset and persist it as cars.csv.
# A dedicated, seeded generator makes the data identical on every
# "Restart Kernel -> Run All" and leaves the global `random` state untouched.
SEED = 42
N_ROWS = 5000
rng = random.Random(SEED)

# Generating random data for the cars.csv file
brands = ['Toyota', 'Honda', 'TATA', 'Mahindra', 'Hyundai', 'Skoda','Volvo','Fiat','Jeep','Jaguar','Audi', 'Volkswagen', 'Kia', 'BMW', 'Mercedes']
km_driven = [rng.randint(5000, 200000) for _ in range(N_ROWS)]
fuel = rng.choices(['Diesel', 'Petrol','CNG','LPG'], k=N_ROWS)
# The 'Forth & Above Owner' label is kept byte-for-byte as in the original data.
owner = rng.choices(['First Owner', 'Second Owner', 'Third Owner','Forth & Above Owner'], k=N_ROWS)
selling_price = [rng.randint(100000, 500000) for _ in range(N_ROWS)]

# Creating a DataFrame (one row per car, N_ROWS rows in total)
cars_df = pd.DataFrame({
    'Brand': rng.choices(brands, k=N_ROWS),
    'Km_Driven': km_driven,
    'Fuel': fuel,
    'Owner': owner,
    'Selling_Price': selling_price
})

# Saving to a CSV file; index=False keeps the row index out of the file
file_path = 'cars.csv'
cars_df.to_csv(file_path, index=False)

file_path

'cars.csv'

In [112]:
# Load the generated dataset back from disk for the encoding walkthrough.
df = pd.read_csv('cars.csv')

In [113]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Brand,Km_Driven,Fuel,Owner,Selling_Price
0,Hyundai,31646,CNG,Third Owner,358505
1,Honda,41754,Diesel,Forth & Above Owner,104979
2,Hyundai,6316,LPG,Forth & Above Owner,158550
3,Skoda,135014,Petrol,First Owner,279857
4,Toyota,78334,Petrol,Forth & Above Owner,210982


In [114]:
# Number of rows for each distinct Brand value.
df['Brand'].value_counts()

Brand
Toyota        369
BMW           353
Kia           350
Mahindra      349
Honda         342
Skoda         337
Hyundai       336
Audi          335
Volkswagen    329
Fiat          328
Volvo         326
Jeep          322
Jaguar        309
Mercedes      309
TATA          306
Name: count, dtype: int64

In [115]:
# Number of rows for each distinct Fuel value.
df['Fuel'].value_counts()

Fuel
Diesel    1318
Petrol    1257
CNG       1234
LPG       1191
Name: count, dtype: int64

In [116]:
# Number of rows for each distinct Owner value.
df['Owner'].value_counts()

Owner
First Owner            1270
Forth & Above Owner    1261
Third Owner            1254
Second Owner           1215
Name: count, dtype: int64

In [117]:
# One-hot encode the Fuel and Owner columns with pandas. drop_first=True
# omits the first level of each encoded column.
df_OHE = pd.get_dummies(
    data=df,
    columns=['Fuel', 'Owner'],
    drop_first=True,
)

In [118]:
# Display the encoded frame: Fuel and Owner are replaced by indicator columns.
df_OHE

Unnamed: 0,Brand,Km_Driven,Selling_Price,Fuel_Diesel,Fuel_LPG,Fuel_Petrol,Owner_Forth & Above Owner,Owner_Second Owner,Owner_Third Owner
0,Hyundai,31646,358505,False,False,False,False,False,True
1,Honda,41754,104979,True,False,False,True,False,False
2,Hyundai,6316,158550,False,True,False,True,False,False
3,Skoda,135014,279857,False,False,True,False,False,False
4,Toyota,78334,210982,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...
4995,Toyota,80997,258047,False,False,False,False,False,False
4996,BMW,116011,372734,True,False,False,False,False,True
4997,Mahindra,41230,382648,False,False,True,False,False,False
4998,Volvo,56432,367610,False,False,True,False,False,False


In [119]:
from sklearn.model_selection import train_test_split

# Features are the first four columns of df, the target is its last column.
# 20% of the rows are held out for testing, with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [120]:
# Preview the training features produced by the split.
x_train.head()

Unnamed: 0,Brand,Km_Driven,Fuel,Owner
4227,Jeep,197591,LPG,Forth & Above Owner
4676,Honda,183546,CNG,Third Owner
800,Volkswagen,30015,LPG,Forth & Above Owner
3671,Volvo,99303,CNG,Second Owner
4193,Kia,97559,CNG,First Owner


In [121]:
from sklearn.preprocessing import OneHotEncoder

In [122]:
# Encoder configuration: drop the first category of each feature, return a
# dense array rather than a sparse matrix, and emit int32 indicator values.
ohe = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop='first',
)

In [123]:
# Fit the encoder on the training split's Fuel/Owner columns and encode them.
x_train_new = ohe.fit_transform(x_train.loc[:, ['Fuel', 'Owner']])

In [124]:
# Encode the test split with the already-fitted encoder (transform only, no refit).
x_test_new = ohe.transform(x_test.loc[:, ['Fuel', 'Owner']])

In [125]:
# Shape of the encoded test matrix (rows, indicator columns).
x_test_new.shape

(1000, 6)

In [126]:
# Place the untouched Brand/Km_Driven columns next to the encoded columns and
# inspect the resulting shape; the combined array itself is not stored.
np.hstack(
    [x_train[['Brand', 'Km_Driven']].to_numpy(), x_train_new]
).shape

(4000, 8)

In [127]:
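# One-hot encoding with top categories: brands whose frequency is at or below a threshold are grouped under a single 'uncommon' label before encoding.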

In [131]:
# Rows per brand; used below to select the low-frequency brands.
count = df['Brand'].value_counts()

In [132]:
# Brands occurring at most this many times are selected for replacement below.
# NOTE(review): in the saved value_counts output every brand has 300+ rows, so
# with threshold = 100 no brand is selected - confirm the intended cut-off.
threshold = 100

# Kept as the last statement so the notebook actually displays the number of
# distinct brands (previously the value was computed and silently discarded).
df['Brand'].nunique()

In [133]:
# Brand names whose frequency does not exceed the threshold.
repl = count.loc[count.le(threshold)].index

In [142]:
# Replace the selected low-frequency brands with 'uncommon', one-hot encode the
# result, and preview five rows. random_state pins the preview so the same
# rows are shown on every run.
pd.get_dummies(df['Brand'].replace(repl, 'uncommon')).sample(5, random_state=42)

Unnamed: 0,Audi,BMW,Fiat,Honda,Hyundai,Jaguar,Jeep,Kia,Mahindra,Mercedes,Skoda,TATA,Toyota,Volkswagen,Volvo
2860,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
1900,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
4457,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2910,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3933,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [143]:
# One-hot encode the raw Brand column (the 'uncommon' replacement from the
# previous cell is not applied here).
df_one_hott = pd.get_dummies(df['Brand'])

In [144]:
# Column-wise sum of the indicator frame, i.e. the number of rows per brand.
df_one_hott.sum()

Audi          335
BMW           353
Fiat          328
Honda         342
Hyundai       336
Jaguar        309
Jeep          322
Kia           350
Mahindra      349
Mercedes      309
Skoda         337
TATA          306
Toyota        369
Volkswagen    329
Volvo         326
dtype: int64