In [58]:
# importing the libraries
import numpy as np
import pandas as pd

In [59]:
# Load the cars dataset from CSV into a DataFrame.
# NOTE(review): '/content/cars.csv' looks like a Colab-style absolute path — confirm
# the file is available at this location before running elsewhere.
dataset = pd.read_csv('/content/cars.csv')

In [60]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [61]:
dataset.sample(15)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6729,Maruti,56000,Diesel,Second Owner,425000
1125,Maruti,5621,Petrol,First Owner,650000
1366,Tata,19723,Petrol,First Owner,525000
7995,Maruti,80000,Petrol,Third Owner,170000
5045,Ford,50000,Diesel,First Owner,600000
7139,Honda,40000,Petrol,First Owner,750000
1020,Tata,40000,Diesel,First Owner,950000
5745,Chevrolet,90000,Petrol,Second Owner,125000
5443,Fiat,60000,Diesel,Second Owner,459999
1002,Maruti,57000,Diesel,Second Owner,950000


In [62]:
# Print a frequency table for every object-typed column of the dataset.
for col, col_dtype in dataset.dtypes.items():
    # Skip anything that is not stored as object dtype (e.g. the int64 columns).
    if col_dtype != 'object':
        continue
    print(f"Value counts for {col}:")
    print(dataset[col].value_counts())
    print("-" * 20)

Value counts for brand:
brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64
--------------------
Value counts for fuel:
fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64
--------------------
Value counts for owner:
owner
First Owner             5289
Second Owner            2

**1. OneHotEncoding using Pandas**

In [64]:
# One-hot encode 'fuel' and 'owner': every category becomes its own boolean
# column named '<column>_<category>'; the other columns are passed through unchanged.
pd.get_dummies(dataset, columns =['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


**2. OneHotEncoding ( k - 1 ) using Pandas**

In [65]:
# Same encoding, but drop_first=True omits the first category of each encoded
# column (here fuel_CNG and 'owner_First Owner'), leaving k - 1 dummies per feature.
pd.get_dummies(dataset,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


**3. OneHotEncoding using Sklearn**

In [66]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [68]:
# Separate the predictors from the target by column name rather than by
# position, so the selection does not silently change if the column order of
# the CSV changes. For the schema shown by dataset.info() this yields the same
# four feature columns and the same target as the previous positional slices.
X = dataset[['brand', 'km_driven', 'fuel', 'owner']]
y = dataset['selling_price']

In [67]:
# importing train_test_split
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)

In [70]:
X_train.shape

(6502, 4)

In [72]:
# Import scikit-learn's one-hot encoder and configure it.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(
    # Drop the first category of each feature (k - 1 output columns per feature).
    drop='first',
    # Return a dense NumPy array instead of a SciPy sparse matrix.
    sparse_output=False,
    # Emit 0/1 as 32-bit integers.
    dtype=np.int32,
    # Categories unseen during fit are encoded as all zeros instead of raising.
    # NOTE(review): combined with drop='first', an all-zero row also stands for
    # the dropped category — confirm this ambiguity is acceptable.
    handle_unknown='ignore'
)

In [73]:
ohe

In [74]:
# Learn the categories from the training split only and encode its
# 'fuel' and 'owner' columns.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])

In [75]:
X_train_new

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int32)

In [78]:
X_train.shape

(6502, 4)

In [76]:
# Encode the test split with the encoder already fitted on the training data
# (transform only, no refit), so both splits share the same output columns.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [77]:
X_test_new

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int32)

In [79]:
X_test_new.shape

(1626, 7)

In [80]:
# Place the untouched columns and the one-hot encoded block side by side.
# Mixing strings with numbers makes the combined array object-typed.
passthrough = X_train[['brand', 'km_driven']].to_numpy()
np.concatenate([passthrough, X_train_new], axis=1)

array([['Tata', 2560, 0, ..., 0, 0, 0],
       ['Honda', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 150000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

**4. OneHotEncoding with Top Categories**

In [83]:
# Number of rows per brand, used below to decide which brands are rare.
counts = dataset['brand'].value_counts()

In [84]:
# Brands that occur at most this many times are merged into one bucket below.
threshold = 100

# Number of distinct brands. Kept as the last expression of the cell so that
# Jupyter actually displays it; previously the value was computed and discarded.
dataset['brand'].nunique()

In [85]:
# Brand names whose frequency is at or below the threshold; these are the
# labels that get replaced by a single placeholder category.
repl = counts[counts <= threshold].index

In [86]:
# Fold every rare brand into the single label 'uncommon', one-hot encode the
# result, and preview five random rows.
brand_grouped = dataset['brand'].replace(repl, 'uncommon')
brand_dummies = pd.get_dummies(brand_grouped)
brand_dummies.sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
3378,False,False,False,False,False,False,False,False,False,False,False,False,True
5738,False,False,False,False,False,False,True,False,False,False,False,False,False
4681,False,False,False,False,False,True,False,False,False,False,False,False,False
6337,True,False,False,False,False,False,False,False,False,False,False,False,False
2760,False,False,False,False,False,False,True,False,False,False,False,False,False
