In [None]:
## Practicing One-Hot Encoding (for nominal categorical data)

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [6]:
# Load the cars dataset from a CSV file given by a relative path.
df = pd.read_csv('cars.csv')

In [7]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [11]:
# Count how many rows each brand has.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [13]:
# Number of distinct brands; one-hot encoding 'brand' as-is would add one column per brand.
df['brand'].nunique()

32

In [17]:
# One-hot encode the nominal 'fuel' and 'owner' columns with pandas.get_dummies:
# every category of each column becomes its own indicator column.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [18]:
# Drop the first indicator column of each encoded feature. It is fully implied
# by the remaining indicators, so removing it avoids perfectly collinear
# columns (the "dummy variable trap").
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [19]:
# Split the data into train and test sets using scikit-learn

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
# Features are the first four columns (brand, km_driven, fuel, owner).
# The target is the single last column (selling_price): it must be selected
# with `-1`, not `:-1`, which would select the feature columns again.
# random_state pins the shuffle so the split is reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4], df.iloc[:, -1], test_size=0.2, random_state=42
)

In [27]:
# Inspect the first rows of the held-out feature set.
x_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
508,Tata,201850,Diesel,Second Owner
5124,Toyota,20000,Diesel,First Owner
7168,Tata,120000,Diesel,Second Owner
6496,Maruti,33000,Petrol,First Owner
6709,Tata,110000,Diesel,Second Owner


In [71]:
#Import One Hot Encoder from SKlearn
from sklearn.preprocessing import OneHotEncoder

In [70]:
# Create the one-hot encoder object.
ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32)
# drop='first': drop the first category of each feature so the indicator columns are not perfectly collinear
# sparse_output=False: return a dense NumPy array instead of a SciPy sparse matrix
# dtype=np.int32: emit integer 0/1 indicators; the encoder's default dtype is float64

In [64]:
# Learn the categories of 'fuel' and 'owner' from the training data and encode those columns.
x_train_new = ohe.fit_transform(x_train[['fuel','owner']])

In [65]:
# Encode the test set with the categories already learned from the training data.
# Only transform() belongs here: fit() would re-learn the categories from the
# test rows and return the encoder object itself instead of an encoded array.
x_test_new = ohe.transform(x_test[['fuel','owner']])

In [66]:
x_train_new.shape # (rows, encoded indicator columns) produced by the encoder

(6502, 7)

In [67]:
# Put the columns that were not encoded (brand, km_driven) back in front of
# the one-hot encoded columns, joining the two 2-D arrays column-wise.
np.concatenate([x_train[['brand', 'km_driven']].to_numpy(), x_train_new], axis=1)

array([['Jeep', 50000, 1, ..., 0, 0, 0],
       ['Volvo', 2000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Skoda', 99000, 1, ..., 0, 0, 0],
       ['Maruti', 175000, 1, ..., 0, 0, 0],
       ['Hyundai', 80000, 1, ..., 1, 0, 0]], dtype=object)

In [68]:
# Confirm the combined shape: the 2 passthrough columns plus the encoded columns.
np.concatenate([x_train[['brand', 'km_driven']].to_numpy(), x_train_new], axis=1).shape

(6502, 9)

In [72]:
# Next: one-hot encoding a high-cardinality column (one with many distinct categories)

In [75]:
# 'brand' has many distinct values, and many of them occur only a few times.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [76]:
counts = df['brand'].value_counts() # keep the per-brand row counts so they can be filtered by frequency

In [107]:
# Brands with fewer rows than this threshold are grouped into a single
# 'uncommon' category; brands with at least this many rows keep their own column.
threshold = 100

In [87]:
# Brands with fewer than `threshold` rows. The comparison is strict so that a
# brand with exactly `threshold` rows still keeps its own category.
repl = counts[counts < threshold].index

In [93]:
# Show the brands that will be grouped together.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [108]:
# One-hot encode 'brand' after collapsing every brand listed in `repl`
# into a single 'uncommon' category.
pd.get_dummies(
    df['brand'].mask(df['brand'].isin(repl), 'uncommon'),
    dtype=np.int32,
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


In [109]:
# Same brand encoding as above, but with the first indicator column dropped
# so the remaining columns are not perfectly collinear.
df_new = pd.get_dummies(
    df['brand'].mask(df['brand'].isin(repl), 'uncommon'),
    drop_first=True,
    dtype=np.int32,
)

In [110]:
# Spot-check a random handful of encoded rows.
df_new.sample(5)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2008,0,0,0,0,0,0,0,0,0,0,1,0
2173,0,0,0,0,0,0,0,0,1,0,0,0
2913,0,0,0,1,0,0,0,0,0,0,0,0
3235,0,0,1,0,0,0,0,0,0,0,0,0
502,0,0,0,0,0,1,0,0,0,0,0,0
