# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Upload dataset
### This dataset was taken from the CampusX GitHub repository (it is also used on the CampusX YouTube channel)

In [2]:
# Read the car dataset from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [5]:
# Number of rows per brand, most frequent brand first.
df.loc[:, 'brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [6]:
# Number of distinct brands in the dataset.
df.loc[:, 'brand'].nunique()

32

In [7]:
# Number of rows per ownership category, most frequent first.
df.loc[:, 'owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [9]:
# Number of rows per fuel type, most frequent first.
df.loc[:, 'fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

#### I am going to apply one-hot encoding to `fuel` and `owner`. For `brand`, I will keep the most frequent brands as their own categories and club all the less frequent brands into a single group. This reduces the number of brand categories (32 at the moment), which keeps the encoded data small and speeds up processing.

#### From the value counts above, we have 5 types of owners and 4 types of fuels. One-hot encoding is normally applied with the N-1 rule: each string category is converted into a column of 0s and 1s, and then one column per feature is removed in order to avoid multicollinearity (the removed category is implied when all of the remaining columns are 0).

# One Hot Encoding

#### To convert the strings into arrays we need dummy variables, so the code below creates dummies for two variables, `fuel` and `owner`. The dataset originally had 5 columns; it now has 12, because `fuel` has 4 categories and `owner` has 5. `fuel` and `owner` are split into 4 and 5 columns respectively (9 in total), and together with the 3 columns we already had (`brand`, `km_driven`, `selling_price`) that makes 12 columns.

In [13]:
# Expand `fuel` and `owner` into one 0/1 indicator column per category;
# every other column is passed through unchanged.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# N-1 One Hot Encoding
#### We have now converted the strings into arrays of 0s and 1s. Next we apply the N-1 concept to avoid redundancy and the multicollinearity problem.

In [14]:
# Same encoding as before, but drop_first=True removes the first indicator
# column of each encoded feature (the N-1 rule).
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


#### The N-1 rule is applied using `drop_first=True`. This removes the first dummy column of each encoded feature: `fuel_CNG` for fuel and `owner_First Owner` for owner.

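#### Above, the one-hot encoding was done using pandas, but in machine learning projects we generally don't use pandas for one-hot encoding, because `pd.get_dummies` cannot remember the categories it has seen: when it is run again on different data (for example a test set or new data in which some categories are missing or new ones appear) it may produce a different set of columns from the previous run. So, in practice we prefer the sklearn library, whose `OneHotEncoder` learns the categories once with `fit` and reuses them in `transform`.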

# One Hot Encoding using Sklearn

In [20]:
# Import the splitting helper, then separate the four predictor columns (X)
# from the target column (y).
from sklearn.model_selection import train_test_split
X = df.loc[:, ['brand', 'km_driven', 'fuel', 'owner']]
y = df.loc[:, 'selling_price']

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# scikit-learn's encoder is needed to perform one-hot encoding here.
from sklearn.preprocessing import OneHotEncoder
# drop='first'        -> drop the first category of every feature (N-1 rule).
# sparse_output=False -> return a dense NumPy array; with the default (True) the
#                        encoder returns a SciPy sparse matrix, which cannot be
#                        stacked directly with the other columns via np.hstack.
# dtype=int           -> store the indicators as integers instead of floats.
OHE = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=int,
)

In [46]:
# Learn the fuel/owner categories from the training split and encode it in one step.
X_train_OHE = OHE.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [47]:
# Encode the test split with the encoder that was already fitted on the training
# split. Only `transform` is used here: calling `fit_transform` again would
# re-learn the categories from the test data, so the dropped category and the
# number/order of columns could differ from the training matrix, and information
# from the test set would leak into the preprocessing step.
X_test_OHE = OHE.transform(X_test[['fuel','owner']])

In [48]:
# Seven encoded columns: 9 categories (4 fuels + 5 owners) minus the one
# category per feature removed by drop='first'.
np.shape(X_train_OHE)

(6502, 7)

In [49]:
# Dimensions of the encoded test split, for comparison with the training split.
np.shape(X_test_OHE)

(1626, 7)

In [50]:
# Put the untouched `brand` and `km_driven` columns side by side with the
# encoded fuel/owner columns.
np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_OHE],
    axis=1,
)

array([['Tata', 2560, 0, ..., 0, 0, 0],
       ['Honda', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 150000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], shape=(6502, 9), dtype=object)

# One Hot Encoding for Brands 
#### (`brand` has 32 categories, which need to be reduced by grouping the rare brands into one category before one-hot encoding)

In [51]:
# Row count for every brand; used to decide which brands are rare.
counts = df.loc[:, 'brand'].value_counts()

In [52]:
# Frequency cut-off for brands. 12 of the 32 brands have more than 100 rows and
# keep their own category; every brand with 100 rows or fewer is lumped into a
# single catch-all category, so the 32 categories are reduced to 13.
threshold = 100

In [53]:
# Labels of the brands whose row count does not exceed the threshold.
replace = counts.loc[counts.le(threshold)].index

In [55]:
# Relabel every rare brand as 'Others', one-hot encode the result and preview
# five rows. A fixed random_state makes the sampled preview identical on every
# run instead of changing each time the cell is executed.
pd.get_dummies(df['brand'].replace(replace, 'Others'), dtype=int).sample(5, random_state=42)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
1865,0,0,0,1,0,0,0,0,0,0,0,0,0
5442,0,0,1,0,0,0,0,0,0,0,0,0,0
1458,0,0,0,0,1,0,0,0,0,0,0,0,0
1593,0,0,0,0,0,1,0,0,0,0,0,0,0
2819,0,0,1,0,0,0,0,0,0,0,0,0,0
