In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('cars.csv')

In [5]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [8]:
df['brand'].nunique()

32

In [9]:
df['fuel'].value_counts

<bound method IndexOpsMixin.value_counts of 0       Diesel
1       Diesel
2       Petrol
3       Diesel
4       Petrol
         ...  
8123    Petrol
8124    Diesel
8125    Diesel
8126    Diesel
8127    Diesel
Name: fuel, Length: 8128, dtype: object>

# One Hot Encoding using Pandas

In [13]:
pd.get_dummies(df,columns=['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# Droping first column to solve colinearity problems

In [12]:
pd.get_dummies(df,columns=['fuel','owner'], drop_first= True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


# **OneHotEncoding using Sklearn**

In [15]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [16]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,-1], test_size=0.2, random_state=None)

In [17]:
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
1227,Mahindra,120000,Diesel,Second Owner
5877,Maruti,70000,Petrol,First Owner
134,Jeep,17000,Petrol,First Owner
268,Hyundai,2000,Petrol,First Owner
3176,Hyundai,39000,Diesel,First Owner


In [19]:
x_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
256,Maruti,75000,Diesel,First Owner
2303,Volkswagen,70000,Diesel,First Owner
4758,Toyota,27000,Diesel,First Owner
2560,Maruti,155000,Petrol,Second Owner
4775,Maruti,90000,Petrol,Second Owner


In [74]:
ohe = OneHotEncoder(drop = 'first', dtype=np.int32)

# Trasnforming and fiting in one go
**We used toarray beacuse by default it OneHotEncoder will generate a sparse matrix and to convert it back to a numpy array we toarray function**

In [75]:
x_train_new = ohe.fit_transform(x_train[['fuel','owner']]).toarray()

**To directly convert the tranformation into a numpy array do 'sparse_output = False' as it is set to TRUE by default**

change this in object class itself

In [71]:
ohe2 = OneHotEncoder(drop='first', sparse_output= False)

In [72]:
x_train_new2 = ohe2.fit_transform(x_train[['fuel','owner']])

In [73]:
x_train_new2

array([[1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

# With the first way to without using sparse_output

In [76]:
x_train_new.shape

(6502, 7)

In [77]:
x_test_new = ohe.fit_transform(x_test[['fuel','owner']]).toarray()

In [78]:
x_test_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

Now adding **x_train_new** to our original **x_train** array,

In [79]:
x_train_new.shape

(6502, 7)

**Original x_train**

In [80]:
x_train[['brand', 'km_driven']].values

array([['Mahindra', 120000],
       ['Maruti', 70000],
       ['Jeep', 17000],
       ...,
       ['Land', 64788],
       ['Mahindra', 180000],
       ['Maruti', 90000]], dtype=object)

# **Stacking x_train**
**Here np.hstack:-** stack numpy array together using tuples, that's why we use '(())' to pass the tuples
that need to be stacked together

In [101]:
new = np.hstack((x_train[['brand', 'km_driven']],x_train_new))

In [103]:
new = pd.DataFrame(new)
new

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Mahindra,120000,1,0,0,0,1,0,0
1,Maruti,70000,0,0,1,0,0,0,0
2,Jeep,17000,0,0,1,0,0,0,0
3,Hyundai,2000,0,0,1,0,0,0,0
4,Hyundai,39000,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6497,Maruti,10000,0,0,1,0,0,0,0
6498,Hyundai,61000,1,0,0,0,1,0,0
6499,Land,64788,1,0,0,0,0,0,0
6500,Mahindra,180000,1,0,0,0,0,0,0


# **OneHotEncoding with Top categories**

In [84]:
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

**Clubing least frequent cataegoies into others that is brands with cars less than 100**

In [85]:
counts = df['brand'].value_counts()

In [90]:
df['brand'].nunique()
threshold = 100

In [92]:
repl = counts[counts <= threshold].index

In [100]:
pd.get_dummies(df['brand'].replace(repl,'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5077,0,0,0,0,0,0,0,0,0,0,0,0,1
4987,0,0,0,0,0,0,1,0,0,0,0,0,0
4862,0,0,0,1,0,0,0,0,0,0,0,0,0
3361,0,0,0,0,0,0,0,1,0,0,0,0,0
5109,0,0,0,0,0,0,1,0,0,0,0,0,0
