In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('cars.csv')
df.sample(3)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
7360,Maruti,58559,Diesel,First Owner,693000
854,Maruti,43000,Diesel,First Owner,625000
313,Maruti,90000,Petrol,Third Owner,535000


In [3]:
df.shape

(8128, 5)

In [4]:
df.describe()

Unnamed: 0,km_driven,selling_price
count,8128.0,8128.0
mean,69819.51,638271.8
std,56550.55,806253.4
min,1.0,29999.0
25%,35000.0,254999.0
50%,60000.0,450000.0
75%,98000.0,675000.0
max,2360457.0,10000000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [6]:
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [7]:
df['brand'].nunique()    #no of brands

32

In [8]:
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [9]:
df['fuel'].nunique()

4

In [10]:
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

### One-Hot Encoding using pandas

We use pandas OHE technique which is <b style ='color:orange'>get_dummies()</b> method while data analysis. But for End to End ML project it is wise to use <b style = 'color:orange'>Sklearn's OneHotEncoder.</b>
<br>

- get_dummies() can’t handle the <b style = 'color:orange'>unknown category</b> during the transformation natively. You have to apply some techniques to handle it. But it is not efficient. On the other hand, OneHotEncoder will natively handle unknown categories. All you need to do is set the parameter <b style = 'color:orange'>handle_unknown='ignore'</b> to OneHotEncoder.

- Suppose you have a unknown kono category in your test set which is not in your training set. In this case, that category will be <b style = 'color:orange'>inconsistent</b> with train data and will eventually fail during the model building process.

- get_dummies() method doesn’t <b style = 'color:orange'>store</b> the information about train data categories. Hence it may result in inconsistencies with train and test data features. 

In [11]:
x = pd.get_dummies(df,columns=['fuel','owner'])   #x has 12 columns in total.
x

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### k-1 One-Hot Encoding

When we use OHE, we basically create <b style = 'color:orange'>dummy columns / features</b> and as a result the <b style = 'color:orange'>Multicolinearity</b> problem arises. And this known as <b style = 'color:red'>Dummy Variable Trap"</b>. To avoid this, we drop <b style = 'color:green'>one</b> dummy column for each nominal feature during One-Hot Encoding.

In [12]:
"""
if there are k categories then there will be k-1 columns for a certain feature/column.
"""
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)   #10 ta notun column hobe

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### One-Hot Encoding using sklearn 

- OneHotEncoder object <b style = 'color:orange'>stores</b> the information about categories from the training dataset. So, whenever it encounters any unknown categories during transformation on test set, it will <b style = 'color:orange'>ignore</b> them and the number of features will remain the same as the training data. 

 
- If you want to put your machine learning model into production, <b style = 'color:orange'>Scikit-learn Pipeline</b> will be very useful. But, get_dummies is not <b style = 'color:orange'>compatible</b> with the Scikit-learn pipeline. It requires you to create your own transformer. On the other hand, OneHotEncoder is compatible with the Scikit-learn pipeline. 

- <b style = 'color:orange'>handle_unknown</b> — this is an important parameter. If <b style = 'color:red'>‘raise’</b>, when an unknown categorical feature is present it will raise the error. If <b style = 'color:red'>‘ignore’</b>, a new feature will be created with all values of zero.

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:4],df.iloc[:,-1],test_size = 0.2,random_state = 2)

In [14]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


### Append After Doing Encoding Manually.

When you use OneHotEncoder(), you first transform the column and then you append it to the original data. But when we use <b style = 'color:orange'>columntransformer</b> the whole process can be done in a single step. But for learning purposes, we will see how to manually handle this type of appending after the transformation.

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
ohe = OneHotEncoder(drop='first')   #Dropping the first dummy column to avoid multicolinearity problem 

In [17]:
ohe.fit_transform(X_train[['fuel','owner']])

<6502x7 sparse matrix of type '<class 'numpy.float64'>'
	with 8746 stored elements in Compressed Sparse Row format>

In [18]:
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()
X_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [19]:
X_train_new.shape

(6502, 7)

In [20]:
X_test_new = ohe.transform(X_train[['fuel','owner']]).toarray()
X_test_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [21]:
X_train[['brand','km_driven']].values

array([['Hyundai', 35000],
       ['Jeep', 60000],
       ['Hyundai', 25000],
       ...,
       ['Tata', 15000],
       ['Maruti', 32500],
       ['Isuzu', 121000]], dtype=object)

In [22]:
"""
Same as hstack()
"""
#np.concatenate((X_train[['brand','km_driven']].values,X_train_new),axis = 1)
#np.concatenate((X_train[['brand','km_driven']].values,X_train_new),axis = 1).shape
#type(np.concatenate((X_train[['brand','km_driven']].values,X_train_new),axis = 1))

'\nSame as hstack()\n'

In [23]:
np.hstack((X_train[['brand','km_driven']].values,X_train_new))

array([['Hyundai', 35000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Jeep', 60000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 25000, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Tata', 15000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 32500, 1.0, ..., 1.0, 0.0, 0.0],
       ['Isuzu', 121000, 1.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [24]:
type(np.hstack((X_train[['brand','km_driven']].values,X_train_new)))

numpy.ndarray

In [25]:
np.hstack((X_train[['brand','km_driven']].values,X_train_new)).shape

(6502, 9)

In [26]:
"""
When you set sparse as False, then you don't need to convert the result into a numpy array manually.
"""
ohe = OneHotEncoder(sparse=False,drop = 'first',dtype=np.int32)  

In [27]:
ohe.fit_transform(X_train[['fuel','owner']])

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [28]:
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])
X_test_new = ohe.fit_transform(X_test[['fuel','owner']])
X_test_new

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [29]:
np.concatenate((X_train[['brand','km_driven']].values,X_train_new),axis = 1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

### One-Hot Encoding with top categories with Pandas 

In [30]:
counts = df['brand'].value_counts()
counts

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [31]:
df['brand'].nunique()

32

In [32]:
threshold = 100

"""
Storing all the names of the brands in a list that lies under threshold into a new variable.
"""

repl = counts[counts <= threshold].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [33]:
pd.get_dummies(df['brand'].replace(repl,'uncommon')).sample(10)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
4893,0,0,0,0,0,0,1,0,0,0,0,0,0
2541,0,0,0,0,0,1,0,0,0,0,0,0,0
994,0,0,0,0,0,1,0,0,0,0,0,0,0
5463,0,0,0,0,0,0,0,0,0,0,0,1,0
2815,0,0,0,0,0,1,0,0,0,0,0,0,0
912,0,0,0,0,0,0,0,0,0,1,0,0,0
5866,0,0,0,0,0,0,1,0,0,0,0,0,0
5430,0,0,0,0,0,1,0,0,0,0,0,0,0
5137,0,0,0,0,0,0,1,0,0,0,0,0,0
7196,0,0,0,0,1,0,0,0,0,0,0,0,0


In [34]:
df.iloc[[3240,4872],:]

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3240,Land,21000,Diesel,First Owner,4500000
4872,Fiat,153000,Diesel,Second Owner,150000


In [35]:
df['brand'].replace(repl,'uncommon').sample(15)

6318          Tata
6115    Volkswagen
7488       Hyundai
302         Maruti
5185        Maruti
3897        Maruti
927           Tata
3616          Tata
1339       Hyundai
6411    Volkswagen
3841        Maruti
653         Maruti
7729        Maruti
4225        Maruti
6047          Ford
Name: brand, dtype: object

In [60]:
threshold = 100
count1 = df['brand'].value_counts()
repl1 = count1[count1 <= threshold].index
df['brand'].replace(repl1,'uncommon').value_counts()

Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
uncommon       538
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Name: brand, dtype: int64

In [61]:
ohe1 = OneHotEncoder(sparse=False,drop = 'first')
X_train_new = ohe1.fit_transform(X_train[['brand','fuel']])
X_train_new.shape

(6502, 33)

In [64]:
print(X_train['brand'].nunique())
print(df['brand'].nunique())

31
32
