In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the cars dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [4]:
# Preview the first five rows to check the columns loaded as expected.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [5]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [12]:
# Number of distinct car brands. nunique() skips NaN, but `brand` has no nulls
# (see df.info()), so the count is the same as unique().size without building
# the intermediate array of unique values.
df['brand'].nunique()

32

In [13]:
# Number of distinct fuel types; this is how many indicator columns a full
# one-hot encoding of `fuel` will produce.
df['fuel'].nunique()

4

# 1. One-hot encoding (OHE) with pandas `get_dummies`

In [15]:
# One-hot encode `fuel` and `owner`: one indicator column per category, with the
# remaining columns passed through unchanged. Returns a new frame; `df` is not modified.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# N-1 OHE: drop the first category of each feature (`drop_first=True`)

In [16]:
# Same encoding, but drop_first=True omits the first category of each encoded
# column, leaving N-1 indicator columns per feature.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 2. One-hot encoding (OHE) with scikit-learn `OneHotEncoder`

In [20]:
# Separate features from the target by column name rather than by position, so
# the selection does not silently shift if the CSV's column layout changes.
x = df.drop(columns='selling_price')  # features: brand, km_driven, fuel, owner
y = df['selling_price']               # target

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=404,
)

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [70]:
# drop='first' omits the first category of each feature (N-1 encoding).
ohe = OneHotEncoder(drop='first')

In [71]:
# Learn the category sets for `fuel` and `owner` from the training split only.
ohe.fit(x_train[['fuel', 'owner']])

In [72]:
# Encode the training split with the fitted encoder. The double brackets select
# a two-column DataFrame (2-D input) rather than a Series, and .toarray() turns
# the sparse result into a dense NumPy array.
# NOTE(review): despite the name, this holds encoded (not scaled) values.
x_train_scaled = (
    ohe.transform(x_train[['fuel', 'owner']])
    .toarray()
)

In [73]:
# Encode the test split with the encoder that was fitted on the training data.
# Use transform(), not fit_transform(): refitting here would relearn the
# categories from the test rows, so the columns could stop matching the
# training matrix.
x_test_scaled = ohe.transform(x_test[['fuel', 'owner']]).toarray()  # sparse matrix -> dense NumPy array

In [74]:
# Inspect the encoded test matrix.
x_test_scaled

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.]], shape=(813, 7))

In [75]:
# Inspect the encoded training matrix.
x_train_scaled

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], shape=(7315, 7))

In [76]:
# Frame holding the columns that are not one-hot encoded, taken from the full
# dataset (not the train split).
# NOTE(review): this includes the target `selling_price`, and in the cells shown
# here it is only used to build `new_arr` -- confirm whether it is still needed.
new_cols = pd.DataFrame(data=df, columns=['brand', 'km_driven', 'selling_price'])

In [77]:
# Note: new_cols.astype(np.array) is not valid here; astype expects a primitive dtype (int, float, str), so np.array(new_cols) is used in the next cell instead.

In [78]:
# Convert the frame to a NumPy array; the mixed string/integer columns give dtype=object.
new_arr = np.array(new_cols)

In [79]:
# Inspect the resulting object array.
new_arr

array([['Maruti', 145500, 450000],
       ['Skoda', 120000, 370000],
       ['Honda', 140000, 158000],
       ...,
       ['Maruti', 120000, 382000],
       ['Tata', 25000, 290000],
       ['Tata', 25000, 290000]], shape=(8128, 3), dtype=object)

In [80]:
# Place the untouched `brand` / `km_driven` training columns side by side with
# the one-hot encoded columns (column-wise stack).
full_scaled = np.hstack(
    [
        x_train[['brand', 'km_driven']].values,
        x_train_scaled,
    ]
)

In [81]:
# Wrap the stacked array in a DataFrame for display.
# NOTE(review): no column names are passed, so the columns are labelled 0..8,
# and the name `full_scaled` is rebound from an array to a DataFrame.
full_scaled = pd.DataFrame(full_scaled)

In [82]:
# Inspect the combined training features.
full_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Maruti,35000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,BMW,28156,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tata,90000,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Hyundai,83000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Maruti,43000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
7310,Maruti,40000,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7311,Ford,40000,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7312,Honda,30000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7313,Maruti,90000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
