In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the customer data from a CSV file in the working directory.
df = pd.read_csv("customer.csv")

In [3]:
# Keep only the columns used in the rest of the notebook. Selecting by name
# (instead of the positional `iloc[:, 2:]`) makes this cell idempotent: the
# positional slice would drop two more columns every time the cell is re-run.
df = df[['review', 'education', 'purchased']]
df.sample(5)

Unnamed: 0,review,education,purchased
29,Average,UG,Yes
1,Poor,UG,No
32,Average,UG,Yes
21,Average,PG,No
31,Poor,School,Yes


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Features and target are selected by
# name so the split does not depend on column position, and random_state is
# fixed so the split (and everything derived from it) is reproducible on
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df[['review', 'education']],
    df['purchased'],
    test_size=0.2,
    random_state=0,
)

In [5]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,review,education
46,Poor,PG
31,Poor,School
26,Poor,PG
49,Good,UG
29,Average,UG


In [6]:
# Preview the first rows of the training target.
y_train.head()

46     No
31    Yes
26     No
49     No
29    Yes
Name: purchased, dtype: object

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order per column, lowest to highest; the position of a
# value in its list becomes its encoded number.
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],
        ['School', 'UG', 'PG'],
    ]
)

# Learn the mapping from the training split only, then reuse it for the test split.
x_train_encoded = oe.fit_transform(x_train)
x_test_encoded = oe.transform(x_test)

In [8]:
# Display the ordinal-encoded training features (one row per sample, one
# column per encoded feature).
# NOTE(review): this prints the whole array; a slice such as
# x_train_encoded[:5] would keep the output short.
x_train_encoded

array([[0., 2.],
       [0., 0.],
       [0., 2.],
       [2., 1.],
       [1., 1.],
       [0., 0.],
       [2., 1.],
       [1., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 1.],
       [2., 2.],
       [0., 1.],
       [1., 2.],
       [1., 1.],
       [0., 2.],
       [1., 0.],
       [0., 0.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [0., 2.],
       [2., 2.],
       [1., 2.],
       [1., 1.],
       [0., 2.],
       [0., 0.],
       [2., 1.],
       [2., 2.],
       [0., 1.],
       [2., 0.],
       [1., 2.],
       [0., 0.],
       [1., 0.],
       [2., 0.],
       [2., 0.],
       [2., 0.],
       [2., 2.],
       [0., 2.]])

In [9]:
from sklearn.preprocessing import LabelEncoder

# Map the target labels to integers. The encoder is fitted on the training
# labels only and then applied unchanged to the test labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
y_train_encoded

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1])

In [10]:
# Load the cars data from a CSV file in the working directory and show
# five randomly chosen rows.
df_cars = pd.read_csv('cars.csv')
df_cars.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
7324,Audi,22000,Diesel,First Owner,2825000
157,Maruti,46100,Petrol,First Owner,599000
3028,Hyundai,23511,Petrol,First Owner,625000
1951,Hyundai,160000,Diesel,First Owner,375000
5676,Maruti,70000,Petrol,First Owner,100000


In [11]:
# Count the distinct values in the `brand` column.
df_cars['brand'].nunique()

32

**One-Hot Encoding using pandas**

In [13]:
# One-hot encode `fuel` and `owner` (one indicator column per category);
# the remaining columns are left as they are. Show five random rows.
pd.get_dummies(df_cars, columns=['fuel', 'owner']).sample(5)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
314,Nissan,80100,555000,False,True,False,False,False,False,True,False,False
1049,Renault,22000,250000,False,False,False,True,True,False,False,False,False
372,Jaguar,45000,3200000,False,True,False,False,True,False,False,False,False
3843,Maruti,69779,600000,False,False,False,True,True,False,False,False,False
3806,Hyundai,7000,567000,False,False,False,True,True,False,False,False,False


**K-1 One-Hot Encoding using pandas**

In [15]:
# Same encoding as above, but drop_first=True omits the first category of
# each encoded column, leaving k-1 indicator columns per feature.
pd.get_dummies(df_cars, columns=['fuel', 'owner'], drop_first = True).sample(5)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
7988,Maruti,32000,1025000,True,False,False,False,False,False,False
6606,Honda,25000,675000,False,False,True,False,True,False,False
1287,Chevrolet,75000,140000,False,False,True,False,False,False,False
6224,Maruti,80000,650000,True,False,False,False,False,False,False
3770,Skoda,125000,650000,True,False,False,False,True,False,False


In [16]:
from sklearn.model_selection import train_test_split

# 80/20 split of the cars data with a fixed seed: the four descriptive
# columns are the features, `selling_price` is the target.
x_train, x_test, y_train, y_test = train_test_split(
    df_cars[['brand', 'km_driven', 'fuel', 'owner']],
    df_cars['selling_price'],
    random_state=0,
    test_size=0.2,
)
x_test.shape

(1626, 4)

**One-Hot Encoding using Scikit-Learn**

In [37]:
from sklearn.preprocessing import OneHotEncoder

# Dense int32 indicator columns, with the first category of each feature
# dropped (k-1 encoding).
ohe = OneHotEncoder(dtype=np.int32, sparse_output=False, drop='first')

# Fit on the training split only, then apply the same mapping to both splits.
ohe.fit(x_train[['fuel', 'owner']])
x_train_new = ohe.transform(x_train[['fuel', 'owner']])
x_test_new = ohe.transform(x_test[['fuel', 'owner']])
x_train_new.shape

(6502, 7)

In [39]:
# Preview the first rows of the cars training features.
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner


In [57]:
# Join the columns that were not encoded (brand, km_driven) with the one-hot
# encoded fuel/owner columns, side by side.
np.concatenate((x_train[['brand', 'km_driven']].values, x_train_new), axis=1)

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

**One-Hot Encoding with Top Categories**

In [78]:
# Brands with fewer than `threshold` rows are collapsed into a single
# 'uncommon' bucket before one-hot encoding.
counts = df_cars['brand'].value_counts()
threshold = 100
rep = counts[counts.lt(threshold)].index

pd.get_dummies(
    df_cars['brand'].mask(df_cars['brand'].isin(rep), 'uncommon')
).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
4367,False,False,False,False,False,False,True,False,False,False,False,False,False
1992,False,False,False,False,False,False,True,False,False,False,False,False,False
4082,False,False,False,False,True,False,False,False,False,False,False,False,False
6184,False,False,False,False,False,False,False,False,False,False,False,False,True
600,False,False,False,False,False,False,False,False,False,False,True,False,False
