# Encoding Techniques

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the customer dataset for the ordinal-encoding demo and preview the first rows.
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day26-ordinal-encoding/customer.csv")
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [3]:
# Peek at 5 randomly drawn rows; no seed is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
23,96,Female,Good,School,No
14,15,Male,Poor,PG,Yes
26,53,Female,Poor,PG,No
20,57,Female,Average,School,Yes
47,38,Female,Good,PG,Yes


In [4]:
# Keep only the columns used in the encoding demo. Selecting by name (rather
# than slicing by position) makes the cell idempotent: re-running it no longer
# strips two more columns off an already-trimmed frame.
df = df[['review', 'education', 'purchased']]

In [5]:
# Confirm that only review, education and purchased remain.
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [6]:
from sklearn.preprocessing import OrdinalEncoder

In [7]:
from sklearn.model_selection import train_test_split

# Features are the two ordinal columns; the target is `purchased`.
# Columns are picked by name so the split does not depend on how many times
# the positional column-trimming cell has been executed, and random_state pins
# the 80/20 shuffle so the notebook gives the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'education']],
    df['purchased'],
    test_size=0.2,
    random_state=1,
)

In [8]:
# Inspect the training features before encoding.
X_train

Unnamed: 0,review,education
19,Poor,PG
42,Good,PG
26,Poor,PG
17,Poor,UG
32,Average,UG
49,Good,UG
37,Average,PG
8,Average,UG
12,Poor,School
1,Poor,UG


In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# One explicit category list per column, in increasing order. A value's
# position in its list becomes its code: Poor=0, Average=1, Good=2 for review
# and School=0, UG=1, PG=2 for education.
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [11]:
# Fit the encoder on the training features.
oe.fit(X_train)

OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [12]:
# Encode both splits with the encoder fitted on the training data.
# Both names are rebound here from DataFrames to float arrays.
X_train, X_test = oe.transform(X_train), oe.transform(X_test)

In [13]:
# Inspect the encoded training features (now a NumPy array).
X_train

array([[0., 2.],
       [2., 2.],
       [0., 2.],
       [0., 1.],
       [1., 1.],
       [2., 1.],
       [1., 2.],
       [1., 1.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [1., 1.],
       [0., 2.],
       [0., 0.],
       [1., 0.],
       [0., 2.],
       [2., 2.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [2., 1.],
       [2., 1.],
       [2., 0.],
       [0., 1.],
       [0., 2.],
       [2., 0.],
       [2., 2.],
       [1., 0.],
       [1., 0.],
       [1., 2.],
       [2., 0.],
       [0., 2.],
       [2., 1.],
       [0., 0.],
       [0., 1.],
       [2., 2.],
       [2., 0.],
       [2., 2.],
       [0., 2.],
       [1., 1.]])

In [15]:
# Category order per column as held by the fitted encoder.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# Encoder for the target column (fitted on y_train in the next cell).
le = LabelEncoder()

In [18]:
# Learn the target classes from the training labels.
le.fit(y_train)

LabelEncoder()

In [19]:
# Classes found during fit.
le.classes_

array(['No', 'Yes'], dtype=object)

In [20]:
# Map the string labels of both splits to the integer codes learned by `le`.
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [21]:
# Inspect the encoded training labels.
y_train

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0])

## One Hot Encoding | Handling Categorical Data

- One-hot encoding
- Dummy variable trap
- OHE using the most frequent categories

In [22]:
# Load the used-cars dataset for the one-hot-encoding demo (this rebinds `df`)
# and preview the first rows.
df =pd.read_csv("https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day27-one-hot-encoding/cars.csv")
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [23]:
# Frequency of each brand, most common first.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [24]:
# Number of distinct brands.
df['brand'].nunique()

32

In [25]:
# Frequency of each fuel type.
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [26]:
# Frequency of each owner category.
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [28]:
## OneHotEncoding using pandas

# One indicator column is created per category of `fuel` and `owner`;
# the remaining columns are left as they are.
pd.get_dummies(df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [29]:
# K-1 Encoding

# drop_first=True leaves out the first indicator of each encoded column
# (fuel_CNG and owner_First Owner here), so k categories give k-1 columns.
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first = True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


In [30]:
# OneHotEncoding with scikit-learn

from sklearn.model_selection import train_test_split

# The first four columns are the features, the last one (selling_price) is
# the target; the seed keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.2,
    random_state=1,
)

In [31]:
# Inspect the training features.
X_train

Unnamed: 0,brand,km_driven,fuel,owner
1939,Honda,100000,Diesel,Second Owner
5455,Maruti,120000,Diesel,Second Owner
7079,Renault,100000,Diesel,Second Owner
2000,Honda,80000,Petrol,Second Owner
1578,Honda,56494,Petrol,First Owner
...,...,...,...,...
7935,Maruti,100000,Petrol,Third Owner
5192,Hyundai,152000,Diesel,First Owner
3980,Hyundai,70000,Petrol,First Owner
235,Maruti,110000,Diesel,Second Owner


In [32]:
# Inspect the test features.
X_test

Unnamed: 0,brand,km_driven,fuel,owner
1392,Mahindra,80000,Diesel,First Owner
7778,Tata,45000,Diesel,First Owner
3727,Hyundai,60000,Diesel,First Owner
6630,Tata,15000,Petrol,First Owner
103,Maruti,100000,Petrol,Third Owner
...,...,...,...,...
3697,Maruti,120000,Diesel,First Owner
3001,Hyundai,110000,Diesel,Third Owner
94,Maruti,55500,Petrol,First Owner
4535,Maruti,15000,Diesel,First Owner


In [34]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
# drop='first' removes one indicator per feature, sparse=False returns a dense
# array and dtype=np.int32 makes the indicators integers.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- confirm against the installed version.
ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

In [51]:
# Fit on the categorical training columns and preview the encoded array.
ohe.fit_transform(X_train[['fuel','owner']])

array([[1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [52]:
# Learn the categories from the training split only.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])
# Reuse the fitted encoder on the test split (transform, not fit_transform).
# Refitting on the test data would rebuild the category list from the test
# rows, so a rare category missing from the test split would shift or remove
# columns and the two arrays would no longer line up.
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [57]:
# 7 columns expected: (4 fuel types - 1) + (5 owner types - 1) with drop='first'.
X_train_new.shape

(6502, 7)

In [54]:
# The two columns that were not one-hot encoded, as a NumPy array.
X_train[['brand', 'km_driven']].values

array([['Honda', 100000],
       ['Maruti', 120000],
       ['Renault', 100000],
       ...,
       ['Hyundai', 70000],
       ['Maruti', 110000],
       ['Maruti', 65755]], dtype=object)

In [55]:
# Put the untouched columns and the one-hot block side by side (column-wise).
np.concatenate((X_train[['brand', 'km_driven']].values, X_train_new), axis=1)

array([['Honda', 100000, 1, ..., 1, 0, 0],
       ['Maruti', 120000, 1, ..., 1, 0, 0],
       ['Renault', 100000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 70000, 0, ..., 0, 0, 0],
       ['Maruti', 110000, 1, ..., 1, 0, 0],
       ['Maruti', 65755, 0, ..., 0, 0, 0]], dtype=object)

In [56]:
# Shape check of the combined matrix: 2 original columns + 7 indicators.
np.concatenate((X_train[['brand', 'km_driven']].values, X_train_new), axis=1).shape

(6502, 9)

In [58]:
# OneHotEncoding with Top Categories
# Start from the brand frequencies: many brands occur only a handful of times.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [59]:
# Keep the per-brand counts so they can be compared against a threshold.
counts = df['brand'].value_counts()

In [60]:
# Brands that occur this many times or fewer are treated as rare.
# (A bare `df['brand'].nunique()` call used to precede this line; it was not
# the last expression of the cell, so its value was computed and discarded.)
threshold = 100

In [61]:
# Brands whose count is at or below the threshold.
counts[counts <= threshold].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [63]:
# Labels of the rare brands, to be merged into a single bucket.
repl = counts[counts <= threshold].index

In [64]:
# Collapse every rare brand into one 'uncommon' bucket, then one-hot encode.
pd.get_dummies(df['brand'].mask(df['brand'].isin(repl), 'uncommon'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


In [67]:
# Spot-check 20 random rows of the reduced one-hot encoding.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(20)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
306,0,0,0,0,0,0,1,0,0,0,0,0,0
2345,0,0,0,0,1,0,0,0,0,0,0,0,0
2112,0,0,0,0,0,0,1,0,0,0,0,0,0
3899,0,0,0,0,1,0,0,0,0,0,0,0,0
590,0,0,0,0,0,0,0,0,0,0,0,1,0
3503,0,0,0,0,1,0,0,0,0,0,0,0,0
6104,0,0,0,0,0,0,0,0,0,0,0,0,1
485,0,0,0,0,1,0,0,0,0,0,0,0,0
7318,0,1,0,0,0,0,0,0,0,0,0,0,0
1491,0,0,0,1,0,0,0,0,0,0,0,0,0


## Column Transformer in Machine Learning | How to use ColumnTransformer in Sklearn


In [68]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [69]:
# Load the toy COVID dataset for the ColumnTransformer demo (this rebinds `df`)
# and preview the first rows.
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day28-column-transformer/covid_toy.csv")
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [70]:
# Frequency of each cough category.
df['cough'].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [71]:
# Frequency of each city.
df['city'].value_counts()

Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: city, dtype: int64

In [72]:
# Missing values per column.
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [73]:
from sklearn.model_selection import train_test_split

# Everything except `has_covid` is a feature; `has_covid` is the target.
# random_state pins the 80/20 shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=1,
)

In [74]:
# Inspect the training features.
X_train

Unnamed: 0,age,gender,fever,cough,city
58,23,Male,98.0,Strong,Mumbai
2,42,Male,101.0,Mild,Delhi
87,47,Male,101.0,Strong,Bangalore
49,44,Male,104.0,Mild,Mumbai
26,19,Female,100.0,Mild,Kolkata
...,...,...,...,...,...
72,83,Female,101.0,Mild,Kolkata
53,83,Male,98.0,Mild,Delhi
50,19,Male,101.0,Mild,Delhi
25,23,Male,,Mild,Mumbai


In [79]:
# The manual way ("Aam Zindagi"): transform every column by hand.

# Impute the missing values of the fever column. The fill statistic is learned
# from the training split only.
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# Test data: reuse the fitted imputer (transform, not fit_transform) so test
# rows are filled with the training statistic instead of one recomputed from
# the test split.
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [81]:
# Ordinal encoding -> cough (Mild=0, Strong=1 by position in the category list)
oe = OrdinalEncoder(categories=[['Mild', 'Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# Test data: apply the encoder fitted on the training split rather than
# refitting it on the test rows.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [84]:
# OneHotEncoder -> gender, city

# drop='first' leaves out one indicator per feature; sparse=False gives a
# dense array.
ohe = OneHotEncoder(drop='first', sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender', 'city']])

# Test data: use transform so the columns follow the categories learned from
# the training split. Refitting on the small test split could yield a
# different number or order of columns if a city is missing from it.
X_test_gender_city = ohe.transform(X_test[['gender', 'city']])

X_train_gender_city.shape


(80, 4)

In [86]:
# X_train_gender_city

In [87]:
# Extracting Age: the only column left once the four transformed ones are set aside.
X_train_age = X_train[['age']].values

# Same selection for the test data.
X_test_age = X_test[['age']].values

X_train_age.shape

(80, 1)

In [88]:
# Stitch the per-column results back together column-wise, in the order
# age, fever, gender/city indicators, cough.
X_train_transformed = np.hstack((X_train_age, X_train_fever, X_train_gender_city, X_train_cough))
# Same column order for the test data.
X_test_transformed = np.hstack((X_test_age, X_test_fever, X_test_gender_city, X_test_cough))

X_train_transformed.shape

(80, 7)

### Mentos Zindagi (the easy way, using ColumnTransformer)

In [89]:
from sklearn.compose import ColumnTransformer

In [90]:
# A single ColumnTransformer replaces the manual per-column steps. Each entry
# is (name, transformer, columns); remainder='passthrough' keeps the columns
# not listed here (age) instead of dropping them.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- confirm against the installed version.
transformer = ColumnTransformer(transformers=[
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [91]:
# Fit all transformers on the training split and check the resulting shape.
transformer.fit_transform(X_train).shape


(80, 7)

In [92]:
# Apply the already-fitted transformers to the test split (no refitting).
transformer.transform(X_test).shape

(20, 7)

In [None]:
# Reference: https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/#:~:text=The%20ColumnTransformer%20is%20a%20class%20in%20the%20scikit-learn,sequence%20of%20transforms%20to%20just%20the%20categorical%20columns.

# Machine Learning Pipelines