In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('customer.csv')
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


## Ordinal Encoding

In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.loc[:,['review', 'education']], df.iloc[:,-1], test_size=0.2)

In [10]:
X_train.head()

Unnamed: 0,review,education
17,Poor,UG
44,Average,UG
39,Poor,PG
6,Good,School
25,Good,School


In [11]:
# specify order
oe = OrdinalEncoder(
    categories=[['Poor','Average','Good'],['School','UG','PG']]
) 
# You have to mention the categories in the order of lower importance to higher otherwise encoder will automatically assign new categories as per they come into picture.

In [12]:
X_train = oe.fit_transform(X_train)
X_test = oe.transform(X_test) # Only transform because of data leakage problem

X_train[:5]

array([[0., 1.],
       [1., 1.],
       [0., 2.],
       [2., 0.],
       [2., 0.]])

In [17]:
# Attributes
print(oe.categories_)
print(oe.feature_names_in_)
print(oe.n_features_in_)

oe.get_feature_names_out()

[array(['Poor', 'Average', 'Good'], dtype=object), array(['School', 'UG', 'PG'], dtype=object)]
['review' 'education']
2


array(['review', 'education'], dtype=object)

In [20]:
# Inverse Encoding
oe.inverse_transform(np.array([[0,2]])) # The passing array must be in 2D

array([['Poor', 'PG']], dtype=object)

In [22]:
# Handling unknown value
oe = OrdinalEncoder(
    categories=[['Poor','Average','Good'],['School','UG','PG']],
    handle_unknown='use_encoded_value', # The value is provided with unknown_value parameter - This will be applied for all the columns that have been encoded through this single encoding object
    unknown_value=-1 # Encoding value
)
'''
The handle_unknown parameter determines how the encoder should handle categories that were not seen during the training phase (i.e., new categories that appear during transformation). There are two possible values:

- 'error' (default): Raises an error when an unknown category is encountered during transformation.

- 'use_encoded_value': When this option is used, you must also specify an unknown_value parameter. Any unknown categories encountered during transformation will be encoded with this specified value. In your code example, unknown categories will be encoded as -1.
'''

X_train, X_test, y_train, y_test = train_test_split(df.loc[:,['review', 'education']], df.iloc[:,-1], test_size=0.2)
X_train = oe.fit_transform(X_train)

# Create a DataFrame with proper column names to avoid UserWarning
test_data = pd.DataFrame([['Poor', 'college']], columns=['review', 'education'])
oe.transform(test_data)

array([[ 0., -1.]])

In [24]:
# handling infrequent/rare categories
X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +['snake'] * 3 + ['horse'] * 2], dtype=object).T
np.random.permutation(X)[:5, :]

array([['cat'],
       ['cat'],
       ['horse'],
       ['rabbit'],
       ['rabbit']], dtype=object)

In [27]:
pd.Series(X.ravel()).value_counts()

cat       20
rabbit    10
dog        5
snake      3
horse      2
Name: count, dtype: int64

In [32]:
# By using max_categories parameter
enc = OrdinalEncoder(max_categories=4).fit(X)
# The lowest n categories are considered as rare categories.

In [34]:
enc.infrequent_categories_

[array(['horse', 'snake'], dtype=object)]

In [37]:
enc.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1))

array([[0.],
       [2.],
       [3.],
       [1.],
       [3.]])

In [38]:
# By using min_frequency parameter
enc = OrdinalEncoder(min_frequency=4).fit(X) # By using frequency threshold
enc.infrequent_categories_

[array(['horse', 'snake'], dtype=object)]

In [39]:
enc.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1))

array([[0.],
       [2.],
       [3.],
       [1.],
       [3.]])

In [40]:
# handling missing data
data = [['Cat'], [np.nan], ['Dog'], ['Fish'], [np.nan]]
encoder = OrdinalEncoder(encoded_missing_value=-1)
encoded_data = encoder.fit_transform(data)

print(encoded_data)

[[ 0.]
 [-1.]
 [ 1.]
 [ 2.]
 [-1.]]


---

## Label Encoding

In [43]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:3], df.iloc[:,-1], test_size=0.2)

In [44]:
from sklearn.preprocessing import LabelEncoder

In [45]:
le = LabelEncoder() # There is no hyperparameters
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [46]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [48]:
le.inverse_transform(np.array([1,1,0]))

array(['Yes', 'Yes', 'No'], dtype=object)

---

## One Hot Encoding

In [49]:
cars = pd.read_csv('cars.csv')
cars.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [53]:
X = cars.iloc[:, [0, 2]]
y = cars.iloc[:, -1]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [54]:
X_train['fuel'].nunique()

4

In [55]:
from sklearn.preprocessing import OneHotEncoder

In [58]:
ohe = OneHotEncoder(sparse_output = False, dtype = np.int32) # All the columns get be encoded seperately by single object
# sparse_output = False - Directly returns the encoded matrix
ohe.fit_transform(X_train)

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], shape=(6502, 36), dtype=int32)

In [59]:
ohe.categories_

[array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
        'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
        'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
        'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
        'Peugeot', 'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen',
        'Volvo'], dtype=object),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]

In [60]:
ohe.feature_names_in_

array(['brand', 'fuel'], dtype=object)

In [61]:
ohe.n_features_in_

2

In [62]:
ohe.get_feature_names_out()

array(['brand_Ambassador', 'brand_Ashok', 'brand_Audi', 'brand_BMW',
       'brand_Chevrolet', 'brand_Daewoo', 'brand_Datsun', 'brand_Fiat',
       'brand_Force', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Isuzu', 'brand_Jaguar', 'brand_Jeep', 'brand_Kia',
       'brand_Land', 'brand_Lexus', 'brand_MG', 'brand_Mahindra',
       'brand_Maruti', 'brand_Mercedes-Benz', 'brand_Mitsubishi',
       'brand_Nissan', 'brand_Opel', 'brand_Peugeot', 'brand_Renault',
       'brand_Skoda', 'brand_Tata', 'brand_Toyota', 'brand_Volkswagen',
       'brand_Volvo', 'fuel_CNG', 'fuel_Diesel', 'fuel_LPG',
       'fuel_Petrol'], dtype=object)

In [67]:
ohe.inverse_transform(np.array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]).reshape(1,36))

array([['Audi', 'LPG']], dtype=object)

## Dummy Variable Trap
When you have a categorical variable with n categories, you only need n-1 binary columns to represent it completely. This is because:
- If you have n categories and create n binary columns
- `The last column can always be perfectly predicted from the other n-1 columns`
- This creates `perfect multicollinearity`, which can cause problems in statistical models

Why First Column?
- The first column is typically dropped by convention
- It could be any column, but dropping the first one is a common practice
- The dropped category becomes the "reference" or "baseline" category
- All other categories are interpreted relative to this baseline

In [68]:
# Dummy variable trap
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit_transform(X_train).shape

(6502, 34)

In [None]:
ohe.drop_idx_ # 0th category from 1st column and 0th category from nd column

array([0, 0], dtype=object)

In [69]:
# handling rare categories
X_train['brand'].value_counts()

brand
Maruti           1953
Hyundai          1127
Mahindra          635
Tata              586
Toyota            391
Honda             369
Ford              320
Chevrolet         185
Renault           183
Volkswagen        154
BMW                96
Skoda              82
Nissan             62
Jaguar             59
Volvo              54
Datsun             48
Mercedes-Benz      43
Fiat               35
Audi               30
Jeep               26
Lexus              22
Mitsubishi         13
Force               6
Land                5
Kia                 4
Daewoo              3
MG                  3
Ambassador          3
Isuzu               2
Ashok               1
Peugeot             1
Opel                1
Name: count, dtype: int64

In [70]:
cars['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [71]:
# using min frequency
ohe = OneHotEncoder(sparse_output=False, min_frequency=100)
ohe.fit_transform(X_train).shape

(6502, 14)

In [72]:
ohe.get_feature_names_out()

array(['brand_Chevrolet', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Mahindra', 'brand_Maruti', 'brand_Renault', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_Diesel', 'fuel_Petrol', 'fuel_infrequent_sklearn'],
      dtype=object)

In [73]:
# using max_categories
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', max_categories=15)
ohe.fit_transform(X_train).shape

(6502, 19)

In [74]:
ohe.get_feature_names_out()

array(['brand_BMW', 'brand_Chevrolet', 'brand_Ford', 'brand_Honda',
       'brand_Hyundai', 'brand_Jaguar', 'brand_Mahindra', 'brand_Maruti',
       'brand_Nissan', 'brand_Renault', 'brand_Skoda', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_CNG', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol'], dtype=object)

In [78]:
# Handling unknowk category
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # If the category is not present in trained data then all the digits will be encoded by 0
ohe.fit_transform(X_train)

ohe.transform(pd.DataFrame([['local','Petrol']], columns = ['brand', 'fuel']))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]])

In [79]:
ohe.inverse_transform(np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]).reshape(1,36))

array([[None, 'Petrol']], dtype=object)

---

## LabelBinarizer
This type of encoding is One Hot Encoding of target column.

In [80]:
from sklearn.preprocessing import LabelBinarizer

# Sample target variable for a multi-class classification problem
y = ['cat', 'dog', 'fish', 'dog', 'cat']

# Initialize the LabelBinarizer
lb = LabelBinarizer()

# Fit and transform the target variable
y_binarized = lb.fit_transform(y)

print("Binarized labels:\n", y_binarized)

# Inverse transform to recover original labels
y_original = lb.inverse_transform(y_binarized)

print("Original labels:\n", y_original)

Binarized labels:
 [[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]]
Original labels:
 ['cat' 'dog' 'fish' 'dog' 'cat']


In [82]:
'''
Parameters
These parameters control how the binary labels are encoded when you have a binary classification problem (only two classes). Here's what they do:
neg_label (default = 0):

This is the value assigned to the negative class (the first class encountered)
By default, it's set to 0
pos_label (default = 1):

This is the value assigned to the positive class (the second class encountered)
By default, it's set to 1
'''

# Binary classification example
y = ['cat', 'dog', 'cat', 'dog', 'cat']

# Default behavior (neg_label=0, pos_label=1)
lb_default = LabelBinarizer() # For 2 classes the first class will automatically get dropped
y_default = lb_default.fit_transform(y)
print("Default encoding:\n", y_default)
# Output: [[0], [1], [0], [1], [0]]

# Custom labels
lb_custom = LabelBinarizer(neg_label=-1, pos_label=2)
y_custom = lb_custom.fit_transform(y)
print("Custom encoding:\n", y_custom)
# Output: [[-1], [2], [-1], [2], [-1]]

Default encoding:
 [[0]
 [1]
 [0]
 [1]
 [0]]
Custom encoding:
 [[-1]
 [ 2]
 [-1]
 [ 2]
 [-1]]


- MultiLabelBinarizer is used for encoding multiple labels per instance
- It transforms multi-label data into a binary matrix format where each column represents a class
- Useful for scenarios where each sample can belong to multiple categories simultaneously

In [81]:
from sklearn.preprocessing import MultiLabelBinarizer

# Example multi-label data
y = [('red', 'blue'), ('blue', 'green'), ('green',), ('red',)]

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the data to binary matrix format
Y = mlb.fit_transform(y)

print("Binary matrix:\n", Y)
print("Class labels:", mlb.classes_)

# Inverse transform to recover original labels
y_inv = mlb.inverse_transform(Y)
print("Inverse transformed labels:", y_inv)

Binary matrix:
 [[1 0 1]
 [1 1 0]
 [0 1 0]
 [0 0 1]]
Class labels: ['blue' 'green' 'red']
Inverse transformed labels: [('blue', 'red'), ('blue', 'green'), ('green',), ('red',)]
