In [71]:
import pandas as pd

data = {
    "age": [22, 25, 47, 52, 46, 56, 23, 34, 42, 29],
    
    "gender": [
        "Male", "Female", "Female", "Male", "Female",
        "Male", "Female", "Male", "Female", "Male"
    ],
    
    "review": [
        "Poor", "Good", "Excellent", "Fair", "Good",
        "Excellent", "Poor", "Fair", "Good", "Excellent"
    ],
    
    "education": [
        "High School", "Bachelor", "Master", "PhD", "Bachelor",
        "Master", "High School", "Bachelor", "PhD", "Master"
    ],
    
    "purchase": [
        "No", "Yes", "Yes", "No", "Yes",
        "Yes", "No", "No", "Yes", "Yes"
    ]
}

df = pd.DataFrame(data)

print(df)


   age  gender     review    education purchase
0   22    Male       Poor  High School       No
1   25  Female       Good     Bachelor      Yes
2   47  Female  Excellent       Master      Yes
3   52    Male       Fair          PhD       No
4   46  Female       Good     Bachelor      Yes
5   56    Male  Excellent       Master      Yes
6   23  Female       Poor  High School       No
7   34    Male       Fair     Bachelor       No
8   42  Female       Good          PhD      Yes
9   29    Male  Excellent       Master      Yes


In [72]:
df = df.iloc[:,2:]

In [73]:
df.head()

Unnamed: 0,review,education,purchase
0,Poor,High School,No
1,Good,Bachelor,Yes
2,Excellent,Master,Yes
3,Fair,PhD,No
4,Good,Bachelor,Yes


# 1Ô∏è‚É£ Ordinal Encoding

## Use it when:

üëâ The categorical feature has a natural order or ranking

### Example:

Education ‚Üí High School < Bachelor < Master < PhD
Review ‚Üí Poor < Fair < Good < Excellent
Size ‚Üí Small < Medium < Large
#### Here, the order matters.

You assign numbers based on ranking:
```sh
Poor ‚Üí 0
Fair ‚Üí 1
Good ‚Üí 2
Excellent ‚Üí 3
```

# 2Ô∏è‚É£ Label Encoding

## Use it when:

üëâ The column is the target variable (y)
OR
üëâ It‚Äôs a binary category (Yes/No)

### Example:

Purchase ‚Üí Yes / No
Spam ‚Üí Spam / Not Spam
Pass ‚Üí Fail

```sh
No ‚Üí 0
Yes ‚Üí 1
```
There is no ranking ‚Äî just classification labels.

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:, 0:2], df.iloc[:, -1], test_size=0.2)

In [76]:
from sklearn.preprocessing import OrdinalEncoder

In [77]:
oe = OrdinalEncoder(categories=[
    ['Poor', 'Fair', 'Good', 'Excellent'],
    ['High School', 'Bachelor', 'Master', 'PhD']
])


# Categories defines the order in which the values going to be ranked..
In above, the ranking would be like:
```sh
Poor ‚Üí 0
Fair ‚Üí 1
Good ‚Üí 2
Excellent ‚Üí 3
```

Similar for education feature.

In [78]:
oe.fit(X_train)

In [79]:
X_train = oe.transform(X_train)
X_test = oe.transform(X_test)

In [82]:
X_train

array([[3., 2.],
       [2., 1.],
       [1., 3.],
       [2., 3.],
       [2., 1.],
       [3., 2.],
       [3., 2.],
       [1., 1.]])

In [83]:
print(oe.categories_)

[array(['Poor', 'Fair', 'Good', 'Excellent'], dtype=object), array(['High School', 'Bachelor', 'Master', 'PhD'], dtype=object)]


In [84]:
from sklearn.preprocessing import LabelEncoder

In [85]:
le = LabelEncoder()

In [86]:
le.fit(Y_train)

In [88]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [89]:
Y_train = le.transform(Y_train)

In [90]:
Y_test = le.transform(Y_test)

In [92]:
Y_test

array([0, 0])

# One Hot Encoding 
Used for nominal data.

In [151]:
import pandas as pd

In [152]:
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')

In [153]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [154]:
df['name'].value_counts()

name
Maruti Swift Dzire VDI                     69
Maruti Alto 800 LXI                        59
Maruti Alto LXi                            47
Maruti Alto LX                             35
Hyundai EON Era Plus                       35
                                           ..
Hyundai Verna Transform CRDi VGT SX ABS     1
Maruti S-Presso VXI Plus                    1
Toyota Etios Liva 1.2 VX                    1
Toyota Yaris G                              1
Hyundai i20 Magna 1.4 CRDi                  1
Name: count, Length: 1491, dtype: int64

In [155]:
df['name'].nunique()

1491

In [156]:
df['fuel'].nunique()

5

In [157]:
df['fuel'].value_counts()

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [158]:
df['seller_type'].nunique

<bound method IndexOpsMixin.nunique of 0       Individual
1       Individual
2       Individual
3       Individual
4       Individual
           ...    
4335    Individual
4336    Individual
4337    Individual
4338    Individual
4339    Individual
Name: seller_type, Length: 4340, dtype: object>

In [159]:
df['owner'].nunique()

5

In [160]:
df['owner'].value_counts()

owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

# 1. One Hot Encoding using panda

In [161]:
pd.get_dummies(df, columns=['fuel', 'owner']).head()

Unnamed: 0,name,year,selling_price,km_driven,seller_type,transmission,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC,2007,60000,70000,Individual,Manual,False,False,False,False,True,True,False,False,False,False
1,Maruti Wagon R LXI Minor,2007,135000,50000,Individual,Manual,False,False,False,False,True,True,False,False,False,False
2,Hyundai Verna 1.6 SX,2012,600000,100000,Individual,Manual,False,True,False,False,False,True,False,False,False,False
3,Datsun RediGO T Option,2017,250000,46000,Individual,Manual,False,False,False,False,True,True,False,False,False,False
4,Honda Amaze VX i-DTEC,2014,450000,141000,Individual,Manual,False,True,False,False,False,False,False,True,False,False


# 2. K-1 One Hot Encoding
Dropping the first colums of the encodded columns for each feature.

In [162]:
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True).head()

Unnamed: 0,name,year,selling_price,km_driven,seller_type,transmission,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC,2007,60000,70000,Individual,Manual,False,False,False,True,False,False,False,False
1,Maruti Wagon R LXI Minor,2007,135000,50000,Individual,Manual,False,False,False,True,False,False,False,False
2,Hyundai Verna 1.6 SX,2012,600000,100000,Individual,Manual,True,False,False,False,False,False,False,False
3,Datsun RediGO T Option,2017,250000,46000,Individual,Manual,False,False,False,True,False,False,False,False
4,Honda Amaze VX i-DTEC,2014,450000,141000,Individual,Manual,True,False,False,False,False,True,False,False


# 3. One Hot Encoding using sklearn

In [163]:
df = df[['name', 'km_driven', 'fuel', 'owner']]

In [164]:
df.head()

Unnamed: 0,name,km_driven,fuel,owner
0,Maruti 800 AC,70000,Petrol,First Owner
1,Maruti Wagon R LXI Minor,50000,Petrol,First Owner
2,Hyundai Verna 1.6 SX,100000,Diesel,First Owner
3,Datsun RediGO T Option,46000,Petrol,First Owner
4,Honda Amaze VX i-DTEC,141000,Diesel,Second Owner


In [165]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [166]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:,0:4], df.iloc[:, -1], test_size=0.2)

In [192]:
ohe = OneHotEncoder(drop='first', sparse_output=False)
# sparse=False makes OneHotEncoder return a normal dense NumPy array instead of a sparse matrix to save memory
# if sparse is not set to False, then we have to use toarray() method in transformed data.

In [193]:
ohe.fit(df[['fuel', 'owner']])

In [194]:
X_train_encoded = ohe.transform(X_train[['fuel', 'owner']])

In [195]:
X_train_encoded

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [196]:
X_test_encoded = ohe.transform(X_test[['fuel', 'owner']]) 

In [197]:
X_test_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [198]:
X_test_encoded.shape

(868, 8)

In [214]:
import numpy as np

In [207]:
df2 = np.hstack((X_train[['name', 'km_driven']].values, X_train_encoded))

In [219]:
new_df = pd.DataFrame(df2)

In [220]:
new_df.head() # Here the column name got lost while performing np.hstack()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Chevrolet Spark 1.0 LT BS3,60000,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,Maruti Alto STD,50000,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,Mahindra KUV 100 G80 K2,5000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Hyundai i10 Magna 1.1,90000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,Maruti Alto LXI,5000,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


# 4. One Hot Encoding with top categories


In [221]:
df.head()

Unnamed: 0,name,km_driven,fuel,owner
0,Maruti 800 AC,70000,Petrol,First Owner
1,Maruti Wagon R LXI Minor,50000,Petrol,First Owner
2,Hyundai Verna 1.6 SX,100000,Diesel,First Owner
3,Datsun RediGO T Option,46000,Petrol,First Owner
4,Honda Amaze VX i-DTEC,141000,Diesel,Second Owner


In [226]:
counts = df['name'].value_counts()

In [228]:
counts

name
Maruti Swift Dzire VDI                     69
Maruti Alto 800 LXI                        59
Maruti Alto LXi                            47
Maruti Alto LX                             35
Hyundai EON Era Plus                       35
                                           ..
Hyundai Verna Transform CRDi VGT SX ABS     1
Maruti S-Presso VXI Plus                    1
Toyota Etios Liva 1.2 VX                    1
Toyota Yaris G                              1
Hyundai i20 Magna 1.4 CRDi                  1
Name: count, Length: 1491, dtype: int64

In [227]:
df['name'].nunique()

1491

In [267]:
threshold=20

In [268]:
rare_names = counts[counts <= threshold].index

In [272]:
rare_names

Index(['Hyundai Creta 1.6 CRDi SX', 'Renault Duster 85PS Diesel RxL',
       'Renault KWID 1.0 RXT Optional', 'Chevrolet Beat Diesel LT',
       'Maruti SX4 Vxi BSIV', 'Chevrolet Beat Diesel LS',
       'Hyundai Verna 1.6 SX', 'Chevrolet Spark 1.0 LS',
       'Tata Indica GLS BS IV', 'Hyundai Verna 1.6 SX CRDi (O)',
       ...
       'Ford Figo Petrol LXI', 'Mahindra XUV500 W6 1.99 mHawk',
       'Maruti Alto K10 LX', 'Maruti Wagon R VX', 'Mahindra NuvoSport N8',
       'Hyundai Verna Transform CRDi VGT SX ABS', 'Maruti S-Presso VXI Plus',
       'Toyota Etios Liva 1.2 VX', 'Toyota Yaris G',
       'Hyundai i20 Magna 1.4 CRDi'],
      dtype='object', name='name', length=1473)

In [274]:
pd.get_dummies(df['name'].replace(rare_names, 'uncommon'), dtype=int)

Unnamed: 0,Hyundai EON Era Plus,Hyundai EON Magna Plus,Hyundai Santro Xing GLS,Hyundai i10 Magna,Mahindra XUV500 W8 2WD,Maruti 800 AC,Maruti Alto 800 LXI,Maruti Alto K10 VXI,Maruti Alto LX,Maruti Alto LXi,Maruti Ritz VDi,Maruti Swift Dzire VDI,Maruti Swift VDI,Maruti Swift VDI BSIV,Maruti Wagon R LXI,Maruti Wagon R LXI Minor,Maruti Wagon R VXI BS IV,Renault KWID RXT,uncommon
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4336,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4337,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# üìå What is ColumnTransformer?
#### ColumnTransformer is a sklearn tool used to apply different preprocessing steps to different columns of a dataset at the same time.

For example: You can apply SimpleImputer, OneHotEncoder, and OrindalEncoder at the same time.

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('covid_toy.csv')

In [20]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,52.0,Female,Yes,Yes,Bangalore,Yes
1,15.0,Male,No,No,Chennai,No
2,72.0,Male,Yes,Yes,Kolkata,No
3,61.0,Male,Yes,No,Delhi,Yes
4,21.0,Male,Yes,Yes,Kolkata,Yes


In [14]:
df.nunique()

age          53
gender        2
fever         2
cough         2
city          5
has_covid     2
dtype: int64

In [19]:
df['city'].value_counts()

city
Kolkata      26
Chennai      25
Mumbai       20
Delhi        16
Bangalore    13
Name: count, dtype: int64

In [24]:
ct = ColumnTransformer(
    transformers=[
        ('filling_missing', SimpleImputer(), ['age']),
        ('categorical', OneHotEncoder(), ['gender', 'city'])
    ],
    remainder='drop'
)

In [40]:

X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:, 0:5], df.iloc[:, -1], test_size=0.2)

In [42]:
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
31,80.0,Female,Yes,Yes,Mumbai
40,3.0,Female,No,No,Kolkata
28,59.0,Female,No,Yes,Chennai
90,26.0,Male,Yes,Yes,Chennai
67,,Male,No,No,Mumbai


In [44]:
X_transformed = ct.fit_transform(X_train)

In [45]:
X_transformed

array([[80.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 3.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [59.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [26.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [47.56060606,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [22.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [62.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [47.56060606,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [21.        ,  0.        

In [46]:
feature_names = ct.get_feature_names_out()
print(feature_names)

['filling_missing__age' 'categorical__gender_Female'
 'categorical__gender_Male' 'categorical__city_Bangalore'
 'categorical__city_Chennai' 'categorical__city_Delhi'
 'categorical__city_Kolkata' 'categorical__city_Mumbai']


In [47]:
X_transformed_df = pd.DataFrame(
    X_transformed,
    columns=feature_names
)

print(X_transformed_df.head())


   filling_missing__age  categorical__gender_Female  categorical__gender_Male  \
0             80.000000                         1.0                       0.0   
1              3.000000                         1.0                       0.0   
2             59.000000                         1.0                       0.0   
3             26.000000                         0.0                       1.0   
4             47.560606                         0.0                       1.0   

   categorical__city_Bangalore  categorical__city_Chennai  \
0                          0.0                        0.0   
1                          0.0                        0.0   
2                          0.0                        1.0   
3                          0.0                        1.0   
4                          0.0                        0.0   

   categorical__city_Delhi  categorical__city_Kolkata  \
0                      0.0                        0.0   
1                      0.0      

# üìå What Is a Pipeline?
#### A Pipeline in sklearn is a tool that allows you to chain multiple steps together so they run in order.

### If you need multiple steps on same column,
#### üëâ Use Pipeline inside ColumnTransformer.

### If you need different steps on different columns,
#### üëâ Use ColumnTransformer.

### Example:

```sh
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

ct = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, ['age']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender', 'city'])
    ],
    remainder='drop'
)
```
