# Lecture 35: Column Transformer

Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split # train test split

from sklearn.impute import SimpleImputer # Imputation

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler # Encoding & scaling


Importing Dataframe

In [2]:
df=pd.read_csv('car_price_prediction.csv',usecols=['Price','Category','Leather interior','Airbags'])

df.loc[20:2000, 'Airbags'] = np.nan

df.head()


Unnamed: 0,Price,Category,Leather interior,Airbags
0,13328,Jeep,Yes,12.0
1,16621,Jeep,No,8.0
2,8467,Hatchback,No,2.0
3,3607,Jeep,Yes,0.0
4,11726,Hatchback,Yes,4.0


In [3]:
print(df.shape)

(19237, 4)


## Train Test Split

In [4]:
df.isnull().sum()

Price                  0
Category               0
Leather interior       0
Airbags             1981
dtype: int64

In [5]:
X=df.drop(columns=['Price'])
y=df.Price

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

### Preprocessing (Without Column Transformer)

Imputing

In [6]:
SI_mean=SimpleImputer(strategy='mean')

X_train_Airbags=SI_mean.fit_transform(X_train[['Airbags']])
X_test_Airbags=SI_mean.transform(X_test[['Airbags']])

In [7]:
X_train_Airbags.shape

(15389, 1)

Ordinal Encoding

In [8]:
OE=OrdinalEncoder(dtype='int',handle_unknown='error',categories=[['No','Yes']])

X_train_Leather_interior=OE.fit_transform(X_train[['Leather interior']])
X_test_Leather_interior=OE.transform(X_test[['Leather interior']])
X_train_Leather_interior.shape

(15389, 1)

One Hot Encoding

In [9]:
OHE=OneHotEncoder(sparse_output=False,dtype='int',drop='first')

X_train_Category=OHE.fit_transform(X_train[['Category']])

X_test_Category=OHE.transform(X_test[['Category']])

Normalization (Feature Scaling)

In [10]:
Min_max=MinMaxScaler()

X_train_Airbags_scaled=Min_max.fit_transform(X_train[['Airbags']])
X_test_Airbags_scaled=Min_max.transform(X_test[['Airbags']])

In [11]:
X_train_transformed=np.concatenate((X_train_Category,X_train_Airbags,X_train_Airbags_scaled,X_train_Leather_interior),axis=1)

X_test_transformed=np.concatenate((X_test_Category,X_test_Airbags,X_test_Airbags_scaled,X_test_Leather_interior),axis=1)

In [12]:
X_train_transformed.shape

(15389, 13)

array([[ 0.        ,  0.        ,  0.        , ..., 12.        ,
         0.75      ,  1.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        ,  0.        ,  0.        , ...,  4.        ,
         0.25      ,  1.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  6.55277074,
                nan,  1.        ],
       [ 0.        ,  0.        ,  0.        , ...,  4.        ,
         0.25      ,  1.        ],
       [ 0.        ,  0.        ,  0.        , ..., 12.        ,
         0.75      ,  1.        ]])

### Preprocessing (With Column Transformer)

In [13]:
from sklearn.compose import ColumnTransformer

In [14]:
col_transform=ColumnTransformer(
    
    transformers=[
        ('SI',SimpleImputer(strategy='mean'),['Airbags']),
        ('OHE',OneHotEncoder(sparse_output=False,dtype='int',drop='first'),['Category']),
        ('OE',OrdinalEncoder(categories=[['No','Yes']],dtype='int',handle_unknown='error'),['Leather interior']),
        ('Normalize',MinMaxScaler(),['Airbags'])
    ],remainder='passthrough'
)

In [15]:
col_transform.fit_transform(X_train)
col_transform.transform(X_test)

array([[12.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.75      ],
       [12.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.75      ],
       [ 6.55277074,  0.        ,  0.        , ...,  0.        ,
         0.        ,         nan],
       ...,
       [ 6.55277074,  0.        ,  0.        , ...,  0.        ,
         0.        ,         nan],
       [12.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.75      ],
       [ 4.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.25      ]])

In [16]:
col_transform.fit_transform(X_train).shape

(15389, 13)