In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [3]:
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)

### The Hard Way!

In [5]:
# apply ordinal encoder to owner
oe = OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner', 'Third Owner', 'Second Owner', 'First Owner']])

X_train_owner = oe.fit_transform(X_train.loc[:,['owner']])
X_test_owner = oe.transform(X_test.loc[:,['owner']])

In [6]:
# convert to df
X_train_owner_df = pd.DataFrame(X_train_owner,columns=oe.get_feature_names_out())
X_test_owner_df = pd.DataFrame(X_test_owner,columns=oe.get_feature_names_out())

In [7]:
X_train_owner_df.head()

Unnamed: 0,owner
0,4.0
1,3.0
2,1.0
3,3.0
4,4.0


In [8]:
# apply ohe to brand and fuel
ohe = OneHotEncoder(sparse_output=False)

X_train_brand_fuel = ohe.fit_transform(X_train[['brand','fuel']])
X_test_brand_fuel = ohe.transform(X_test[['brand','fuel']])

In [9]:
# converting to dataframe
X_train_brand_fuel_df = pd.DataFrame(X_train_brand_fuel, columns=ohe.get_feature_names_out())
X_test_brand_fuel_df = pd.DataFrame(X_test_brand_fuel, columns=ohe.get_feature_names_out())

In [10]:
X_train_brand_fuel_df.head()

Unnamed: 0,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,brand_Ford,...,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Combining the Encoded Columns
X_train_rem = X_train.drop(columns=['brand','fuel','owner'],inplace=True)
X_test_rem = X_test.drop(columns=['brand','fuel','owner'],inplace=True)

X_train = pd.concat([X_train_rem, X_train_owner_df, X_train_brand_fuel_df],axis=1)
X_test = pd.concat([X_test_rem, X_test_owner_df, X_test_brand_fuel_df],axis=1)

In [12]:
X_train.head()

Unnamed: 0,owner,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,...,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### The Easy Way!

In [13]:
from sklearn.compose import ColumnTransformer

In [14]:
df = pd.read_csv('cars.csv')
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [15]:
transformer = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner', 'Third Owner', 'Second Owner', 'First Owner']]), ['owner']),
        ("onehot", OneHotEncoder(sparse_output=False), ['brand', 'fuel'])
        # (name, object, list of columns)
    ],
    remainder='passthrough', # or 'drop'
    sparse_threshold = 0.2, 
    n_jobs = -1
    # transformer_weights - Allows you to assign weights to different transformers = {'ordinal': 2, 'onehot': 1} -> What is the use of this?
)

# setting to get a pandas df directly
transformer.set_output(transform='pandas')

In [16]:
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

X_train_transformed

Unnamed: 0,ordinal__owner,onehot__brand_Ambassador,onehot__brand_Ashok,onehot__brand_Audi,onehot__brand_BMW,onehot__brand_Chevrolet,onehot__brand_Daewoo,onehot__brand_Datsun,onehot__brand_Fiat,onehot__brand_Force,...,onehot__brand_Skoda,onehot__brand_Tata,onehot__brand_Toyota,onehot__brand_Volkswagen,onehot__brand_Volvo,onehot__fuel_CNG,onehot__fuel_Diesel,onehot__fuel_LPG,onehot__fuel_Petrol,remainder__km_driven
6518,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2560
6144,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,80000
6381,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150000
438,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120000
5939,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,25000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120000
5390,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,80000
860,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35000
7603,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,27000


In [17]:
transformer.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [18]:
transformer.get_feature_names_out()

array(['ordinal__owner', 'onehot__brand_Ambassador',
       'onehot__brand_Ashok', 'onehot__brand_Audi', 'onehot__brand_BMW',
       'onehot__brand_Chevrolet', 'onehot__brand_Daewoo',
       'onehot__brand_Datsun', 'onehot__brand_Fiat',
       'onehot__brand_Force', 'onehot__brand_Ford', 'onehot__brand_Honda',
       'onehot__brand_Hyundai', 'onehot__brand_Isuzu',
       'onehot__brand_Jaguar', 'onehot__brand_Jeep', 'onehot__brand_Kia',
       'onehot__brand_Land', 'onehot__brand_Lexus', 'onehot__brand_MG',
       'onehot__brand_Mahindra', 'onehot__brand_Maruti',
       'onehot__brand_Mercedes-Benz', 'onehot__brand_Mitsubishi',
       'onehot__brand_Nissan', 'onehot__brand_Opel',
       'onehot__brand_Peugeot', 'onehot__brand_Renault',
       'onehot__brand_Skoda', 'onehot__brand_Tata',
       'onehot__brand_Toyota', 'onehot__brand_Volkswagen',
       'onehot__brand_Volvo', 'onehot__fuel_CNG', 'onehot__fuel_Diesel',
       'onehot__fuel_LPG', 'onehot__fuel_Petrol', 'remainder__km_drive

In [19]:
transformer.n_features_in_

4

In [20]:
transformer.transformers_

[('ordinal',
  OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner',
                              'Third Owner', 'Second Owner', 'First Owner']]),
  ['owner']),
 ('onehot', OneHotEncoder(sparse_output=False), ['brand', 'fuel']),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [1])]

In [21]:
transformer.named_transformers_

{'ordinal': OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner',
                             'Third Owner', 'Second Owner', 'First Owner']]),
 'onehot': OneHotEncoder(sparse_output=False),
 'remainder': FunctionTransformer(accept_sparse=True, check_inverse=False,
                     feature_names_out='one-to-one')}

In [23]:
transformer.named_transformers_['ordinal']

In [25]:
transformer.named_transformers_['ordinal'].categories_ # You can get the information about individual encoder/transformer

[array(['Test Drive Car', 'Fourth & Above Owner', 'Third Owner',
        'Second Owner', 'First Owner'], dtype=object)]

In [22]:
transformer.output_indices_

{'ordinal': slice(0, 1, None),
 'onehot': slice(1, 37, None),
 'remainder': slice(37, 38, None)}

In [27]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
ct = ColumnTransformer(
    [
        ("ordinal_encoder", OrdinalEncoder(), ['owner']),
        ("ohe", OneHotEncoder(sparse_output = False), ['owner'])
    ]
)
ct.set_output(transform = 'pandas')
ct.fit_transform(df.loc[:, ['owner']])

# So basically, if you are using a same column in two different transformation combined in the single column transformer then you dont need to pass the name of that column after the first transformation you just need to pass the original column name.

Unnamed: 0,ordinal_encoder__owner,ohe__owner_First Owner,ohe__owner_Fourth & Above Owner,ohe__owner_Second Owner,ohe__owner_Test Drive Car,ohe__owner_Third Owner
0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,0.0,0.0,1.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
8123,0.0,1.0,0.0,0.0,0.0,0.0
8124,1.0,0.0,1.0,0.0,0.0,0.0
8125,0.0,1.0,0.0,0.0,0.0,0.0
8126,0.0,1.0,0.0,0.0,0.0,0.0


---

### Sklearn Pipeline
<i>A combination of different ColumnTransformer</i>

In [39]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# Introduce missing values in 'km_driven' column (1% missing values)
np.random.seed(42)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan

In [5]:
df.isnull().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [6]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)

In [7]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner
6144,Honda,80000.0,Petrol,Second Owner
6381,Hyundai,150000.0,Diesel,Fourth & Above Owner
438,Maruti,120000.0,Diesel,Second Owner
5939,Maruti,25000.0,Petrol,First Owner


In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6502 entries, 6518 to 7270
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   brand      6502 non-null   object 
 1   km_driven  6502 non-null   float64
 2   fuel       6502 non-null   object 
 3   owner      6442 non-null   object 
dtypes: float64(1), object(3)
memory usage: 254.0+ KB


### Plan of Attack

- Missing value imputation
- Encoding Categorical Variables
- Scaling
- Feature Selection
- Model building
- Prediction

*One column transformer for each step*

In [40]:
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [41]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

In [42]:
# Missing Value Imputation ColumnTransformer
trf1 = ColumnTransformer([
    ('impute_km_driven',SimpleImputer(),[1]), # Providing index instead of column name is recommended because name might be removed after transformation applied.
    ('impute_owner',SimpleImputer(strategy='most_frequent'),[3]) # Filling with mode value
],remainder='passthrough')
# Filling missing values with mean value of that column itself

In [43]:
# Encoding Categorical Variables
trf2 = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,2])
    ],
    remainder='passthrough'
)

In [44]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,38))
]) # Python slice function

In [45]:
a = [1,2,3,4,5]
x = slice(0,5)
a[x]

[1, 2, 3, 4, 5]

In [46]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [47]:
# train the model - You cannot create columntransformer of a model
trf5 = RandomForestRegressor()

In [48]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer',trf1), # Each tuple is one transformer
    ('encoder',trf2),
    ('scaler',trf3),
    ('fselector',trf4),
    ('model',trf5)
])

In [49]:
pipe.fit(X_train, y_train) # Since you are integrated the model in the pipeline thats why we have to pass y_train as well
# But y_train is not following any other transformation.

In [50]:
pipe.feature_names_in_ # Input Features

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [51]:
pipe.named_steps # Steps in the form of dictonary

{'imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_km_driven', SimpleImputer(), [1]),
                                 ('impute_owner',
                                  SimpleImputer(strategy='most_frequent'),
                                  [3])]),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  [3]),
                                 ('onehot',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 2])]),
 'scaler': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 38, None))]),
 'model': RandomForestRegressor()}

In [52]:
pipe.named_steps['imputer'].transformers_[0][1].statistics_ # Value filled with

array([70050.98323593])

In [53]:
pipe.named_steps['scaler'].transformers_[0][1].data_max_

array([3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [54]:
pipe.predict(X_test)
# There might be chance of error due to 'Rare Category'

array([443785.1863436, 779542.0442775, 779542.0442775, ...,
       443785.1863436, 443785.1863436, 443785.1863436], shape=(1626,))

In [55]:
pipe.predict(X_test)[10:40]

array([779542.0442775 , 779542.0442775 , 443785.1863436 , 779542.0442775 ,
       779542.0442775 , 779542.0442775 , 443785.1863436 , 443785.1863436 ,
       779542.0442775 , 443785.1863436 , 779542.0442775 , 443785.1863436 ,
       779542.0442775 , 310391.76026624, 443785.1863436 , 779542.0442775 ,
       779542.0442775 , 779542.0442775 , 779542.0442775 , 443785.1863436 ,
       779542.0442775 , 187364.24005733, 443785.1863436 , 779542.0442775 ,
       779542.0442775 , 779542.0442775 , 443785.1863436 , 779542.0442775 ,
       779542.0442775 , 443785.1863436 ])

In [38]:
# Predict
pipe.predict(pd.DataFrame(np.array(['Maruti',100000.0,'Diesel','First Owner']).reshape(1,4), columns = df.drop(columns='selling_price').columns))

array([631107.85033222])

### Cross Validation

In [56]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

np.float64(-610063479633.5787)

### Hyperparameter Tuning

In [None]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
    # The format is name__parameter-name
}

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

In [None]:
grid.best_score_

-639082881304.7794

In [None]:
grid.best_params_

{'model__max_depth': None}

### Export the Pipeline

In [None]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))
# By loading this pipeline you can use it for prediction since the pipeline's model is already trained