In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

# Forming Dataset
np.random.seed(42)
brand = np.random.choice(["Maruti", "Scoda", "BMW", "Range Rover"], 1000)
km_driven = np.random.uniform(10000, 50000, size=1000)
fuel = np.random.choice(["Petrol", "Diesel", "CNG", "LPG"], 1000)
owner = np.random.choice(["First", "Second", "Third", "Fourth"], 1000)
selling_price = np.random.uniform(100000, 500000, size=1000)

df = pd.DataFrame({
    "brand": brand,
    "km_driven": km_driven,
    "fuel": fuel,
    "owner": owner,
    "selling_price": selling_price
})
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,BMW,37926.468561,LPG,Fourth,431007.568843
1,Range Rover,31443.854654,Diesel,Third,405811.117862
2,Maruti,22381.104651,Diesel,Third,329411.580582
3,BMW,42551.800788,LPG,Third,482418.857445
4,BMW,37389.246902,Petrol,Fourth,180189.806211


In [2]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['selling_price']), df['selling_price'], test_size=0.2, random_state=42)

---

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

transformer = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(categories=[["First", "Second", "Third", "Fourth"]]), ['owner']),
        ("onehot", OneHotEncoder(categories=[["Maruti", "Scoda", "BMW", "Range Rover"], ["Petrol", "Diesel", "CNG", "LPG"]], sparse_output=False), ['brand', 'fuel']) # OneHotEncoder returns a sparse matrix by default, we set it to False to get a dense matrix
    ],
    remainder='passthrough', # remainder='passthrough' to keep the columns that are not transformed
    verbose=True, # verbose=True to print the progress
    verbose_feature_names_out=True, # verbose_feature_names_out=True to get the feature names in the output
)

# setting to get a pandas df
transformer.set_output(transform='pandas')

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,True
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['First', 'Second', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['Maruti', 'Scoda', ...], ['Petrol', 'Diesel', ...]]"
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [4]:
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

[ColumnTransformer] ....... (1 of 3) Processing ordinal, total=   0.0s
[ColumnTransformer] ........ (2 of 3) Processing onehot, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s


In [7]:
X_train_transformed.head()

Unnamed: 0,ordinal__owner,onehot__brand_Maruti,onehot__brand_Scoda,onehot__brand_BMW,onehot__brand_Range Rover,onehot__fuel_Petrol,onehot__fuel_Diesel,onehot__fuel_CNG,onehot__fuel_LPG,remainder__km_driven
29,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,44588.89505
535,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,37851.551035
695,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,44861.472246
557,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,26077.236544
836,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,17981.698037


In [8]:
transformer.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [9]:
transformer.get_feature_names_out()

array(['ordinal__owner', 'onehot__brand_Maruti', 'onehot__brand_Scoda',
       'onehot__brand_BMW', 'onehot__brand_Range Rover',
       'onehot__fuel_Petrol', 'onehot__fuel_Diesel', 'onehot__fuel_CNG',
       'onehot__fuel_LPG', 'remainder__km_driven'], dtype=object)

In [10]:
transformer.n_features_in_

4

In [11]:
transformer.transformers_

[('ordinal',
  OrdinalEncoder(categories=[['First', 'Second', 'Third', 'Fourth']]),
  ['owner']),
 ('onehot',
  OneHotEncoder(categories=[['Maruti', 'Scoda', 'BMW', 'Range Rover'],
                            ['Petrol', 'Diesel', 'CNG', 'LPG']],
                sparse_output=False),
  ['brand', 'fuel']),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  ['km_driven'])]

In [12]:
transformer.output_indices_

{'ordinal': slice(0, 1, None),
 'onehot': slice(1, 9, None),
 'remainder': slice(9, 10, None)}

In [19]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Sample data
df = pd.DataFrame({
    'age': [25, 30, 35, np.nan, 40],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'city': ['NYC', 'LA', 'NYC', 'Chicago', 'LA']
})

# Apply multiple transformations to the same column
preprocessor = make_column_transformer(
    (StandardScaler(), ['age']),        # First transformation
    (MinMaxScaler(), ['age']),          # Second transformation (same column)
    (SimpleImputer(strategy='median'), ['age']),  # Third transformation
    remainder='passthrough'
)
preprocessor.set_output(transform='pandas')

# This will create multiple columns for 'age'
transformed = preprocessor.fit_transform(df)
print(transformed.shape)  # (5, 5) - original had 3 columns
transformed.head()

(5, 5)


Unnamed: 0,standardscaler__age,minmaxscaler__age,simpleimputer__age,remainder__salary,remainder__city
0,-1.341641,0.0,25.0,50000,NYC
1,-0.447214,0.333333,30.0,60000,LA
2,0.447214,0.666667,35.0,70000,NYC
3,,,32.5,80000,Chicago
4,1.341641,1.0,40.0,90000,LA


---

In [None]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
df.shape

(8128, 5)

In [None]:
import numpy as np

np.random.seed(42)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan



In [None]:
df.isnull().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                                      df.drop(columns=['selling_price']),
                                                      df['selling_price'],
                                                      test_size=0.2,
                                                      random_state=42
                                                    )

In [None]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner
6144,Honda,80000.0,Petrol,Second Owner
6381,Hyundai,150000.0,Diesel,Fourth & Above Owner
438,Maruti,120000.0,Diesel,Second Owner
5939,Maruti,25000.0,Petrol,First Owner


In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6502 entries, 6518 to 7270
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   brand      6502 non-null   object 
 1   km_driven  6502 non-null   float64
 2   fuel       6502 non-null   object 
 3   owner      6442 non-null   object 
dtypes: float64(1), object(3)
memory usage: 254.0+ KB


In [None]:
# Plan of Attack

# Missing value imputation
# Encoding Categorical Variables
# Scaling
# Feature Selection
# Model building
# Prediction

In [None]:
df['owner'].value_counts()

First Owner             5235
Second Owner            2085
Third Owner              549
Fourth & Above Owner     173
Test Drive Car             5
Name: owner, dtype: int64

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_km_driven',SimpleImputer(),[1]),
    ('impute_owner',SimpleImputer(strategy='most_frequent'),[3])
],remainder='passthrough')

In [None]:
# encoding categorical variables
trf2 = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,2])
    ],
    remainder='passthrough'
)

In [None]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,38))
])

In [None]:
a = [1,2,3,4,5]
x = slice(0,5)
a[x]

[1, 2, 3, 4, 5]

In [None]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [None]:
# train the model
trf5 = RandomForestRegressor()

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer',trf1),
    ('encoder',trf2),
    ('scaler',trf3),
    ('fselector',trf4),
    ('model',trf5)
])


In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [None]:
pipe.named_steps

{'imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_km_driven', SimpleImputer(), [1]),
                                 ('impute_owner',
                                  SimpleImputer(strategy='most_frequent'),
                                  [3])]),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal', OrdinalEncoder(), [3]),
                                 ('onehot', OneHotEncoder(sparse_output=False),
                                  [0, 2])]),
 'scaler': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 37, None))]),
 'fselector': SelectKBest(score_func=<function chi2 at 0x7fe5a1b6ea70>),
 'model': RandomForestRegressor()}

In [None]:
pipe.named_steps['scaler'].transformers_[0][1].data_max_

array([3., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [None]:
pipe.predict(X_test)[10:40]

array([771854.48794078, 771854.48794078, 441301.94073954, 771854.48794078,
       771854.48794078, 771854.48794078, 441301.94073954, 441301.94073954,
       771854.48794078, 441301.94073954, 771854.48794078, 441301.94073954,
       771854.48794078, 937130.76154139, 441301.94073954, 771854.48794078,
       771854.48794078, 771854.48794078, 771854.48794078, 441301.94073954,
       771854.48794078, 606578.21434016, 441301.94073954, 771854.48794078,
       771854.48794078, 771854.48794078, 441301.94073954, 771854.48794078,
       771854.48794078, 441301.94073954])

In [None]:
# Predict
pipe.predict(np.array(['Maruti',100000.0,'Diesel','First Owner']).reshape(1,4))



array([630635.02066264])

### Cross Validation

In [None]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-639139970114.0674

### Hyperparameter Tuning

In [None]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

In [None]:
grid.best_score_

-639082881304.7794

In [None]:
grid.best_params_

{'model__max_depth': None}

### Export the Pipeline

In [None]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))