## 1.Importing data

In [40]:
# Importing the necessary libraries
import pandas as pd
import numpy as np

In [41]:
# loading dataset into pandas dataframe
cars_df = pd.read_csv('data/Car details v3.csv')
cars_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


## 2.Removing duplicates rows 

In [42]:
cars_df.duplicated().sum()

1202

In [43]:
cars_df.drop_duplicates(inplace=True)
cars_df.duplicated().sum()

0

In [44]:
cars_df.seats.value_counts()

seats
5.0     5254
7.0      966
8.0      222
4.0      124
9.0       74
6.0       57
10.0      18
2.0        2
14.0       1
Name: count, dtype: int64

## 3.Data preprocessing 

In [45]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6926 entries, 0 to 8125
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   year           6926 non-null   int64  
 2   selling_price  6926 non-null   int64  
 3   km_driven      6926 non-null   int64  
 4   fuel           6926 non-null   object 
 5   seller_type    6926 non-null   object 
 6   transmission   6926 non-null   object 
 7   owner          6926 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6721 non-null   object 
 11  torque         6717 non-null   object 
 12  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 757.5+ KB


In [46]:
cars_df['brand_model'] = cars_df.name.apply(lambda x:' '.join(x.split(' ')[0:2]).lower())
cars_df['max_power'] = cars_df['max_power'].replace(' bhp', np.nan)
cars_df['max_power_bhp'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df['engine_cc'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df['mileage_kmpl'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df = cars_df.drop(columns=['max_power','engine', 'mileage', 'name'])
cars_df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,torque,seats,brand_model,max_power_bhp,engine_cc,mileage_kmpl
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,190Nm@ 2000rpm,5.0,maruti swift,74.0,74.0,74.0
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,250Nm@ 1500-2500rpm,5.0,skoda rapid,103.52,103.52,103.52
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,"12.7@ 2,700(kgm@ rpm)",5.0,honda city,78.0,78.0,78.0
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,22.4 kgm at 1750-2750rpm,5.0,hyundai i20,90.0,90.0,90.0
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,"11.5@ 4,500(kgm@ rpm)",5.0,maruti swift,88.2,88.2,88.2


In [47]:
# identifying most frequent catagories for onehot multiclass encoding
(cars_df.brand_model.value_counts() > 50).sum() 

39

In [48]:
# numerical columns
num_cols = ['km_driven','year','max_power_bhp','engine_cc','mileage_kmpl']

# Catagorical columns
ordinal_encoded = ['transmission','owner']
target_encoded = ['seats']
one_hot_binary = ['fuel','seller_type']
one_hot_multiclass = ['brand_model']

In [49]:
# droping rows with nan values
cars_df.dropna(inplace=True)

# seperating features and target as X and y
X = cars_df.drop(['selling_price'], axis=1)
y = cars_df['selling_price']
X.shape, y.shape

((6717, 12), (6717,))

## 4.Splitting data into train and test sets

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5373, 12), (1344, 12), (5373,), (1344,))

## 5.Creating preprocessor with pipelines and columntransformer to handle catagorical and numerical columns

In [53]:
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

target_encoded_pipeline = Pipeline([
    ('target_encoding', TargetEncoder(smooth=0.5,target_type="continuous")),
#    ('standard_scaler',StandardScaler())
])

num_pipeline = Pipeline([
    ('standard_scaler', MinMaxScaler())    
])

ordinal_encoded_pipeline = Pipeline([
    ('ordinal_encoding',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-2)),
#    ('standard_scaler',StandardScaler())
])

onehot_encoded_pipeline = Pipeline([
    ('one_hot_encoding', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

onehot_multiclass_pipeline = Pipeline([
    ('one_hot_multiclass', OneHotEncoder(max_categories=50,sparse_output=False,handle_unknown='ignore'))
])

preprocessor_ct = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('target_encoder', target_encoded_pipeline, target_encoded),
    ('ordinal_encoder', ordinal_encoded_pipeline, ordinal_encoded),
    ('one_hot_encoder', onehot_encoded_pipeline, one_hot_binary),
    ('one_hot_multiclass', onehot_multiclass_pipeline, one_hot_multiclass)
])

preprocessor_ct

## 6.Fit transform above preprocessor on train data and also transform test data

In [54]:
preprocessor_ct.fit_transform(X_train,y_train)

array([[0.03348228, 0.76923077, 0.13398693, ..., 0.        , 0.        ,
        0.        ],
       [0.0144101 , 0.88461538, 0.18028322, ..., 0.        , 0.        ,
        0.        ],
       [0.02670106, 0.73076923, 0.21955338, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.005742  , 0.96153846, 0.13344227, ..., 0.        , 0.        ,
        0.        ],
       [0.04195881, 0.76923077, 0.1791939 , ..., 0.        , 0.        ,
        0.        ],
       [0.06738839, 0.5       , 0.28676471, ..., 0.        , 0.        ,
        0.        ]])

In [55]:
X_train_processed_df = pd.DataFrame(preprocessor_ct.fit_transform(X_train,y_train), columns=preprocessor_ct.get_feature_names_out())
X_train_processed_df.head()

Unnamed: 0,num_pipeline__km_driven,num_pipeline__year,num_pipeline__max_power_bhp,num_pipeline__engine_cc,num_pipeline__mileage_kmpl,target_encoder__seats,ordinal_encoder__transmission,ordinal_encoder__owner,one_hot_encoder__fuel_CNG,one_hot_encoder__fuel_Diesel,...,one_hot_multiclass__brand_model_tata new,one_hot_multiclass__brand_model_tata nexon,one_hot_multiclass__brand_model_tata tiago,one_hot_multiclass__brand_model_tata zest,one_hot_multiclass__brand_model_toyota etios,one_hot_multiclass__brand_model_toyota fortuner,one_hot_multiclass__brand_model_toyota innova,one_hot_multiclass__brand_model_volkswagen polo,one_hot_multiclass__brand_model_volkswagen vento,one_hot_multiclass__brand_model_infrequent_sklearn
0,0.033482,0.769231,0.133987,0.133987,0.133987,485488.864955,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.01441,0.884615,0.180283,0.180283,0.180283,478776.995145,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.026701,0.730769,0.219553,0.219553,0.219553,485021.441286,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.010172,0.923077,0.112255,0.112255,0.112255,482806.669858,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.037721,0.730769,0.1122,0.1122,0.1122,494097.337935,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
features_dict = {}
for transformer in preprocessor_ct.transformers_[1:-1]:
    features_dict.update(zip(transformer[2], transformer[1][0].categories_))
    #print(transformer[1][0])
print(features_dict)

{'seats': array([ 4.,  5.,  6.,  7.,  8.,  9., 10.]), 'transmission': array(['Automatic', 'Manual'], dtype=object), 'owner': array(['First Owner', 'Fourth & Above Owner', 'Second Owner',
       'Test Drive Car', 'Third Owner'], dtype=object), 'fuel': array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object), 'seller_type': array(['Dealer', 'Individual', 'Trustmark Dealer'], dtype=object), 'brand_model': array(['ambassador classic', 'ambassador grand', 'ashok leyland',
       'audi a3', 'audi a4', 'audi a6', 'audi q3', 'audi q5', 'audi q7',
       'bmw 3', 'bmw 5', 'bmw 6', 'bmw 7', 'bmw x1', 'bmw x3', 'bmw x4',
       'bmw x5', 'bmw x6', 'bmw x7', 'chevrolet aveo', 'chevrolet beat',
       'chevrolet captiva', 'chevrolet cruze', 'chevrolet enjoy',
       'chevrolet optra', 'chevrolet sail', 'chevrolet spark',
       'chevrolet tavera', 'chevrolet trailblazer', 'daewoo matiz',
       'datsun go', 'datsun redigo', 'fiat avventura', 'fiat grande',
       'fiat linea', 'fiat punto', 'force o

In [57]:
X_test_processed_df = pd.DataFrame(preprocessor_ct.transform(X_test), columns=preprocessor_ct.get_feature_names_out())
X_test_processed_df.head()

Unnamed: 0,num_pipeline__km_driven,num_pipeline__year,num_pipeline__max_power_bhp,num_pipeline__engine_cc,num_pipeline__mileage_kmpl,target_encoder__seats,ordinal_encoder__transmission,ordinal_encoder__owner,one_hot_encoder__fuel_CNG,one_hot_encoder__fuel_Diesel,...,one_hot_multiclass__brand_model_tata new,one_hot_multiclass__brand_model_tata nexon,one_hot_multiclass__brand_model_tata tiago,one_hot_multiclass__brand_model_tata zest,one_hot_multiclass__brand_model_toyota etios,one_hot_multiclass__brand_model_toyota fortuner,one_hot_multiclass__brand_model_toyota innova,one_hot_multiclass__brand_model_volkswagen polo,one_hot_multiclass__brand_model_volkswagen vento,one_hot_multiclass__brand_model_infrequent_sklearn
0,0.065269,0.807692,0.356481,0.356481,0.356481,485230.322906,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.006095,0.884615,0.151688,0.151688,0.151688,485230.322906,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004662,0.692308,0.086057,0.086057,0.086057,485230.322906,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.025006,0.923077,0.385893,0.385893,0.385893,801762.562801,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.028872,0.884615,0.373638,0.373638,0.373638,485230.322906,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 7.Finding the best model for the dataset by training a list of models

In [58]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    r2Score_train = r2_score(y_train, y_pred_train) 
    r2Score_test = r2_score(y_test, y_pred_test)
    root_mean_squared_error_train = root_mean_squared_error(y_train, y_pred_train) 
    root_mean_squared_error_test = root_mean_squared_error(y_test, y_pred_test)
    mean_absolute_error_train = mean_absolute_error(y_train, y_pred_train)  
    mean_absolute_error_test =  mean_absolute_error(y_test, y_pred_test)
    mean_squared_error_train = mean_squared_error(y_train, y_pred_train)
    mean_squared_error_test = mean_squared_error(y_test, y_pred_test)

    print(f"r2score ----> train: {r2Score_train} test: {r2Score_test}")
    print(f"rmse: ----> train: {root_mean_squared_error_train} test: {root_mean_squared_error_test}")
    print(f"mae: ----> train: {mean_absolute_error_train} test: {mean_absolute_error_test}")
    print(f"mse: ----> train: {mean_squared_error_train} test: {mean_squared_error_test}")


models_dict = {"Linear Regression": LinearRegression(),
               "Lasso": Lasso(),
               "Ridge": Ridge(),
               "K-Neihbors Regressor": KNeighborsRegressor(),
               "Decision Tree": DecisionTreeRegressor(),
               "Random Forest Regressor": RandomForestRegressor(),
               "XGBRegressor": XGBRegressor(),
               "Catboosting Regressor": CatBoostRegressor(verbose=False),
               "AdaBoost Regressor": AdaBoostRegressor()
               }

trained_models = []
for model_name,model in models_dict.items():
    model.fit(X_train_processed_df, y_train)
    trained_models.append(model)
    print("------------------------"+model_name+"------------------------")
    evaluate_model(model, X_train_processed_df, y_train, X_test_processed_df, y_test)
    print()


------------------------Linear Regression------------------------
r2score ----> train: 0.6759639177904575 test: -26331228.485743463
rmse: ----> train: 305296.2758469457 test: 2403722547.9542823
mae: ----> train: 155488.61522181233 test: 131280410.52766791
mse: ----> train: 93205816046.01436 test: 5.777882087543826e+18



  model = cd_fast.enet_coordinate_descent(


------------------------Lasso------------------------
r2score ----> train: 0.6759639359418218 test: 0.7331825924674551
rmse: ----> train: 305296.26729613246 test: 241966.83753255475
mae: ----> train: 155483.70124931206 test: 147612.37729275879
mse: ----> train: 93205810824.95157 test: 58547950465.50575

------------------------Ridge------------------------
r2score ----> train: 0.6756461733751022 test: 0.7329075065248252
rmse: ----> train: 305445.9234062624 test: 242091.53803269155
mae: ----> train: 154603.91346696106 test: 146778.85939408012
mse: ----> train: 93297212125.50433 test: 58608312787.034134

------------------------K-Neihbors Regressor------------------------
r2score ----> train: 0.7277282548635939 test: 0.5204167949515548
rmse: ----> train: 279850.5132606644 test: 324399.9295094991
mae: ----> train: 112551.7221663875 test: 138238.91979166665
mse: ----> train: 78316309772.25731 test: 105235314265.76799

------------------------Decision Tree------------------------
r2score --

## 8. Hyperparameter tuning of the best model using RandomizedSearchCV

In [59]:
from sklearn.model_selection import RandomizedSearchCV

params = {'n_estimators':[5, 25, 50, 75, 100, 150, 200, 210, 220, 230, 240, 250, 300],
          'criterion':['absolute_error', 'friedman_mse', 'poisson','squared_error'],
          'max_depth':[None, 2, 4, 6, 8, 10],
          'min_samples_split':[2, 4, 8, 16],
          'min_samples_leaf':[1, 2, 4, 8, 16],
          'max_features':['sqrt', 'log2', None],
          'max_samples':[0.1, 0.2, 0.4, 0.8, 1.0],
          'warm_start':[True, False]}

rcv = RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=-1), param_distributions=params, n_iter=200, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)
rcv.fit(X_train_processed_df, y_train)

## 9.Evaluating the finetuned model

In [60]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

rmse_train = root_mean_squared_error(y_train, rcv.predict(X_train_processed_df))
r2Score_train = r2_score(y_train, rcv.predict(X_train_processed_df))
rmse_train, r2Score_train

(130203.88969746318, 0.9410616088805449)

In [61]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

predicted_price = rcv.predict(X_test_processed_df)
rmse_test = root_mean_squared_error(y_test, predicted_price)
r2Score_test = r2_score(y_test, predicted_price)
rmse_test, r2Score_test

(135449.56691017968, 0.9163900998266229)

In [62]:
predicted_price_df = pd.DataFrame(predicted_price,index=y_test.index, columns=['predicted selling price'])
final_df = pd.concat([y_test.to_frame(), predicted_price_df],axis=1).sort_index()
final_df

Unnamed: 0,selling_price,predicted selling price
8,350000,287680.675370
15,400000,421390.812061
16,778000,725715.364767
18,150000,95141.451556
20,174000,208765.476787
...,...,...
8094,200000,337109.420524
8107,325000,406177.530304
8110,425000,379760.939075
8113,425000,617992.541231


In [63]:
rcv.best_params_

{'warm_start': True,
 'n_estimators': 150,
 'min_samples_split': 8,
 'min_samples_leaf': 2,
 'max_samples': 0.8,
 'max_features': None,
 'max_depth': 10,
 'criterion': 'friedman_mse'}

In [64]:
rcv_results_df = pd.DataFrame(rcv.cv_results_)
rcv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_warm_start,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_samples,param_max_features,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1.682111,0.018006,0.029366,0.006165,False,75,16,16,0.8,,...,-251434.285094,22424.821061,52,-243025.242079,-236974.552809,-247380.481556,-234635.259297,-244168.944820,-241236.896112,4717.794233
1,0.174280,0.033925,0.041152,0.010945,False,50,16,16,0.4,log2,...,-361260.691989,31612.680368,154,-364249.521172,-338331.704720,-374821.180828,-343491.984511,-365446.774719,-357268.233190,13943.529197
2,0.203874,0.010284,0.049824,0.005860,False,220,2,8,0.1,sqrt,...,-348185.535535,28516.532785,145,-353572.060633,-346906.815367,-349471.426051,-336327.255226,-345408.373490,-346337.186153,5719.313876
3,0.023771,0.002250,0.017489,0.001155,False,5,4,4,0.8,log2,...,-340782.806238,42489.570767,139,-304066.406808,-337264.996589,-330905.206225,-323350.399990,-351571.727424,-329431.747407,15710.460712
4,1.361907,0.010703,0.058470,0.004137,False,300,16,8,1.0,log2,...,-394438.570058,30567.380727,178,-400305.655591,-388683.656037,-395023.893633,-385629.421589,-387154.938707,-391359.513112,5497.886062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.316704,0.020994,0.089059,0.011881,True,200,2,8,0.4,sqrt,...,-280508.274084,29381.200020,74,-274454.816175,-266491.901556,-271885.926319,-261912.208276,-275594.705712,-270067.911608,5145.850891
196,0.152949,0.010313,0.059006,0.009964,False,50,16,8,0.8,,...,-208117.466338,25650.592870,14,-183693.332703,-178947.261286,-191638.862768,-177577.812442,-187657.053612,-183902.864562,5263.986995
197,0.379198,0.016970,0.087867,0.016959,False,230,2,4,0.1,,...,-321262.077106,18731.216995,112,-315790.911592,-316646.191939,-316058.127796,-303925.602320,-319018.932079,-314287.953145,5304.981977
198,0.136473,0.021582,0.026206,0.003010,False,5,8,2,0.4,log2,...,-305416.908382,35726.366357,104,-321768.886989,-284607.768467,-308484.715285,-281572.964097,-273551.302169,-293997.127402,18120.223496
