## 1.Importing data

In [2]:
# Importing the necessary libraries
import pandas as pd
import numpy as np

In [3]:
# loading dataset into pandas dataframe
cars_df = pd.read_csv('data/Car details v3.csv')
cars_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


## 2.Removing duplicates rows 

In [4]:
cars_df.duplicated().sum()

1202

In [66]:
cars_df.drop_duplicates(inplace=True)
cars_df.duplicated().sum()

0

## 3.Data preprocessing 

In [6]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6926 entries, 0 to 8125
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   year           6926 non-null   int64  
 2   selling_price  6926 non-null   int64  
 3   km_driven      6926 non-null   int64  
 4   fuel           6926 non-null   object 
 5   seller_type    6926 non-null   object 
 6   transmission   6926 non-null   object 
 7   owner          6926 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6721 non-null   object 
 11  torque         6717 non-null   object 
 12  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 757.5+ KB


In [7]:
cars_df['brand_model'] = cars_df.name.apply(lambda x:' '.join(x.split(' ')[0:2]).lower())
cars_df['max_power'] = cars_df['max_power'].replace(' bhp', np.nan)
cars_df['max_power_bhp'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df['engine_cc'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df['mileage_kmpl'] = cars_df.max_power.apply(lambda x:float(x.split()[0]) if x is not np.nan else x)
cars_df = cars_df.drop(columns=['max_power','engine', 'mileage', 'name'])
cars_df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,torque,seats,brand_model,max_power_bhp,engine_cc,mileage_kmpl
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,190Nm@ 2000rpm,5.0,maruti swift,74.0,74.0,74.0
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,250Nm@ 1500-2500rpm,5.0,skoda rapid,103.52,103.52,103.52
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,"12.7@ 2,700(kgm@ rpm)",5.0,honda city,78.0,78.0,78.0
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,22.4 kgm at 1750-2750rpm,5.0,hyundai i20,90.0,90.0,90.0
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,"11.5@ 4,500(kgm@ rpm)",5.0,maruti swift,88.2,88.2,88.2


In [8]:
# identifying most frequent catagories for onehot multiclass encoding
(cars_df.brand_model.value_counts() > 50).sum() 

39

In [30]:
# numerical columns
num_cols = ['km_driven','year','max_power_bhp','engine_cc','mileage_kmpl']

# Catagorical columns
ordinal_encoded = ['transmission','owner']
target_encoded = ['seats']
one_hot_binary = ['fuel','seller_type']
one_hot_multiclass = ['brand_model']

In [31]:
# droping rows with nan values
cars_df.dropna(inplace=True)

# seperating features and target as X and y
X = cars_df.drop(['selling_price'], axis=1)
y = cars_df.selling_price
X.shape, y.shape

## 4.Splitting data into train and test sets

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5373, 12), (1344, 12), (5373,), (1344,))

## 5.Creating preprocessor with pipelines and columntransformer to handle catagorical and numerical columns

In [34]:
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

target_encoded_pipeline = Pipeline([
    ('target_encoding', TargetEncoder(smooth=0.5,target_type="continuous")),
#    ('standard_scaler',StandardScaler())
])

num_pipeline = Pipeline([
    ('standard_scaler', MinMaxScaler())    
])

ordinal_encoded_pipeline = Pipeline([
    ('ordinal_encoding',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-2)),
#    ('standard_scaler',StandardScaler())
])

onehot_encoded_pipeline = Pipeline([
    ('one_hot_encoding', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

onehot_multiclass_pipeline = Pipeline([
    ('one_hot_multiclass', OneHotEncoder(max_categories=50,sparse_output=False,handle_unknown='ignore'))
])

preprocessor_ct = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('target_encoder', target_encoded_pipeline, target_encoded),
    ('ordinal_encoder', ordinal_encoded_pipeline, ordinal_encoded),
    ('one_hot_encoder', onehot_encoded_pipeline, one_hot_binary),
    ('one_hot_multiclass', onehot_multiclass_pipeline, one_hot_multiclass)
])

preprocessor_ct

## 6.Fit transform above preprocessor on train data and also transform test data

In [35]:
preprocessor_ct.fit_transform(X_train,y_train)

array([[0.03348228, 0.76923077, 0.13398693, ..., 0.        , 0.        ,
        0.        ],
       [0.0144101 , 0.88461538, 0.18028322, ..., 0.        , 0.        ,
        0.        ],
       [0.02670106, 0.73076923, 0.21955338, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.005742  , 0.96153846, 0.13344227, ..., 0.        , 0.        ,
        0.        ],
       [0.04195881, 0.76923077, 0.1791939 , ..., 0.        , 0.        ,
        0.        ],
       [0.06738839, 0.5       , 0.28676471, ..., 0.        , 0.        ,
        0.        ]])

In [67]:
X_train_processed_df = pd.DataFrame(preprocessor_ct.fit_transform(X_train,y_train), columns=preprocessor_ct.get_feature_names_out())
X_train_processed_df.head()

Unnamed: 0,num_pipeline__km_driven,num_pipeline__year,num_pipeline__max_power_bhp,num_pipeline__engine_cc,num_pipeline__mileage_kmpl,target_encoder__seats,ordinal_encoder__transmission,ordinal_encoder__owner,one_hot_encoder__fuel_CNG,one_hot_encoder__fuel_Diesel,...,one_hot_multiclass__brand_model_tata new,one_hot_multiclass__brand_model_tata nexon,one_hot_multiclass__brand_model_tata tiago,one_hot_multiclass__brand_model_tata zest,one_hot_multiclass__brand_model_toyota etios,one_hot_multiclass__brand_model_toyota fortuner,one_hot_multiclass__brand_model_toyota innova,one_hot_multiclass__brand_model_volkswagen polo,one_hot_multiclass__brand_model_volkswagen vento,one_hot_multiclass__brand_model_infrequent_sklearn
0,0.033482,0.769231,0.133987,0.133987,0.133987,486056.490818,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.01441,0.884615,0.180283,0.180283,0.180283,488992.095386,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.026701,0.730769,0.219553,0.219553,0.219553,482186.104205,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.010172,0.923077,0.112255,0.112255,0.112255,486361.845792,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.037721,0.730769,0.1122,0.1122,0.1122,486361.845792,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X_test_processed_df = pd.DataFrame(preprocessor_ct.transform(X_test), columns=preprocessor_ct.get_feature_names_out())
X_test_processed_df.head()

Unnamed: 0,num_pipeline__km_driven,num_pipeline__year,num_pipeline__max_power_bhp,num_pipeline__engine_cc,num_pipeline__mileage_kmpl,target_encoder__seats,ordinal_encoder__transmission,ordinal_encoder__owner,one_hot_encoder__fuel_CNG,one_hot_encoder__fuel_Diesel,...,one_hot_multiclass__brand_model_tata new,one_hot_multiclass__brand_model_tata nexon,one_hot_multiclass__brand_model_tata tiago,one_hot_multiclass__brand_model_tata zest,one_hot_multiclass__brand_model_toyota etios,one_hot_multiclass__brand_model_toyota fortuner,one_hot_multiclass__brand_model_toyota innova,one_hot_multiclass__brand_model_volkswagen polo,one_hot_multiclass__brand_model_volkswagen vento,one_hot_multiclass__brand_model_infrequent_sklearn
0,0.065269,0.807692,0.356481,0.356481,0.356481,485230.322906,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.006095,0.884615,0.151688,0.151688,0.151688,485230.322906,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004662,0.692308,0.086057,0.086057,0.086057,485230.322906,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.025006,0.923077,0.385893,0.385893,0.385893,801762.562801,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.028872,0.884615,0.373638,0.373638,0.373638,485230.322906,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 7.Finding the best model for the dataset by training a list of models

In [46]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    r2Score_train = r2_score(y_train, y_pred_train) 
    r2Score_test = r2_score(y_test, y_pred_test)
    root_mean_squared_error_train = root_mean_squared_error(y_train, y_pred_train) 
    root_mean_squared_error_test = root_mean_squared_error(y_test, y_pred_test)
    mean_absolute_error_train = mean_absolute_error(y_train, y_pred_train)  
    mean_absolute_error_test =  mean_absolute_error(y_test, y_pred_test)
    mean_squared_error_train = mean_squared_error(y_train, y_pred_train)
    mean_squared_error_test = mean_squared_error(y_test, y_pred_test)

    print(f"r2score ----> train: {r2Score_train} test: {r2Score_test}")
    print(f"rmse: ----> train: {root_mean_squared_error_train} test: {root_mean_squared_error_test}")
    print(f"mae: ----> train: {mean_absolute_error_train} test: {mean_absolute_error_test}")
    print(f"mse: ----> train: {mean_squared_error_train} test: {mean_squared_error_test}")


models_dict = {"Linear Regression": LinearRegression(),
               "Lasso": Lasso(),
               "Ridge": Ridge(),
               "K-Neihbors Regressor": KNeighborsRegressor(),
               "Decision Tree": DecisionTreeRegressor(),
               "Random Forest Regressor": RandomForestRegressor(),
               "XGBRegressor": XGBRegressor(),
               "Catboosting Regressor": CatBoostRegressor(verbose=False),
               "AdaBoost Regressor": AdaBoostRegressor()
               }

trained_models = []
for model_name,model in models_dict.items():
    model.fit(processed_df, y_train)
    trained_models.append(model)
    print("------------------------"+model_name+"------------------------")
    evaluate_model(model, X_train_processed_df, y_train, X_test_processed_df, y_test)
    print()


------------------------Linear Regression------------------------
r2score ----> train: 0.6757939349245148 test: -37800467.244538665
rmse: ----> train: 305376.3415167508 test: 2880032297.5228004
mae: ----> train: 155466.77267628151 test: 157265208.83978835
mse: ----> train: 93254709958.15521 test: 8.294586034774459e+18



  model = cd_fast.enet_coordinate_descent(


------------------------Lasso------------------------
r2score ----> train: 0.6757939014880354 test: 0.733109550132691
rmse: ----> train: 305376.3572640032 test: 241999.95496201314
mae: ----> train: 155456.8761563913 test: 147586.7491394973
mse: ----> train: 93254719575.83212 test: 58563978201.61639

------------------------Ridge------------------------
r2score ----> train: 0.6754756415061671 test: 0.7328357815948836
rmse: ----> train: 305526.2082915262 test: 242124.04144153843
mae: ----> train: 154572.14984649542 test: 146749.69263593483
mse: ----> train: 93346263952.99704 test: 58624051443.98382

------------------------K-Neihbors Regressor------------------------
r2score ----> train: 0.711883417336163 test: 0.5019692887887643
rmse: ----> train: 287878.3121761513 test: 330580.19425887003
mae: ----> train: 112818.34874371858 test: 146042.18035714285
mse: ----> train: 82873922621.38963 test: 109283264836.23225

------------------------Decision Tree------------------------
r2score ----> 

## 8. Hyperparameter tuning of the best model using RandomizedSearchCV

In [55]:
from sklearn.model_selection import RandomizedSearchCV

params = {'n_estimators':[5, 25, 50, 75, 100, 150, 200, 210, 220, 230, 240, 250, 300],
          'criterion':['absolute_error', 'friedman_mse', 'poisson','squared_error'],
          'max_depth':[None, 2, 4, 6, 8, 10],
          'min_samples_split':[2, 4, 8, 16],
          'min_samples_leaf':[1, 2, 4, 8, 16],
          'max_features':['sqrt', 'log2', None],
          'max_samples':[0.1, 0.2, 0.4, 0.8, 1.0],
          'warm_start':[True, False]}

rcv = RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=-1), param_distributions=params, n_iter=200, scoring='neg_root_mean_squared_error', cv=5, return_train_score=True)
rcv.fit(X_train_processed_df, y_train)

## 9.Evaluating the finetuned model

In [56]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

rmse_train = root_mean_squared_error(y_train, rcv.predict(X_train_processed_df))
r2Score_train = r2_score(y_train, rcv.predict(X_train_processed_df))
rmse_train, r2Score_train

76132.6997662906

In [59]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

predicted_price = rcv.predict(X_test_processed_df)
rmse_test = root_mean_squared_error(y_test, predicted_price)
r2Score_test = r2_score(y_test, predicted_price)
rmse_test, r2Score_test

134010.80484087745

In [61]:
predicted_price_df = pd.DataFrame(predicted_price,index=y_test.index, columns=['predicted selling price'])
final_df = pd.concat([y_test.to_frame(), predicted_price_df],axis=1).sort_index()
final_df

Unnamed: 0,selling_price,predicted selling price
8,350000,286205.407429
15,400000,408878.614921
16,778000,708711.555556
18,150000,103887.680952
20,174000,211628.321937
...,...,...
8094,200000,300032.761905
8107,325000,465114.476772
8110,425000,429597.577587
8113,425000,566620.819397


In [62]:
rcv.best_params_

{'warm_start': False,
 'n_estimators': 75,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_samples': 1.0,
 'max_features': None,
 'max_depth': None,
 'criterion': 'squared_error'}

In [63]:
rcv_results_df = pd.DataFrame(rcv.cv_results_)
rcv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_warm_start,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_samples,param_max_features,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.305079,0.029600,0.075832,0.001829,False,210,8,16,0.4,sqrt,...,-305130.364816,26638.424949,106,-307010.611876,-292750.604020,-300302.777889,-291176.483811,-304022.693126,-299052.634144,6186.163318
1,0.121001,0.010765,0.047905,0.004215,True,50,2,16,0.8,,...,-256547.714483,24765.648648,64,-241140.897213,-240467.041131,-249318.246261,-241074.424099,-245218.817678,-243443.885276,3389.342218
2,0.303657,0.037755,0.081269,0.009097,False,200,8,1,0.4,log2,...,-407220.578611,27148.561403,181,-401897.396720,-403531.517817,-408010.507628,-397667.263031,-404339.420445,-403089.221128,3370.334124
3,0.069831,0.015350,0.028818,0.008507,False,50,8,8,0.1,sqrt,...,-337901.858716,28568.084354,134,-342402.982978,-331242.135150,-333896.538641,-332046.320961,-336446.716450,-335206.938836,4018.617228
4,2.117937,0.043801,0.036769,0.011928,True,75,2,8,0.8,,...,-206322.951828,23404.129368,13,-191678.031089,-184374.555083,-200271.442345,-175730.611318,-194145.192787,-189239.966524,8460.186391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.225453,0.013393,0.057547,0.010480,True,210,8,8,1.0,log2,...,-408904.705240,27698.286559,185,-406347.048248,-408879.655216,-410714.579046,-404899.410475,-402991.081219,-406766.354841,2755.666999
196,0.239019,0.043254,0.042797,0.006564,False,150,16,16,1.0,,...,-232154.550389,25534.499623,37,-213108.788050,-205726.688680,-216578.071762,-214605.843648,-220348.466536,-214073.571735,4827.657876
197,0.323019,0.031411,0.061738,0.009135,True,250,8,1,0.8,sqrt,...,-254605.311634,24116.916272,59,-223784.168378,-213716.894067,-226849.622242,-210632.402884,-221454.571484,-219287.531811,6132.605884
198,0.334119,0.038645,0.070001,0.012664,True,250,2,1,0.1,sqrt,...,-273808.266642,29819.850921,82,-264145.299767,-252308.400072,-263722.013097,-244542.465482,-264478.448431,-257839.325370,8072.755422
