In [1]:
### importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [2]:
### dataset

df = pd.read_csv('cardekho.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [4]:
## drop columns

df.drop(columns=['Unnamed: 0','car_name','brand'],axis=1,inplace=True)

## info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              15411 non-null  object 
 1   vehicle_age        15411 non-null  int64  
 2   km_driven          15411 non-null  int64  
 3   seller_type        15411 non-null  object 
 4   fuel_type          15411 non-null  object 
 5   transmission_type  15411 non-null  object 
 6   mileage            15411 non-null  float64
 7   engine             15411 non-null  int64  
 8   max_power          15411 non-null  float64
 9   seats              15411 non-null  int64  
 10  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.3+ MB


In [5]:
## check for null values

df.isnull().sum()

model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [6]:
### independent and dependent column

X = df.drop(columns=['selling_price'],axis=1)
y = df['selling_price']

In [7]:
## converting model column

from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()

X['model'] = label.fit_transform(X['model'])

X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [13]:
### Column transformer

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

one_features = ['seller_type','fuel_type','transmission_type']
scaler_features = X.select_dtypes(exclude='object').columns

encoder = OneHotEncoder(drop='first',sparse_output=False)
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    [('onehotencoder',encoder, one_features),
    ('scaler',scaler,scaler_features)],
    remainder='passthrough'
)

preprocessor

0,1,2
,transformers,"[('onehotencoder', ...), ('scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [14]:
X = preprocessor.fit_transform(X)
X

array([[ 1.        ,  0.        ,  0.        , ..., -1.32425883,
        -1.26335238, -0.40302241],
       [ 1.        ,  0.        ,  0.        , ..., -0.55471774,
        -0.43257082, -0.40302241],
       [ 1.        ,  0.        ,  0.        , ..., -0.55471774,
        -0.47911321, -0.40302241],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.02291783,
         0.06822523, -0.40302241],
       [ 0.        ,  0.        ,  1.        , ...,  1.32979434,
         0.91715831,  2.07344426],
       [ 0.        ,  0.        ,  0.        , ...,  0.02099878,
         0.39588361, -0.40302241]], shape=(15411, 14))

In [15]:
## evaluation function

def metrics(true, predicted):
    r2 = r2_score(true, predicted)
    rmse = root_mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)

    return r2, rmse, mae

In [16]:
## train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
## model training

models = {
    'linear regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNR': KNeighborsRegressor(),
    'DT': DecisionTreeRegressor(),
    'RF': RandomForestRegressor(),
    'AB': AdaBoostRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    ## model prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    ## train model metrics
    train_r2, train_rmse, train_mae = metrics(y_train, y_train_pred)
    test_r2, test_rmse, test_mae = metrics(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("model performance of training set")
    print("RMSE for train: ", train_rmse)
    print("MAE for train: ", train_mae)
    print("R2 for train: ", train_r2)

    print("="*35)

    print("model performance of testing set")
    print("RMSE for test: ", test_rmse)
    print("MAE for test: ", test_mae)
    print("R2 for test: ", test_r2)

    print("="*35)
    print("\n")


linear regression
model performance of training set
RMSE for train:  553855.6665411663
MAE for train:  268101.6070829936
R2 for train:  0.6217719576765959
model performance of testing set
RMSE for test:  502543.59302309836
MAE for test:  279618.5794158427
R2 for test:  0.6645109298852006


Ridge
model performance of training set
RMSE for train:  553856.3159709624
MAE for train:  268059.8014688311
R2 for train:  0.6217710706848424
model performance of testing set
RMSE for test:  502533.8229890288
MAE for test:  279557.2168930274
R2 for test:  0.6645239743566811


Lasso
model performance of training set
RMSE for train:  553855.6709544231
MAE for train:  268099.22264981153
R2 for train:  0.6217719516489697
model performance of testing set
RMSE for test:  502542.66963789385
MAE for test:  279614.7461034126
R2 for test:  0.6645121627547996


KNR
model performance of training set
RMSE for train:  325873.02669334516
MAE for train:  91425.63270603504
R2 for train:  0.8690645193508602
model per

In [18]:
## hyperparameter tuning

## hyperparameter tuning for RF

rf_params = {
    'max_depth':[5,8,15,None,10],
    'max_features':[5,7,'auto',10],
    'min_samples_split':[2,8,15,20],
    'n_estimators':[100,200,500,1000]
}

## hyperparameter for Ada Boost
ab_params = {
    'n_estimators':[50,60,70],
    'loss':['linear','square','exponential']
}

### model list for hyper parameter tuning

randomcv_models = [('AB', AdaBoostRegressor(),ab_params),
                   ('RF',RandomForestRegressor(),rf_params)
                   ]

from sklearn.model_selection import RandomizedSearchCV

model_param = {}

for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator = model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name]=random.best_params_

for model_name in model_param:
    print(f"Best parameters for {model_name}:")
    print(model_param[model_name])



Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


72 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "d:\KNMaterial\my_python\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\KNMaterial\my_python\venv\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "d:\KNMaterial\my_python\venv\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "d:\KNMaterial\my_python\venv\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise

Best parameters for AB:
{'n_estimators': 60, 'loss': 'linear'}
Best parameters for RF:
{'n_estimators': 500, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}


In [19]:
models ={
    "Random Forest": RandomForestRegressor(n_estimators=500, min_samples_split= 2, max_features= 7, max_depth= None),
    "AdaBoost Regressor": AdaBoostRegressor(n_estimators= 60, loss= 'linear')
}

for i in range(len(models)):
    ## take the ith model
    model = list(models.values())[i]

    ## fit the model
    model.fit(X_train, y_train)

    ## predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    ## train model metrics
    train_r2, train_rmse, train_mae = metrics(y_train, y_train_pred)
    test_r2, test_rmse, test_mae = metrics(y_test, y_test_pred)

    print(list(models.keys())[i])

    print("model performance of training set")
    print("RMSE for train: ", train_rmse)
    print("MAE for train: ", train_mae)
    print("R2 for train: ", train_r2)

    print("="*35)

    print("model performance of testing set")
    print("RMSE for test: ", test_rmse)
    print("MAE for test: ", test_mae)
    print("R2 for test: ", test_r2)

    print("="*35)
    print("\n")

Random Forest
model performance of training set
RMSE for train:  128177.1809718806
MAE for train:  38973.68534365672
R2 for train:  0.9797426818112129
model performance of testing set
RMSE for test:  212881.29910913727
MAE for test:  98214.35561227794
R2 for test:  0.9397986932158598


AdaBoost Regressor
model performance of training set
RMSE for train:  453726.4118513868
MAE for train:  334809.3615565197
R2 for train:  0.7461666745142237
model performance of testing set
RMSE for test:  488863.6954226978
MAE for test:  353286.56554584403
R2 for test:  0.6825272400001459


