In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('cardekho_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [11]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [12]:
df.shape

(15411, 13)

In [13]:
# null values check
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           15411 non-null  object 
 1   brand              15411 non-null  object 
 2   model              15411 non-null  object 
 3   vehicle_age        15411 non-null  int64  
 4   km_driven          15411 non-null  int64  
 5   seller_type        15411 non-null  object 
 6   fuel_type          15411 non-null  object 
 7   transmission_type  15411 non-null  object 
 8   mileage            15411 non-null  float64
 9   engine             15411 non-null  int64  
 10  max_power          15411 non-null  float64
 11  seats              15411 non-null  int64  
 12  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1.5+ MB


In [15]:
df.describe()

Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price
count,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0
mean,6.036338,55616.48,19.701151,1486.057751,100.588254,5.325482,774971.1
std,3.013291,51618.55,4.171265,521.106696,42.972979,0.807628,894128.4
min,0.0,100.0,4.0,793.0,38.4,0.0,40000.0
25%,4.0,30000.0,17.0,1197.0,74.0,5.0,385000.0
50%,6.0,50000.0,19.67,1248.0,88.5,5.0,556000.0
75%,8.0,70000.0,22.7,1582.0,117.3,5.0,825000.0
max,29.0,3800000.0,33.54,6592.0,626.0,9.0,39500000.0


In [16]:
# We have model column so car_name and brand drop
df.drop(['car_name','brand'],axis=1,inplace=True)

In [17]:
df.columns

Index(['model', 'vehicle_age', 'km_driven', 'seller_type', 'fuel_type',
       'transmission_type', 'mileage', 'engine', 'max_power', 'seats',
       'selling_price'],
      dtype='object')

In [18]:
df.model.unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [19]:
# Getting Different features
# category features
cat_features = [feature for feature in df.columns if df[feature].dtype=='O']
print('Number of categorical features:',len(cat_features))
# numerical features
num_features = [feature for feature in df.columns if df[feature].dtype!='O']
print('Number of numerical features:',len(num_features))
# discrete Features
discrete_features = [feature for feature in num_features if len(df[feature].unique())<=25]
print('Number of discrete features:',len(discrete_features))
# continuos features
# one way 
# feature = [feature for feature in num_features if feature not in discrete_features]
conti_features = [feature for feature in df.columns 
                  if df[feature].dtype in ['int64','int32','float32','float64'] and len(df[feature].unique())>25]
print('Number of continuous features:',len(conti_features))

Number of categorical features: 4
Number of numerical features: 7
Number of discrete features: 2
Number of continuous features: 5


## Feature Engineering
### Encoding and scaling

In [21]:
# Splitting Data
X = df.drop('selling_price',axis=1)
y = df['selling_price']
X.head(), X.shape

(      model  vehicle_age  km_driven seller_type fuel_type transmission_type  \
 0      Alto            9     120000  Individual    Petrol            Manual   
 1     Grand            5      20000  Individual    Petrol            Manual   
 2       i20           11      60000  Individual    Petrol            Manual   
 3      Alto            9      37000  Individual    Petrol            Manual   
 4  Ecosport            6      30000      Dealer    Diesel            Manual   
 
    mileage  engine  max_power  seats  
 0    19.70     796      46.30      5  
 1    18.90    1197      82.00      5  
 2    17.00    1197      80.00      5  
 3    20.92     998      67.10      5  
 4    22.77    1498      98.59      5  ,
 (15411, 10))

In [22]:
# Using cloumn transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler

le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])


In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((11558, 10), (3853, 10), (11558,), (3853,))

In [24]:
X_train.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
11335,84,9,55324,Dealer,Petrol,Manual,17.92,1086,62.1,5
4885,71,6,99000,Dealer,Diesel,Automatic,19.3,1968,141.0,5
14692,73,5,49000,Dealer,Petrol,Manual,16.2,1199,74.0,5
12368,118,4,40000,Individual,Petrol,Manual,18.6,1197,81.83,5
7093,54,3,7245,Dealer,Petrol,Manual,18.9,1197,81.86,5


In [25]:
X_test.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
3334,117,12,73000,Dealer,Petrol,Manual,20.36,1197,78.9,5
10928,13,4,58000,Individual,Diesel,Manual,27.39,1248,74.0,5
2518,42,7,96000,Dealer,Diesel,Manual,20.77,1248,88.76,7
11322,25,1,4500,Dealer,Petrol,Automatic,18.4,1498,119.35,5
9394,7,11,62000,Dealer,Petrol,Manual,19.7,796,46.3,5


In [26]:
num_features

['vehicle_age',
 'km_driven',
 'mileage',
 'engine',
 'max_power',
 'seats',
 'selling_price']

In [27]:
encode_cols = ['seller_type','fuel_type','transmission_type']
scaler_cols =  ['vehicle_age','km_driven','mileage','engine','max_power','seats']

oh_transformer = OneHotEncoder(drop='first')
scaler = StandardScaler()

preprocessor = ColumnTransformer([
                                 ('OneHotEncoder',oh_transformer,encode_cols),
                                 ('StandardScaler',scaler,scaler_cols)], remainder='passthrough')
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Models

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [30]:
# function for model evaluation 
def model_evaluation(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2 = r2_score(true,predicted)
    return mae,mse,rmse,r2

In [31]:
# Testing diff. models
models = {'Linear Regression': LinearRegression(),
          'Lasso': Lasso(), 'Ridge': Ridge(),
          'KNN': KNeighborsRegressor(),
          'SVM': SVR(),
          'Decision Tree': DecisionTreeRegressor(),
          'Random Forest': RandomForestRegressor(),
          'Gradient Boosting': GradientBoostingRegressor(),
          'Ada Boost': AdaBoostRegressor(),
          'XG Boost': XGBRegressor()}

for name,model in models.items():
    model.fit(X_train_transformed,y_train)  # model training

    # Make Predictions
    y_train_pred = model.predict(X_train_transformed)
    y_test_pred = model.predict(X_test_transformed)

    # Evaluation
    train_mae,train_mse,train_rmse,train_r2 = model_evaluation(y_train,y_train_pred)
    test_mae,test_mse,test_rmse,test_r2 = model_evaluation(y_test,y_test_pred)

    print("Model Name:",name)
    print('Model performance for training set')
    print('Mean Absolute Error:',train_mae)
    print('Mean Squared Error:',train_mse)
    print('Root Mean Squared Error:',train_rmse)
    print('R-2 Score:',train_r2)

    print('-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-')

    print('Model performance for testing set')
    print('Mean Absolute Error:',test_mae)
    print('Mean Squared Error:',test_mse)
    print('Root Mean Squared Error:',test_rmse)
    print('R-2 Score:',test_r2)
    print("="*35)
    print('\n')

Model Name: Linear Regression
Model performance for training set
Mean Absolute Error: 266675.1075542438
Mean Squared Error: 304874315292.8461
Root Mean Squared Error: 552154.249547032
R-2 Score: 0.6219860307551311
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
Model performance for testing set
Mean Absolute Error: 284283.4459533757
Mean Squared Error: 270286925822.7506
Root Mean Squared Error: 519891.2634606881
R-2 Score: 0.6524693637784796


Model Name: Lasso
Model performance for training set
Mean Absolute Error: 266674.15319678007
Mean Squared Error: 304874327625.78357
Root Mean Squared Error: 552154.2607150502
R-2 Score: 0.6219860154635096
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
Model performance for testing set
Mean Absolute Error: 284283.913148972
Mean Squared Error: 270286329746.052
Root Mean Squared Error: 519890.69018982444
R-2 Score: 0.6524701302044311


Model Name: Ridge
Model performance for training set
Mean Absolute Error: 266635.9414703742
Mean Squared Error: 304875

In [53]:
# Hyperparameter tuning 

rf_params ={"max_depth": [5, 8, 15, None, 10], "max_features": [5, 7, "auto", 8],
            "min_samples_split": [2, 8, 15, 20], "n_estimators": [100, 200, 500, 1000]}

gradient_params={"loss": ['squared_error', 'huber', 'absolute_error'],"criterion": ['friedman_mse', 'squared_error', 'mse'],
                 "min_samples_split": [2, 8, 15, 20],"n_estimators": [100, 200, 500, 1000],
                 "max_depth": [5, 8, 15, None, 10],"learning_rate": [0.1, 0.01, 0.02,0.03]}

from sklearn.model_selection import RandomizedSearchCV


In [63]:
# models for Hyperparameter
randomcv_models = {'Random Forest': [RandomForestRegressor(),rf_params],
                   'Gradient Boosting': [GradientBoostingRegressor(),gradient_params]}

In [65]:
for name,(model,params) in randomcv_models.items():
    
    random_cv = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=100, cv=3, n_jobs=-1,verbose=3)
    random_cv.fit(X_train_transformed,y_train)# model training

    best = random_cv.best_params_

    print(f"-------------Best Parameters for {name}-------------")
    print(best)
   

Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------Best Parameters for Random Forest-------------
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 5, 'max_depth': 15}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------Best Parameters for Gradient Boosting-------------
{'n_estimators': 500, 'min_samples_split': 8, 'max_depth': 10, 'loss': 'huber', 'learning_rate': 0.1, 'criterion': 'squared_error'}


In [67]:
# Training and Testing with best parameters
rf = RandomForestRegressor(max_depth = 15, max_features = 7, min_samples_split = 2, n_estimators = 200)
rf.fit(X_train_transformed,y_train)
y_pred = rf.predict(X_test_transformed)

test_mae,test_mse,test_rmse,test_r2 = model_evaluation(y_test,y_pred)

print('Random Forest performance for testing set')
print('Mean Absolute Error:',test_mae)
print('Mean Squared Error:',test_mse)
print('Root Mean Squared Error:',test_rmse)
print('R-2 Score:',test_r2)
print("="*35)

# Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=500, min_samples_split=8, max_depth=10,
                               loss='huber', learning_rate=0.1, criterion='squared_error')
gb.fit(X_train_transformed,y_train)
y_pred = gb.predict(X_test_transformed)

test_mae,test_mse,test_rmse,test_r2 = model_evaluation(y_test,y_pred)

print('Gradient Boost performance for testing set')
print('Mean Absolute Error:',test_mae)
print('Mean Squared Error:',test_mse)
print('Root Mean Squared Error:',test_rmse)
print('R-2 Score:',test_r2)
print("="*35)

Random Forest performance for testing set
Mean Absolute Error: 101060.866235027
Mean Squared Error: 54348843951.7566
Root Mean Squared Error: 233128.38512664346
R-2 Score: 0.9301191196763825
Gradient Boost performance for testing set
Mean Absolute Error: 102153.68100067398
Mean Squared Error: 53791104515.43317
Root Mean Squared Error: 231929.09372356278
R-2 Score: 0.9308362521849614


In [None]:
xgboost_params = {"learning_rate": [0.1, 0.01],"max_depth": [5, 10, 15, 20],
                  "n_estimators": [100, 200, 300],"colsample_bytree": [0.5, 0.8, 1, 0.4]}

random_cv = RandomizedSearchCV(estimator=XGBRegressor(), param_distributions=xgboost_params, n_iter=100, cv=3, n_jobs=-1,verbose=3)
random_cv.fit(X_train_transformed,y_train)# model training

best = random_cv.best_params_

print(f"-------------Best Parameters for XG Boost-------------")
print(best)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


In [81]:
## XG Boost

xgb = XGBRegressor(n_estimators=300, max_depth=5,
                                learning_rate=0.1, colsample_bytree=0.5)
xgb.fit(X_train_transformed,y_train)
y_pred = xgb.predict(X_test_transformed)

test_mae,test_mse,test_rmse,test_r2 = model_evaluation(y_test,y_pred)

print('Gradient Boost performance for testing set')
print('Mean Absolute Error:',test_mae)
print('Mean Squared Error:',test_mse)
print('Root Mean Squared Error:',test_rmse)
print('R-2 Score:',test_r2)
print("="*35)

Gradient Boost performance for testing set
Mean Absolute Error: 104792.51550334155
Mean Squared Error: 116001346644.0717
Root Mean Squared Error: 340589.70425435895
R-2 Score: 0.8508473038673401
