## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_log_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
raw_data = pd.read_csv('used_cars_price_cleaned.csv')
data = raw_data.copy()
data = data.drop('Unnamed: 0', axis = 1)

data.head()

Unnamed: 0,B,C,D,E,F,J,M,S,front-wheel drive,part-time four-wheel drive,rear drive,electrocar,petrol,with damage,with mileage,make,priceUSD,year,mileage(kilometers),volume(cm3)
0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,31,565,1993,960015.0,2000.0
1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,23,5550,2008,172000.0,1400.0
2,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,56,8300,2008,223000.0,2500.0
3,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,84,3300,2005,140000.0,1200.0
4,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,56,2450,2002,413000.0,2200.0


## Splitting of dataset

In [3]:
from sklearn.model_selection import train_test_split

x = data.drop('priceUSD', axis = 1) #Independent variables
y = data['priceUSD'] #Dependent variables

#Training and test data split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1,random_state = 42)

#Training and validation data split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.1, random_state = 42)

## Scaling the data

In [4]:
#Since range of values for each feature is different, they need to be scaled with a fixed range
#MinMaxScaler scales features within a range of (0,1)

from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler() 
  
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test) 

**Since the Price of cars is not normally distributed, skewed to the left, hence it will be best to perfrom a log transformation on price.**

In [5]:
y_train_log = np.log(y_train)

y_val_log = np.log(y_val)

y_test_log = np.log(y_test)

######     

## Model creation, comparison and Evaluation

In this phase, we will use different Machine Learning algorithms to create models and compare the performance of those models.

We will use the Root Mean Squared Log Error(RMSLE) as our evaluation metric. The lower the RMSLE score, the better is the model.

The different ML algorithms to be implemented:

    1. Linear Regression

    2. Random Forest 

    3. Support Vector Machine
    
    4. K Nearest Neighbours
    
    5. XGBoost
    
    6. LightGBM
    
    7. CatBoost

At first, let's create a DataFrame that will store the validation results of all the models so that it can be compared later on.

In [6]:
Results = pd.DataFrame(columns = ['Model', 'Validation Score(Before tuning)', 'Accuracy(Before tuning)', 'Validation Score(After tuning)', 'Accuracy(After tuning)' ])

#### 1. Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

lr =  LinearRegression()

lr.fit(x_train_scaled, y_train_log) #Fitting or training the data

y_val_pred_log = lr.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results

score = ((lr.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.058461182269852544
Accuracy score:  77.317


In [8]:
Results = Results.append({'Model' : 'Linear Regression', 'Validation Score(Before tuning)': rmsle, 
                          'Accuracy(Before tuning)': score }, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,


#####   

#### 2. Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf.fit(x_train_scaled, y_train_log) #Fitting or training the data

y_val_pred_log = rf.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results
score = ((rf.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.03782334163298192
Accuracy score:  89.921


In [10]:
Results = Results.append({'Model' : 'Random Forest', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,


#####  

#### 3. K Nearest Neighbours

In [11]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn.fit(x_train_scaled, y_train_log) #Fitting or training the data

y_val_pred_log = knn.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results
score = ((knn.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.03823251764652535
Accuracy score:  89.49


In [12]:
Results = Results.append({'Model' : 'K Nearest Neighbours', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,
2,K Nearest Neighbours,0.038233,89.49,,


######   

#### 4. Support Vector machine

In [13]:
from sklearn.svm import SVR

svr = SVR()

svr.fit(x_train_scaled, y_train_log) #Fitting or training the data

y_val_pred_log = svr.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results
score = ((svr.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.047935884858714424
Accuracy score:  83.838


In [14]:
Results = Results.append({'Model' : 'Support Vector Machine', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,
2,K Nearest Neighbours,0.038233,89.49,,
3,Support Vector Machine,0.047936,83.838,,


###### 

#### 5. XGBoost

In [15]:
from xgboost import XGBRegressor

xgb = XGBRegressor()

xgb.fit(x_train_scaled, y_train_log) #Fitting or training the data

y_val_pred_log = xgb.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results
score = ((xgb.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.04095935448169442
Accuracy score:  87.925


In [16]:
Results = Results.append({'Model' : 'XGBoost', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,
2,K Nearest Neighbours,0.038233,89.49,,
3,Support Vector Machine,0.047936,83.838,,
4,XGBoost,0.040959,87.925,,


#####    

#### 6. Light GBM (LGBM)

In [17]:
from lightgbm import LGBMRegressor

lgbr = LGBMRegressor()

lgbr.fit(x_train_scaled, y_train_log) #Fitting or training the data
 
y_val_pred_log = lgbr.predict(x_val_scaled)#Predicting on validation set

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log)) #Evaluating model rmse score by comparing actual and predicted results
score = ((lgbr.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

RMSLE score:  0.036177417400881436
Accuracy score:  90.781


In [18]:
Results = Results.append({'Model' : 'Light GBM', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,
2,K Nearest Neighbours,0.038233,89.49,,
3,Support Vector Machine,0.047936,83.838,,
4,XGBoost,0.040959,87.925,,
5,Light GBM,0.036177,90.781,,


#####   

#### 7. CatBoost

In [19]:
from catboost import CatBoostRegressor

cb = CatBoostRegressor()

cb.fit(x_train_scaled, y_train_log, verbose = 200)

y_val_pred_log = cb.predict(x_val_scaled)

rmsle = np.sqrt(mean_squared_log_error(y_val_log, y_val_pred_log))
score = ((cb.score(x_val_scaled,y_val_log))*100).round(3)

print("RMSLE score: ", rmsle)

print("Accuracy score: ", score)

Learning rate set to 0.07236
0:	learn: 0.9634298	total: 154ms	remaining: 2m 33s
200:	learn: 0.3110477	total: 1.66s	remaining: 6.59s
400:	learn: 0.2879327	total: 3.26s	remaining: 4.87s
600:	learn: 0.2771087	total: 4.8s	remaining: 3.18s
800:	learn: 0.2698537	total: 6.25s	remaining: 1.55s
999:	learn: 0.2637301	total: 7.77s	remaining: 0us
RMSLE score:  0.03534135196770207
Accuracy score:  91.283


In [20]:
Results = Results.append({'Model' : 'CatBoost', 'Validation Score(Before tuning)': rmsle,
                         'Accuracy(Before tuning)': score}, ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Accuracy(Before tuning),Validation Score(After tuning),Accuracy(After tuning)
0,Linear Regression,0.058461,77.317,,
1,Random Forest,0.037823,89.921,,
2,K Nearest Neighbours,0.038233,89.49,,
3,Support Vector Machine,0.047936,83.838,,
4,XGBoost,0.040959,87.925,,
5,Light GBM,0.036177,90.781,,
6,CatBoost,0.035341,91.283,,
