In [73]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import ShuffleSplit

In [74]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,make,year,age,mileage,transmission,colour,price
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,2004 HONDA CIVIC LX,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,1993 FORD MUSTANG LX,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,1998 HONDA CR-V LX,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [75]:
#drop the name column because it is irrelevant in our model building
data = data.drop('name', axis=1)

In [76]:
data.head()

Unnamed: 0,make,year,age,mileage,transmission,colour,price
0,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [77]:
# get dummny data
data = pd.get_dummies(data)
data

Unnamed: 0,year,age,mileage,price,make_ACURA,make_AUDI,make_BMW,make_BUICK,make_CADILLAC,make_CHEVROLET,...,colour_MAROON,colour_ORANGE,colour_PURPLE,colour_RED,colour_SILVER,colour_TAN,colour_TEAL,colour_TWO TONE,colour_WHITE,colour_YELLOW
0,2000,21,1.0,357,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004,17,134095.0,850,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1993,28,99086.0,975,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1998,23,194018.0,925,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1999,22,264054.0,750,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,2006,15,182744.0,450,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905,2006,15,239054.0,950,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
906,1997,24,220043.0,800,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
907,2002,19,0.0,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# creating X and y variables
X = data.drop('price', axis=1)
y = data.price

In [79]:
#feature scaling
X = StandardScaler().fit_transform(X)
X

array([[-0.76791898,  0.76791898, -2.05885303, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [ 0.03052352, -0.03052352, -0.33473171, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [-2.16519335,  2.16519335, -0.78486194, ..., -0.03318617,
         2.43699226, -0.06648225],
       ...,
       [-1.36675085,  1.36675085,  0.77034965, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [-0.36869773,  0.36869773, -2.05886589, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [ 0.82896601, -0.82896601,  0.49587937, ..., -0.03318617,
        -0.41034189, -0.06648225]])

In [80]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

In [81]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((636, 58), (273, 58), (636,), (273,))

In [82]:
#linear regression
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [83]:
#Ridge regression
rg = linear_model.Ridge(alpha=.5)
rg.fit(X_train, y_train)

Ridge(alpha=0.5)

In [84]:
#Random Forest Regressor
rf=  RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=123)

In [85]:
#AdaBoost
ab =  AdaBoostRegressor(random_state=123)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=123)

In [86]:
#Suport Vector Regressor
sv =  SVR()
sv.fit(X_train, y_train)

SVR()

In [87]:
#lasso regression
ls = linear_model.Lasso(alpha=0.1)
ls.fit(X_train, y_train)

Lasso(alpha=0.1)

In [88]:
#Bayesian Regression
br = linear_model.BayesianRidge()
br.fit(X_train, y_train)

BayesianRidge()

In [89]:
lr_pred = lr.predict(X_test)
rg_pred = rg.predict(X_test)
rf_pred = rf.predict(X_test)
ab_pred = ab.predict(X_test)
sv_pred = sv.predict(X_test)
ls_pred = ls.predict(X_test)
br_pred = br.predict(X_test)

In [90]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'score': [round(lr.score(X_test,y_test) * 100, 4), round(rg.score(X_test,y_test) * 100, 4),
                round(rf.score(X_test,y_test) * 100, 4), round(ab.score(X_test,y_test) * 100, 4),
                round(sv.score(X_test,y_test) * 100, 4), round(ls.score(X_test,y_test) * 100, 4),
                round(br.score(X_test,y_test) * 100, 4)],
                   
    'mae': [round(mean_absolute_error(y_test, lr_pred), 4), round(mean_absolute_error(y_test, rg_pred), 4),
           round(mean_absolute_error(y_test, rf_pred), 4), round(mean_absolute_error(y_test, ab_pred), 4),
           round(mean_absolute_error(y_test, sv_pred), 4), round(mean_absolute_error(y_test, ls_pred), 4),
           round(mean_absolute_error(y_test, br_pred), 4)],
    
    'mse': [round(mean_squared_error(y_test, lr_pred), 4), round(mean_squared_error(y_test, rg_pred), 4),
            round(mean_squared_error(y_test, rf_pred), 4), round(mean_squared_error(y_test, ab_pred), 4),
            round(mean_squared_error(y_test, sv_pred), 4), round(mean_squared_error(y_test, ls_pred), 4),
            round(mean_squared_error(y_test, br_pred), 4)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, lr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, rg_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ab_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, sv_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ls_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, br_pred)))]
})
best_model

Unnamed: 0,model,score,mae,mse,rmse
0,Linear Regression,-6.507083e+23,794208900000.0,1.721996e+26,13122480000000.0
1,Ridge Regression,-7.3946,137.8155,28420.27,168.5831
2,Random Forest,-54.1233,167.7609,40786.27,201.9561
3,AdaBoost Regressor,-4.8774,140.5797,27754.15,166.5958
4,Support Vector Regression,0.0781,135.6833,26442.75,162.6123
5,Lasso regression,-7.2216,137.7469,28374.49,168.4473
6,Bayesian Regression,-0.511,136.7041,26598.64,163.0909
