In [60]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import MinMaxScaler

In [61]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,make,year,age,mileage,transmission,colour,price
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,2004 HONDA CIVIC LX,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,1993 FORD MUSTANG LX,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,1998 HONDA CR-V LX,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [62]:
#drop the name column because it is irrelevant in our model building
data = data.drop('name', axis=1)

In [63]:
data.head()

Unnamed: 0,make,year,age,mileage,transmission,colour,price
0,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [64]:
# get dummny data
data = pd.get_dummies(data)
data

Unnamed: 0,year,age,mileage,price,make_ACURA,make_AUDI,make_BMW,make_BUICK,make_CADILLAC,make_CHEVROLET,...,colour_MAROON,colour_ORANGE,colour_PURPLE,colour_RED,colour_SILVER,colour_TAN,colour_TEAL,colour_TWO TONE,colour_WHITE,colour_YELLOW
0,2000,21,1.0,357,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004,17,134095.0,850,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1993,28,99086.0,975,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1998,23,194018.0,925,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1999,22,264054.0,750,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,2006,15,182744.0,450,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905,2006,15,239054.0,950,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
906,1997,24,220043.0,800,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
907,2002,19,0.0,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# creating X and y variables
X = data.drop(columns = ['price', 'year'], axis=1)
y = data.price

In [66]:
#feature scaling
X = MinMaxScaler().fit_transform(X)
X

array([[4.47368421e-01, 2.40493878e-06, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.42105263e-01, 3.22490266e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.31578947e-01, 2.38295764e-01, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [5.26315789e-01, 5.29189944e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.94736842e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.36842105e-01, 4.77851716e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [67]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

In [68]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((636, 57), (273, 57), (636,), (273,))

In [69]:
#linear regression
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [70]:
#Ridge regression
rg = linear_model.Ridge(alpha=.5)
rg.fit(X_train, y_train)

Ridge(alpha=0.5)

In [71]:
#Random Forest Regressor
rf=  RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=123)

In [72]:
#AdaBoost
ab =  AdaBoostRegressor(random_state=123)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=123)

In [73]:
#Suport Vector Regressor
sv =  SVR()
sv.fit(X_train, y_train)

SVR()

In [74]:
#lasso regression
ls = linear_model.Lasso(alpha=0.1)
ls.fit(X_train, y_train)

Lasso(alpha=0.1)

In [75]:
#Bayesian Regression
br = linear_model.BayesianRidge()
br.fit(X_train, y_train)

BayesianRidge()

In [77]:
train_score = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'score': [lr.score(X_train, y_train), rg.score(X_train, y_train),
                rf.score(X_train, y_train) , ab.score(X_train, y_train),
                sv.score(X_train, y_train), ls.score(X_train, y_train),
                br.score(X_train, y_train)]
    })
train_score            

Unnamed: 0,model,score
0,Linear Regression,0.112072
1,Ridge Regression,0.108699
2,Random Forest,0.596391
3,AdaBoost Regressor,0.018049
4,Support Vector Regression,-0.001643
5,Lasso regression,0.108379
6,Bayesian Regression,0.038186


In [78]:
lr_pred = lr.predict(X_test)
rg_pred = rg.predict(X_test)
rf_pred = rf.predict(X_test)
ab_pred = ab.predict(X_test)
sv_pred = sv.predict(X_test)
ls_pred = ls.predict(X_test)
br_pred = br.predict(X_test)

In [79]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'score': [round(lr.score(y_test, lr_pred) * 100, 4), round(rg.score(y_test, rg_pred) * 100, 4),
                round(rf.score(y_test, rf_pred) * 100, 4), round(ab.score(y_test, rg_pred) * 100, 4),
                round(sv.score(y_test, sv_pred) * 100, 4), round(ls.score(y_test, ls_pred) * 100, 4),
                round(br.score(y_test, br_pred) * 100, 4)],
                   
    'mae': [round(mean_absolute_error(y_test, lr_pred), 4), round(mean_absolute_error(y_test, rg_pred), 4),
           round(mean_absolute_error(y_test, rf_pred), 4), round(mean_absolute_error(y_test, ab_pred), 4),
           round(mean_absolute_error(y_test, sv_pred), 4), round(mean_absolute_error(y_test, ls_pred), 4),
           round(mean_absolute_error(y_test, br_pred), 4)],
    
    'mse': [round(mean_squared_error(y_test, lr_pred), 4), round(mean_squared_error(y_test, rg_pred), 4),
            round(mean_squared_error(y_test, rf_pred), 4), round(mean_squared_error(y_test, ab_pred), 4),
            round(mean_squared_error(y_test, sv_pred), 4), round(mean_squared_error(y_test, ls_pred), 4),
            round(mean_squared_error(y_test, br_pred), 4)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, lr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, rg_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ab_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, sv_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ls_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, br_pred)))]
})
best_model

ValueError: Expected 2D array, got 1D array instead:
array=[ 800  625  550  525  950  950  900  650  450  550  600 1000  725  700
  800  900  600  675  625  900 1000  950  600  875  725  700  675  850
  950  850  975  300  725  725  950  850  550 1000  825  850  700  875
  750  850  700  675  850  950  800  700  950  800  700  975 1000  900
  999  900  700  800  850  850  900  911  750 1000  675  975  450  400
  900  975  750  825  925  525  550  800  925  700  800  850  650  750
  450  600  900  800  800  800  700  800  425  850  525  875  925  550
  950 1000  925  775  875 1000  650  750  750  925  850  525 1000 1000
  950  850  900  800 1000  525  900  950  425  800  975  625  425  675
  925  775  950  525  600  600  950  950  900  550  700  925 1000  550
  799  550  775  650  800  950  700  450  550  900  550  750  700  750
  625  600  875  975  900  750  725  550  600  350  900  675  950  650
 1000  950  890  750 1000  900  599  600  650  800  950  525  925  925
  850  750 1000  825  450  725  850  750  750  650  850  725  675  700
  975  725  500  950 1000  775  675  725  925  975  700  850  850  975
  525  775  850  800  800  950  525  700  550  750  900  875  700  525
  725  725  850 1000  975  650  650  825  950  800  525  725  850  350
  425  975  900 1000  800  650  950  800  750  975  600  675  900  725
  600  675  800  775  975  600  800  900  800  600 1000  900  800  675
  975  400  925  975  900  800  575].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.