In [39]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics

In [20]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [21]:
#drop the name and year columns because it is irrelevant in our model building
data = data.drop(['name', 'year'], axis=1)

In [22]:
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [23]:
# get dummny data
data = pd.get_dummies(data)
data

Unnamed: 0,age,mileage,price,manufacturer_Abarth,manufacturer_Alfa-Romero,manufacturer_Audi,manufacturer_BMW,manufacturer_Bentley,manufacturer_Chevrolet,manufacturer_Chrysler,...,manufacturer_Volkswagen,manufacturer_Volvo,engine_Diesel,engine_Electric,engine_Hybrid,engine_Petrol,engine_Plug_in_hybrid,transmission_Automatic,transmission_Manual,transmission_Semiautomatic
0,14,63131,7499,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,11,61890,7775,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,9,129170,6950,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,8,44900,7790,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,4,32012,15999,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1,10290,22000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2982,1,16193,27000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2983,4,59926,16000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2984,1,12355,30000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [24]:
# creating X and y variables
X = data.drop('price', axis=1)
y = data.price

In [25]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

In [26]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2090, 50), (896, 50), (2090,), (896,))

In [27]:
# feature scale the X_train and X_test values

scale = StandardScaler().fit(X_train)

# transform training data
X_train = scale.transform(X_train)

# transform testing data
X_test = scale.transform(X_test)

print(X_train)
print('\n')
print(X_test)

[[-0.21452177 -0.47546652 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]
 [-1.16913606 -1.09127695 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]
 [-1.48734082 -1.05419589 -0.05365705 ... -0.55266093 -1.3256153
   2.6018295 ]
 ...
 [ 1.69470679  0.37176945 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]
 [ 0.10368299  0.04714465 -0.05365705 ... -0.55266093  0.75436667
  -0.38434494]
 [ 1.37650203 -0.28906798 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]]


[[-1.16913606 -1.09339587 -0.05365705 ... -0.55266093  0.75436667
  -0.38434494]
 [-0.53272654 -0.84574737 -0.05365705 ... -0.55266093 -1.3256153
   2.6018295 ]
 [-1.16913606 -1.10356005 -0.05365705 ... -0.55266093 -1.3256153
   2.6018295 ]
 ...
 [ 2.96752584  3.26574741 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]
 [ 1.05829727  0.45155994 -0.05365705 ...  1.80942771 -1.3256153
  -0.38434494]
 [-1.16913606 -1.03969455 -0.05365705 ... -0.55266093  0.75436667
  -0.38434494]]


In [28]:
#linear regression
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [29]:
#Ridge regression
rg = linear_model.Ridge(alpha=.5)
rg.fit(X_train, y_train)

Ridge(alpha=0.5)

In [30]:
#Random Forest Regressor
rf=  RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=123)

In [31]:
#AdaBoost
ab =  AdaBoostRegressor(random_state=123)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=123)

In [32]:
#Suport Vector Regressor
sv =  SVR()
sv.fit(X_train, y_train)

SVR()

In [33]:
#lasso regression
ls = linear_model.Lasso(alpha=0.1)
ls.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.1)

In [34]:
#Bayesian Regression
br = linear_model.BayesianRidge()
br.fit(X_train, y_train)

BayesianRidge()

In [35]:
train_score = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'score': [lr.score(X_train, y_train), rg.score(X_train, y_train),
                rf.score(X_train, y_train) , ab.score(X_train, y_train),
                sv.score(X_train, y_train), ls.score(X_train, y_train),
                br.score(X_train, y_train)]
    })
train_score            

Unnamed: 0,model,score
0,Linear Regression,0.729328
1,Ridge Regression,0.729342
2,Random Forest,0.961248
3,AdaBoost Regressor,0.592583
4,Support Vector Regression,0.002239
5,Lasso regression,0.729342
6,Bayesian Regression,0.729268


In [36]:
lr_pred = lr.predict(X_test)
rg_pred = rg.predict(X_test)
rf_pred = rf.predict(X_test)
ab_pred = ab.predict(X_test)
sv_pred = sv.predict(X_test)
ls_pred = ls.predict(X_test)
br_pred = br.predict(X_test)

In [40]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'mae': [round(mean_absolute_error(y_test, lr_pred), 4), round(mean_absolute_error(y_test, rg_pred), 4),
           round(mean_absolute_error(y_test, rf_pred), 4), round(mean_absolute_error(y_test, ab_pred), 4),
           round(mean_absolute_error(y_test, sv_pred), 4), round(mean_absolute_error(y_test, ls_pred), 4),
           round(mean_absolute_error(y_test, br_pred), 4)],
    
    'mse': [round(mean_squared_error(y_test, lr_pred), 4), round(mean_squared_error(y_test, rg_pred), 4),
            round(mean_squared_error(y_test, rf_pred), 4), round(mean_squared_error(y_test, ab_pred), 4),
            round(mean_squared_error(y_test, sv_pred), 4), round(mean_squared_error(y_test, ls_pred), 4),
            round(mean_squared_error(y_test, br_pred), 4)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, lr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, rg_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ab_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, sv_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ls_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, br_pred)))]
})
best_model

Unnamed: 0,model,mae,mse,rmse
0,Linear Regression,19019390000000.0,3.2411659999999995e+29,569312400000000.0
1,Ridge Regression,3014.139,18423580.0,4292.269
2,Random Forest,2789.11,14415250.0,3796.742
3,AdaBoost Regressor,3818.332,20794460.0,4560.095
4,Support Vector Regression,6005.804,49933890.0,7066.392
5,Lasso regression,3014.632,18446070.0,4294.889
6,Bayesian Regression,3013.941,18290220.0,4276.707
