In [75]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [76]:
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,manufacturer,year,age,mileage,engine,transmission,price
0,Mazda MX5,Mazda,2007,14,63131,Petrol,Manual,7499
1,Jaguar XF,Jaguar,2010,11,61890,Petrol,Automatic,7775
2,Audi A6,Audi,2012,9,129170,Diesel,Automatic,6950
3,Nissan Qashqai,Nissan,2013,8,44900,Petrol,Automatic,7790
4,MINI Mini,Mini,2017,4,32012,Petrol,Manual,15999


In [77]:
#drop the name and year columns because it is irrelevant in our model building
data = data.drop(['name', 'year'], axis=1)

In [78]:
data.head()

Unnamed: 0,manufacturer,age,mileage,engine,transmission,price
0,Mazda,14,63131,Petrol,Manual,7499
1,Jaguar,11,61890,Petrol,Automatic,7775
2,Audi,9,129170,Diesel,Automatic,6950
3,Nissan,8,44900,Petrol,Automatic,7790
4,Mini,4,32012,Petrol,Manual,15999


In [79]:
# get dummny data
data = pd.get_dummies(data)
data

Unnamed: 0,age,mileage,price,manufacturer_Abarth,manufacturer_Alfa-Romero,manufacturer_Audi,manufacturer_BMW,manufacturer_Bentley,manufacturer_Chevrolet,manufacturer_Chrysler,...,manufacturer_Volkswagen,manufacturer_Volvo,engine_Diesel,engine_Electric,engine_Hybrid,engine_Petrol,engine_Plug_in_hybrid,transmission_Automatic,transmission_Manual,transmission_Semiautomatic
0,14,63131,7499,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,11,61890,7775,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,9,129170,6950,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,8,44900,7790,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,4,32012,15999,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,1,10290,22000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2982,1,16193,27000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2983,4,59926,16000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2984,1,12355,30000,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [80]:
# creating X and y variables
X = data.drop('price', axis=1)
y = data.price

In [81]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

In [82]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2090, 50), (896, 50), (2090,), (896,))

In [83]:
# feature scale the X_train and X_test values

norm = MinMaxScaler().fit(X_train)

# transform training data
X_train = norm.transform(X_train)

# transform testing data
X_test = norm.transform(X_test)

print(X_train)
print('\n')
print(X_test)

[[0.16666667 0.13713793 0.         ... 1.         0.         0.        ]
 [0.04166667 0.0308516  0.         ... 1.         0.         0.        ]
 [0.         0.03725164 0.         ... 0.         0.         1.        ]
 ...
 [0.41666667 0.28336733 0.         ... 1.         0.         0.        ]
 [0.20833333 0.22733844 0.         ... 0.         1.         0.        ]
 [0.375      0.16930954 0.         ... 1.         0.         0.        ]]


[[0.04166667 0.03048589 0.         ... 0.         1.         0.        ]
 [0.125      0.07322899 0.         ... 0.         0.         1.        ]
 [0.04166667 0.02873159 0.         ... 0.         0.         1.        ]
 ...
 [0.58333333 0.7828559  0.         ... 1.         0.         0.        ]
 [0.33333333 0.29713884 0.         ... 1.         0.         0.        ]
 [0.04166667 0.03975451 0.         ... 0.         1.         0.        ]]


In [84]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

In [85]:
#linear regression
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [86]:
#Ridge regression
rg = linear_model.Ridge(alpha=.5)
rg.fit(X_train, y_train)

Ridge(alpha=0.5)

In [87]:
#Random Forest Regressor
rf=  RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=123)

In [88]:
#AdaBoost
ab =  AdaBoostRegressor(random_state=123)
ab.fit(X_train, y_train)

AdaBoostRegressor(random_state=123)

In [89]:
#Suport Vector Regressor
sv =  SVR()
sv.fit(X_train, y_train)

SVR()

In [90]:
#lasso regression
ls = linear_model.Lasso(alpha=0.1)
ls.fit(X_train, y_train)

Lasso(alpha=0.1)

In [91]:
#Bayesian Regression
br = linear_model.BayesianRidge()
br.fit(X_train, y_train)

BayesianRidge()

In [92]:
train_score = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'score': [lr.score(X_train, y_train), rg.score(X_train, y_train),
                rf.score(X_train, y_train) , ab.score(X_train, y_train),
                sv.score(X_train, y_train), ls.score(X_train, y_train),
                br.score(X_train, y_train)]
    })
train_score            

Unnamed: 0,model,score
0,Linear Regression,0.729342
1,Ridge Regression,0.728718
2,Random Forest,0.961247
3,AdaBoost Regressor,0.592583
4,Support Vector Regression,0.001647
5,Lasso regression,0.729339
6,Bayesian Regression,0.728988


In [93]:
lr_pred = lr.predict(X_test)
rg_pred = rg.predict(X_test)
rf_pred = rf.predict(X_test)
ab_pred = ab.predict(X_test)
sv_pred = sv.predict(X_test)
ls_pred = ls.predict(X_test)
br_pred = br.predict(X_test)

In [94]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['Linear Regression', 'Ridge Regression', 'Random Forest', 'AdaBoost Regressor', 'Support Vector Regression',
              'Lasso regression','Bayesian Regression'],
    
    'mae': [mean_absolute_error(y_test, lr_pred), mean_absolute_error(y_test, rg_pred),
           mean_absolute_error(y_test, rf_pred), mean_absolute_error(y_test, ab_pred),
           mean_absolute_error(y_test, sv_pred), mean_absolute_error(y_test, ls_pred),
           mean_absolute_error(y_test, br_pred)],
    
    'mse': [mean_squared_error(y_test, lr_pred), mean_squared_error(y_test, rg_pred),
            mean_squared_error(y_test, rf_pred), mean_squared_error(y_test, ab_pred),
            mean_squared_error(y_test, sv_pred), mean_squared_error(y_test, ls_pred),
            mean_squared_error(y_test, br_pred)],
    
    'rmse': [(np.sqrt(metrics.mean_squared_error(y_test, lr_pred))),(np.sqrt(metrics.mean_squared_error(y_test, rg_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, rf_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ab_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, sv_pred))), (np.sqrt(metrics.mean_squared_error(y_test, ls_pred))),
             (np.sqrt(metrics.mean_squared_error(y_test, br_pred)))]
})
best_model

Unnamed: 0,model,mae,mse,rmse
0,Linear Regression,3277337000000.0,9.623881e+27,98101380000000.0
1,Ridge Regression,3011.715,18140810.0,4259.203
2,Random Forest,2790.605,14425080.0,3798.036
3,AdaBoost Regressor,3818.332,20794460.0,4560.095
4,Support Vector Regression,6005.912,49978040.0,7069.515
5,Lasso regression,3015.269,18455950.0,4296.039
6,Bayesian Regression,3012.065,18209150.0,4267.218


In [95]:
# Tune the model

rf = RandomForestRegressor(random_state=123)
rf_params = dict(
    n_estimators=np.arange(80, 162, 10), max_depth=[10, 12, 14, 16, 18, None]
                )
gs_rf = GridSearchCV(rf, rf_params, n_jobs=-1, scoring="neg_root_mean_squared_error", cv = cv)

# fit the model
gs_rf.fit(X_train, y_train)

gs_rf.best_params_

{'max_depth': 10, 'n_estimators': 160}

In [97]:
rf = RandomForestRegressor(max_depth=10, n_estimators=160, random_state=123)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [98]:
model = pd.DataFrame({
    'model': ['Random Forest'],
    'mae': [mean_absolute_error(y_test, rf_pred)],
    'mse': [mean_squared_error(y_test, rf_pred)],
    'rmse': [np.sqrt(metrics.mean_squared_error(y_test, rf_pred))]
    })
model

Unnamed: 0,model,mae,mse,rmse
0,Random Forest,2741.167733,13629840.0,3691.861621
