In [39]:
# import the relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import ShuffleSplit

In [31]:
# Load the car listings from a CSV located relative to the notebook's working
# directory, then preview the first rows to check the columns that were read.
data = pd.read_csv('cleaned_car_data.csv')
data.head()

Unnamed: 0,name,make,year,age,mileage,transmission,colour,price
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,2004 HONDA CIVIC LX,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,1993 FORD MUSTANG LX,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,1998 HONDA CR-V LX,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [32]:
# Drop the free-text `name` column because it is not used in our model building.
# `errors='ignore'` keeps this cell idempotent: `data` is rebound in place, so
# re-running the cell after `name` is already gone would otherwise raise KeyError.
data = data.drop(columns='name', errors='ignore')

In [33]:
# Preview the frame again to confirm `name` is no longer among the columns.
data.head()

Unnamed: 0,make,year,age,mileage,transmission,colour,price
0,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [34]:
# Get dummy data: one-hot encode the non-numeric columns into indicator columns
# (the output below shows the resulting `make_*` and `colour_*` columns).
# NOTE(review): every category level keeps its own indicator column, and `age`
# appears to be an exact function of `year` in the rows shown, so the feature
# matrix is likely perfectly collinear. Consider `drop_first=True` and keeping
# only one of `year`/`age` — TODO confirm whether this is what destabilises the
# plain LinearRegression fit.
data = pd.get_dummies(data)
data

Unnamed: 0,year,age,mileage,price,make_ACURA,make_AUDI,make_BMW,make_BUICK,make_CADILLAC,make_CHEVROLET,...,colour_MAROON,colour_ORANGE,colour_PURPLE,colour_RED,colour_SILVER,colour_TAN,colour_TEAL,colour_TWO TONE,colour_WHITE,colour_YELLOW
0,2000,21,1.0,357,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004,17,134095.0,850,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1993,28,99086.0,975,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1998,23,194018.0,925,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1999,22,264054.0,750,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,2006,15,182744.0,450,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
905,2006,15,239054.0,950,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
906,1997,24,220043.0,800,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
907,2002,19,0.0,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Split the frame into the target (`price`) and the predictors
# (every remaining column).
y = data['price']
X = data.drop(columns='price')

In [37]:
# Feature scaling: standardise every feature column with StandardScaler.
# `fit_transform` returns a NumPy array, so from here on `X` is no longer a
# DataFrame (see the array output below).
# NOTE(review): the scaler is fitted on all rows *before* the train/test split
# performed in a later cell, so test-set statistics leak into the training
# features. Consider fitting the scaler on the training rows only.
X = StandardScaler().fit_transform(X)
X

array([[-0.76791898,  0.76791898, -2.05885303, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [ 0.03052352, -0.03052352, -0.33473171, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [-2.16519335,  2.16519335, -0.78486194, ..., -0.03318617,
         2.43699226, -0.06648225],
       ...,
       [-1.36675085,  1.36675085,  0.77034965, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [-0.36869773,  0.36869773, -2.05886589, ..., -0.03318617,
        -0.41034189, -0.06648225],
       [ 0.82896601, -0.82896601,  0.49587937, ..., -0.03318617,
        -0.41034189, -0.06648225]])

In [40]:
# Train/test split: hold out 30% of the rows for testing, with a fixed seed so
# the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Shuffle-split cross-validator (5 splits, 20% held out per split, fixed seed).
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Sanity-check the shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((636, 58), (273, 58), (636,), (273,))

In [45]:
# Candidate regressors to compare, keyed by the label used in the results
# table. Ridge, the random forest and AdaBoost are given a fixed random_state.
models = dict(
    [
        ("linear_model", LinearRegression()),
        ("ridge_model", Ridge(random_state=123)),
        ("rf_model", RandomForestRegressor(random_state=123)),
        ("ada_boost", AdaBoostRegressor(random_state=123)),
        ("svr", SVR()),
    ]
)

In [43]:

def train_model(models: dict) -> pd.DataFrame:
    """Fit each estimator and rank them by error on the held-out split.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` variables; they must exist before it is called.

    Args:
        models: Mapping of a display label to an estimator exposing ``fit``
            and ``predict``. Each estimator is fitted in place.

    Returns:
        A DataFrame with one row per estimator and the columns ``Name``,
        ``R_squared``, ``Mean_absolute_error`` and ``Root_mean_sqd_error``,
        sorted by ``Root_mean_sqd_error`` in ascending order (lowest error
        first). The index keeps each estimator's position in ``models``.
    """
    column_names = (
        "Name",
        "R_squared",
        "Mean_absolute_error",
        "Root_mean_sqd_error",
    )

    rows = []
    for label, regressor in models.items():
        # Train on the training split, then score on the held-out split.
        regressor.fit(X_train, y_train)
        predictions = regressor.predict(X_test)

        rows.append(
            (
                label,
                metrics.r2_score(y_test, predictions),
                metrics.mean_absolute_error(y_test, predictions),
                # RMSE is the square root of the mean squared error.
                np.sqrt(metrics.mean_squared_error(y_test, predictions)),
            )
        )

    # Pivot the per-model rows into one list per column so the frame always has
    # the four columns, even when `models` is empty.
    table = {
        column: [row[position] for row in rows]
        for position, column in enumerate(column_names)
    }
    return pd.DataFrame(table).sort_values("Root_mean_sqd_error")

In [44]:
# Fit and score every candidate model; the best (lowest RMSE) row comes first.
# NOTE(review): the saved output below lists `lin_model`, while the `models`
# dict uses the key `linear_model` and carries a later execution count
# (In [45] vs In [44] here), so this table predates the current dict. Restart
# the kernel and run all cells to refresh it.
train_model(models)

Unnamed: 0,Name,R_squared,Mean_absolute_error,Root_mean_sqd_error
4,svr,0.0007807041,135.6833,162.6123
3,ada_boost,-0.04877445,140.5797,166.5958
1,ridge_model,-0.07378931,137.808,168.5708
2,rf_model,-0.5412328,167.7609,201.9561
0,lin_model,-3.025308e+24,17124830000000.0,282948700000000.0
