In [1]:
import pandas as pd

from IPython.display import display
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from keras.models import Sequential
from keras.layers import Dense, Dropout

seed = 42

In [2]:
data = pd.read_csv("Data/vehicles_preprocessed.csv", index_col=0)

display(data.head(15))

Unnamed: 0,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,price
27,2014,11,7315,1,2,2,57923.0,0,2,1,5,8,33590.0
28,2010,6,7437,1,2,2,71229.0,0,2,1,5,1,22590.0
29,2020,6,7449,1,2,2,19160.0,0,2,1,5,6,39590.0
30,2017,34,8609,1,2,2,41124.0,0,2,1,5,6,30990.0
31,2013,10,3695,0,1,2,128000.0,0,0,3,7,0,15000.0
32,2012,11,7362,1,2,2,68696.0,0,2,0,5,0,27990.0
33,2016,6,7455,1,1,2,29499.0,0,2,0,5,7,34590.0
34,2019,34,7994,0,1,2,43000.0,0,0,0,7,5,35000.0
35,2016,6,2346,1,1,2,17302.0,0,2,0,5,6,29990.0
39,2017,6,7482,1,1,2,40784.0,0,2,1,5,8,24590.0


In [3]:
def evaluate_model(y_expected, y_predicted):
    return r2_score(y_expected, y_predicted), mean_absolute_error(y_expected, y_predicted), mean_squared_error(y_expected, y_predicted), sqrt(mean_squared_error(y_expected, y_predicted))

x = data.iloc[:, :-1]
y = data.iloc[:, -1:]

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=seed)

display(pd.DataFrame(x_train[:15]))
display(pd.DataFrame(x_test[:15]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.260803,0.196313,0.500029,0.494224,-1.234329,2.002736,-0.532745,-0.224011,-0.47138,0.599768,0.68498,-1.450103
1,0.857795,-0.670323,-1.497838,2.005666,-1.234329,0.072319,-0.826198,-0.224011,-0.47138,0.599768,-1.444403,1.091317
2,-2.326159,1.737,0.744246,0.494224,-1.234329,0.072319,-0.275689,-0.224011,-0.47138,0.599768,-1.444403,-0.179393
3,1.454786,-0.959202,-0.441844,2.005666,1.360232,0.072319,-1.653198,-0.224011,-0.47138,1.463688,1.394774,1.091317
4,0.260803,0.100021,-0.898326,2.005666,-1.234329,0.072319,-0.69474,-0.224011,-0.47138,-1.128072,-1.444403,0.773639
5,-1.729168,0.100021,1.557924,0.494224,0.062952,0.072319,0.082225,-0.224011,0.925197,-1.128072,-1.444403,0.773639
6,0.260803,-0.188858,0.892983,0.494224,-1.234329,0.072319,-0.448228,-0.224011,2.321773,-1.128072,1.749671,0.138285
7,0.658798,1.737,1.242191,0.494224,0.062952,0.072319,0.041947,-0.224011,-0.47138,-1.128072,1.039877,0.138285
8,1.255789,0.196313,1.159263,-1.017217,-1.234329,0.072319,-0.362433,-0.224011,-0.47138,0.599768,-1.444403,0.455962
9,0.857795,-0.188858,1.126169,-1.017217,-1.234329,0.072319,-1.683767,-0.224011,-0.47138,0.599768,0.68498,-1.450103


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.729168,1.06295,1.67737,0.494224,0.062952,0.072319,1.099642,-0.224011,0.925197,-1.128072,-1.444403,1.091317
1,0.061806,-0.57403,-0.670085,-1.017217,1.360232,0.072319,-0.162896,-0.224011,-0.47138,1.463688,1.039877,1.091317
2,0.658798,1.351828,-1.783138,-1.017217,1.360232,0.072319,-0.402514,-0.224011,-0.47138,-0.264152,0.330083,0.138285
3,-0.336188,-0.766616,-0.000199,-1.017217,0.062952,0.072319,-0.242943,-0.224011,-0.47138,0.599768,-0.024815,-1.132425
4,0.658798,-0.477737,-1.371925,0.494224,0.062952,0.072319,0.526588,-0.224011,-0.47138,0.599768,-1.444403,0.455962
5,0.857795,-0.477737,1.001017,0.494224,1.360232,0.072319,-1.304077,-0.224011,2.321773,-1.128072,0.330083,-1.450103
6,-1.530171,1.737,1.482604,-1.017217,1.360232,0.072319,2.984895,-0.224011,-0.47138,-1.128072,0.330083,0.138285
7,0.857795,-0.477737,0.992267,0.494224,1.360232,0.072319,-1.27955,-0.224011,2.321773,-0.264152,0.330083,1.091317
8,-0.734182,-0.57403,1.225834,-1.017217,1.360232,-3.788517,-0.37843,-0.224011,-0.47138,-1.128072,0.330083,1.091317
9,0.061806,-1.536959,1.357452,0.494224,0.062952,0.072319,-0.161062,-0.224011,-0.47138,0.599768,0.68498,0.138285


In [4]:
model = LinearRegression()
model.fit(x_train, y_train)
y_predicted = model.predict(x_test)

results = [["Linear Regression"] + list(evaluate_model(y_test, y_predicted))]
linear_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(linear_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Linear Regression,0.747472,4463.225674,34527950.0,5876.048453


In [5]:
poly = PolynomialFeatures(degree=3)
x_train_poly = pd.DataFrame(poly.fit_transform(x_train))
x_test_poly = pd.DataFrame(poly.fit_transform(x_test))

model.fit(x_train_poly, y_train)
y_poly_predicted = model.predict(x_test_poly)

results = [["Polynomial Regression"] + list(evaluate_model(y_test, y_poly_predicted))]
polynomial_3_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(polynomial_3_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Polynomial Regression,0.851222,3195.768215,20342350.0,4510.249818


In [6]:
poly = PolynomialFeatures(degree=5)
x_train_poly = pd.DataFrame(poly.fit_transform(x_train))
x_test_poly = pd.DataFrame(poly.fit_transform(x_test))

ridge = Ridge(alpha=1500)
ridge.fit(x_train_poly, y_train)
y_poly_predicted = ridge.predict(x_test_poly)

results = [["Polynomial Regression (l2 regularization)"] + list(evaluate_model(y_test, y_poly_predicted))]
polynomial_5_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(polynomial_5_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Polynomial Regression (l2 regularization),0.880727,2681.562769,16308080.0,4038.326404


In [7]:
forest = RandomForestRegressor(n_estimators=100, random_state=seed)
forest.fit(x_train, y_train.to_numpy().ravel())
y_forest_predicted = forest.predict(x_test)

results = [["Random Forest Regression"] + list(evaluate_model(y_test, y_forest_predicted))]
forest_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(forest_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Random Forest Regression,0.941116,1495.928155,8051206.0,2837.464654


In [8]:
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(x_train, y_train)
y_knn_predicted = knn.predict(x_test)

results = [["K Nearest Neighbors Regression"] + list(evaluate_model(y_test, y_knn_predicted))]
knn_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(knn_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,K Nearest Neighbors Regression,0.900612,2027.600865,13589320.0,3686.369676


In [9]:
nn = Sequential()
nn.add(Dense(512, activation='relu', input_shape=(12,)))
nn.add(Dropout(0.2))
nn.add(Dense(128, activation="relu"))
nn.add(Dropout(0.2))
nn.add(Dense(256, activation="relu"))
nn.add(Dropout(0.2))
nn.add(Dense(64, activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(1, activation="linear"))
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               6656      
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               65664     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 256)               33024     
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 64)                1

In [10]:
nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"])
nn.fit(x_train, y_train, epochs=100, batch_size=64)
y_nn_predicted = nn.predict(x_test)

results = [["Neural Network Regression"] + list(evaluate_model(y_test, y_nn_predicted))]
nn_results = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"])
display(nn_results)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Neural Network Regression,0.894175,2555.494339,14469410.0,3803.867919


In [11]:
all_results = pd.concat([linear_results, polynomial_3_results, polynomial_5_results, forest_results, knn_results, nn_results]).reset_index(drop=True)
display(all_results)

Unnamed: 0,Model,R2 Score,MAE,MSE,RMSE
0,Linear Regression,0.747472,4463.225674,34527950.0,5876.048453
1,Polynomial Regression,0.851222,3195.768215,20342350.0,4510.249818
2,Polynomial Regression (l2 regularization),0.880727,2681.562769,16308080.0,4038.326404
3,Random Forest Regression,0.941116,1495.928155,8051206.0,2837.464654
4,K Nearest Neighbors Regression,0.900612,2027.600865,13589320.0,3686.369676
5,Neural Network Regression,0.894175,2555.494339,14469410.0,3803.867919
