In [None]:
# Importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


**Step 1: Load and Inspect Data**

In [None]:
# Read the raw car-price CSV into a DataFrame and display it

data = pd.read_csv(filepath_or_buffer="price_prediction.csv")
data

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


**Step 2: Data Cleaning**

In [None]:
# Preview the first five rows before analysing and cleaning the data

data.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [None]:
# Count the missing values in each column
data.isna().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

It can be seen that the features `Engine Fuel Type`, `Engine HP`, `Engine Cylinders`, `Number of Doors` and `Market Category` have null values in their respective columns, with `Market Category` accounting for most of them (3,742 rows).

In [None]:
# Remove every row that has at least one missing value so the models only see complete records.
# NOTE(review): most of the dropped rows are missing only `Market Category`;
# consider filling that column instead of discarding the rows — confirm with the analysis goals.

data = data.dropna(axis="index", how="any")

# Confirm that no missing values remain
data.isna().sum()

Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Market Category      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

In [None]:
# One-hot encode the categorical columns (get_dummies for simplicity).
# drop_first=True omits the first level of each feature.
# NOTE: this cell overwrites `data`; re-running it on the already-encoded frame
# fails because the listed columns no longer exist.

categorical_features = [
    'Make',
    'Model',
    'Engine Fuel Type',
    'Transmission Type',
    'Driven_Wheels',
    'Market Category',
    'Vehicle Size',
    'Vehicle Style',
]
data = pd.get_dummies(data=data, columns=categorical_features, drop_first=True)

In [None]:
# Preview the first five rows of the cleaned, one-hot encoded dataset
data.head(n=5)

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,MSRP,Make_Alfa Romeo,Make_Aston Martin,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,2011,335.0,6.0,2.0,26,19,3916,46135,False,False,...,False,False,True,False,False,False,False,False,False,False
1,2011,300.0,6.0,2.0,28,19,3916,40650,False,False,...,True,False,False,False,False,False,False,False,False,False
2,2011,300.0,6.0,2.0,28,20,3916,36350,False,False,...,False,False,True,False,False,False,False,False,False,False
3,2011,230.0,6.0,2.0,28,18,3916,29450,False,False,...,False,False,True,False,False,False,False,False,False,False
4,2011,230.0,6.0,2.0,28,18,3916,34500,False,False,...,True,False,False,False,False,False,False,False,False,False


**Step 3: Feature Selection**

In [None]:
# Keep the numeric predictors plus every one-hot column created from the
# categorical features; the target column 'MSRP' is deliberately left out.

numeric_features = ['Year', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
    'highway MPG', 'city mpg', 'Popularity']

# get_dummies names its columns "<feature>_<value>", so match on the feature
# name plus the "_" separator. Matching on the bare feature name would also
# select any other column that merely starts with the same text.
dummy_prefixes = tuple(f"{feature}_" for feature in categorical_features)
selected_features = numeric_features + [col for col in data.columns if col.startswith(dummy_prefixes)]

In [None]:
# Target vector (y) is the MSRP column; feature matrix (x) holds the selected predictors

y = data['MSRP']
x = data.loc[:, selected_features]

# Displaying selected features
x.head(n=5)

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,Make_Alfa Romeo,Make_Aston Martin,Make_Audi,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
0,2011,335.0,6.0,2.0,26,19,3916,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,2011,300.0,6.0,2.0,28,19,3916,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,2011,300.0,6.0,2.0,28,20,3916,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,2011,230.0,6.0,2.0,28,18,3916,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,2011,230.0,6.0,2.0,28,18,3916,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [None]:
# now we create a function that will compare performances of each model

def regression_evaluation(y, yhat):
    """Print and return the standard regression error metrics.

    Parameters
    ----------
    y : array-like
        Observed target values (list, NumPy array or pandas Series).
    yhat : array-like
        Predicted values, in the same order and with the same number of
        elements as ``y``.

    Returns
    -------
    dict
        ``{"MSE": ..., "RMSE": ..., "MAE": ..., "R2": ...}`` as Python floats.
        ``R2`` is ``nan`` when ``y`` is constant (total sum of squares is 0).

    Raises
    ------
    ValueError
        If ``y`` and ``yhat`` do not contain the same number of values.
    """
    # Flatten both inputs to float arrays so values are compared by position.
    # Subtracting two pandas Series would align them on their index labels
    # instead, and plain lists / NumPy arrays have no `.abs()` method.
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(yhat, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y and yhat must have the same length, got {actual.size} and {predicted.size}"
        )

    ei = actual - predicted
    ei2 = ei ** 2
    MSE = float(ei2.mean())
    RMSE = MSE ** 0.5
    MAE = float(np.abs(ei).mean())
    SSE = float(ei2.sum())
    SST = float(((actual - actual.mean()) ** 2).sum())
    # R^2 is undefined for a constant target; report nan rather than dividing by zero.
    R2 = 1 - (SSE / SST) if SST != 0 else float("nan")

    print("MSE: ", MSE)
    print("RMSE: ", RMSE)
    print("MAE: ", MAE)
    print("R^2: ", R2)

    # Return the metrics so callers can store or compare them.
    return {"MSE": MSE, "RMSE": RMSE, "MAE": MAE, "R2": R2}

**Step 4: Split Data into Training and Testing Sets**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,Make_Alfa Romeo,Make_Aston Martin,Make_Audi,...,Vehicle Style_Convertible,Vehicle Style_Convertible SUV,Vehicle Style_Coupe,Vehicle Style_Crew Cab Pickup,Vehicle Style_Extended Cab Pickup,Vehicle Style_Passenger Minivan,Vehicle Style_Passenger Van,Vehicle Style_Regular Cab Pickup,Vehicle Style_Sedan,Vehicle Style_Wagon
9339,2015,285.0,6.0,2.0,22,17,549,False,False,False,...,False,False,False,False,False,False,False,True,False,False
11774,2016,304.0,6.0,4.0,28,18,1624,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4430,2014,310.0,8.0,3.0,17,13,1385,False,False,False,...,False,False,False,False,False,False,True,False,False,False
418,2016,300.0,6.0,4.0,30,20,3916,False,False,False,...,False,False,False,False,False,False,False,False,True,False
5294,2016,240.0,6.0,4.0,26,19,617,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7479,2016,280.0,6.0,4.0,28,20,873,False,False,False,...,False,False,False,False,False,False,False,False,True,False
7736,2001,253.0,6.0,2.0,21,15,1013,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1003,1995,181.0,6.0,4.0,23,15,870,False,False,False,...,False,False,False,False,False,False,False,False,True,False
11243,2016,201.0,4.0,3.0,33,25,1439,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Show the training target: the MSRP values that the split assigned to the training set
y_train

9339     30910
11774    45295
4430     34525
418      49950
5294     63600
         ...  
7479     36835
7736     44625
1003      2000
11243    22600
10753    31205
Name: MSRP, Length: 6467, dtype: int64

**Step 5: Apply and Compare Regression Models**

In [None]:
# Initialise and train the models, one per cell

# Model 1: linear regression fitted on the training split
model_1 = LinearRegression()
model_1.fit(X=x_train, y=y_train)

In [None]:
# Model 2: k-nearest-neighbours regressor with scikit-learn's default settings
# NOTE(review): the features are not scaled before fitting, so columns with
# large numeric ranges presumably dominate the distance — confirm whether
# scaling is wanted.

model_2 = KNeighborsRegressor()
model_2.fit(X=x_train, y=y_train)

In [None]:
# Decision Tree
# random_state pins the estimator's internal randomness so that re-running the
# notebook produces the same tree and therefore the same reported metrics.

model_3 = DecisionTreeRegressor(random_state=42)
model_3.fit(x_train, y_train)

In [None]:
# Generate train and test predictions for each fitted model (train first, then test)

y_train_lr, y_test_lr = (model_1.predict(split) for split in (x_train, x_test))
y_train_knn, y_test_knn = (model_2.predict(split) for split in (x_train, x_test))
y_train_dt, y_test_dt = (model_3.predict(split) for split in (x_train, x_test))

In [None]:
# Linear Regression Evaluation: metrics on the training split, then on the test split

for header, actual, predicted in (
    ("Test evaluation linear_regression_Train = ", y_train, y_train_lr),
    ("Test evaluation linear_regression_Test = ", y_test, y_test_lr),
):
    print(header)
    regression_evaluation(actual, predicted)


Test evaluation linear_regression_Train = 
MSE:  49795375.10250303
RMSE:  7056.583812476334
MAE:  3610.8042780092787
R^2:  0.990130421110961
Test evaluation linear_regression_Test = 
MSE:  167684870.08965218
RMSE:  12949.319290590227
MAE:  4626.918518083435
R^2:  0.962909880311584


In [None]:
# KNN Evaluation: metrics on the training split, then on the test split

for header, actual, predicted in (
    ("Test evaluation KNN_Train = ", y_train, y_train_knn),
    ("Test evaluation KNN_Test = ", y_test, y_test_knn),
):
    print(header)
    regression_evaluation(actual, predicted)

Test evaluation KNN_Train = 
MSE:  688210284.5287708
RMSE:  26233.762302208404
MAE:  4866.691046853255
R^2:  0.863594848288164
Test evaluation KNN_Test = 
MSE:  764164769.541274
RMSE:  27643.530337879674
MAE:  6739.109461966605
R^2:  0.8309748354231168


In [None]:
# Decision Tree Evaluation: metrics on the training split, then on the test split

for header, actual, predicted in (
    ("Test evaluation Decision_tree_Train = ", y_train, y_train_dt),
    ("Test evaluation Decision_tree_Test = ", y_test, y_test_dt),
):
    print(header)
    regression_evaluation(actual, predicted)

Test evaluation Decision_tree_Train = 
MSE:  8702296.360049918
RMSE:  2949.9654845523055
MAE:  1401.2605970906575
R^2:  0.998275181173663
Test evaluation Decision_tree_Test = 
MSE:  321942555.3588104
RMSE:  17942.757741183777
MAE:  4076.247354720453
R^2:  0.9287897118883258


**Step 6: Fine-Tuning the Models**

In [None]:
#  Fine-tuning KNN model: 5-fold cross-validated search over the number of neighbours
param_grid_knn = {'n_neighbors': [3, 5, 7, 9]}
grid_knn = GridSearchCV(model_2, param_grid_knn, cv=5, scoring='neg_mean_squared_error')
grid_knn.fit(x_train, y_train)
best_knn = grid_knn.best_estimator_

# Report which setting won. The scorer is the *negative* MSE, so flip the sign
# to show the cross-validated MSE itself.
print("Best KNN parameters: ", grid_knn.best_params_)
print("Best cross-validated MSE: ", -grid_knn.best_score_)

# Predict with the best KNN model and evaluate on the held-out test set.
# regression_evaluation prints the metrics itself, so no separate display
# expression is needed afterwards.
y_pred_best_knn = best_knn.predict(x_test)
print("Test evaluation best_KNN_Test = ")
best_knn_results = regression_evaluation(y_test, y_pred_best_knn)

MSE:  443827461.80890536
RMSE:  21067.21295779072
MAE:  5686.8016903731195
R^2:  0.9018300597382635
