## Import the Necessary Libraries :

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [10]:
# Load the datset
car = pd.read_csv('CarPrice_Assignment.csv')
car

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [11]:
car.keys()

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [12]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [13]:
car.shape

(205, 26)

In [14]:
car.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [15]:
def door_map(value):
    return {'two': 2, 'four': 4}.get(value, None)

In [16]:
# Convert of value from (two & four) to (2 & 4)
car['doornumber'] = car['doornumber'].apply(door_map)

In [17]:
car.doornumber

0      2
1      2
2      2
3      4
4      4
      ..
200    4
201    4
202    4
203    4
204    4
Name: doornumber, Length: 205, dtype: int64

In [18]:
def cylinder_map(value):
    return {'two': 2, 'four': 4, 'five': 5, 'six': 6, 'eight': 8}.get(value, None)

In [19]:
# Convert of value from (two,four,five,six & eight) to (2,4,5,6 & 8)
car['cylindernumber'] = car['cylindernumber'].apply(cylinder_map)

In [20]:
car.cylindernumber

0      4.0
1      4.0
2      6.0
3      4.0
4      5.0
      ... 
200    4.0
201    4.0
202    6.0
203    6.0
204    4.0
Name: cylindernumber, Length: 205, dtype: float64

In [21]:
d = ['car_ID', 'CarName', 'fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'fuelsystem']
result = car.pop('price')
data = car.drop(columns=d)

In [22]:
# Split the dataset into training and testing data
training_data, testing_data, training_result, testing_result = train_test_split(data, result, test_size=0.11, random_state=100)

In [23]:
# To deal with missing values
imputer = SimpleImputer(strategy='mean')  # can also use 'median' or 'most_frequent'
training_data = imputer.fit_transform(training_data)   # computes the mean of column and assign it to missing value
testing_data = imputer.transform(testing_data)   # fit is not used to prevent data leakage

In [24]:
# Fit on the training data and transform both training and testing data
scaler = StandardScaler()
training_data = scaler.fit_transform(training_data)
testing_data = scaler.transform(testing_data)

In [25]:
training_data

array([[ 0.14043837, -1.12915898, -0.85002626, ...,  0.81605662,
         1.73975421,  1.4455764 ],
       [ 0.96494749, -1.12915898, -0.0446292 , ..., -0.66810631,
        -0.22504696, -0.14368442],
       [-0.68407075,  0.88561489,  0.29809295, ...,  0.17998679,
        -1.28301683, -1.29951047],
       ...,
       [-1.50857987,  0.88561489,  1.94315927, ..., -1.62221105,
        -0.52732407, -0.8660757 ],
       [ 0.14043837,  0.88561489, -0.85002626, ...,  0.81605662,
         0.8329229 ,  1.01214163],
       [ 0.14043837,  0.88561489,  1.22344276, ...,  0.81605662,
        -1.28301683, -1.58846698]])

In [26]:
testing_data

array([[-6.84070754e-01,  8.85614886e-01, -5.07304107e-01,
        -6.03991797e-01, -6.67749158e-01, -2.63459372e-01,
        -8.46349644e-01, -3.76897345e-01, -6.75471492e-01,
        -4.46468646e-01, -7.59242958e-01, -3.15425884e-01,
        -8.24201217e-01, -6.68106309e-01,  1.89089276e+00,
         2.31244594e+00],
       [ 9.64947487e-01,  8.85614886e-01, -2.33126386e-01,
        -1.61664570e-01, -1.60604312e-01,  8.38361354e-01,
        -5.01635871e-01, -3.76897345e-01, -4.07318820e-01,
        -4.46468646e-01,  4.33650668e-01, -3.15425884e-01,
        -4.41080270e-01,  2.85998429e-01,  2.28368691e-01,
         4.34228606e-01],
       [ 1.40438367e-01, -1.12915898e+00,  2.39152270e-02,
         3.38001371e-01,  3.00436457e-01,  2.21978534e-02,
        -2.92141312e-01, -3.76897345e-01, -9.04111159e-02,
         3.08359906e-01,  4.01410299e-01, -4.11417040e-01,
        -4.66621666e-01, -6.68106309e-01,  7.72301392e-02,
         1.45272093e-01],
       [ 1.40438367e-01, -1.12915898e

In [27]:
print("Training Data Shape: ", training_data.shape)
print("Training Result Shape: ", training_result.shape)
print("Testing Data Shape: ", testing_data.shape)
print("Testing Result Shape: ", testing_result.shape)

Training Data Shape:  (182, 16)
Training Result Shape:  (182,)
Testing Data Shape:  (23, 16)
Testing Result Shape:  (23,)


## Linear Regression Model

In [29]:
# Initialize the Linear Regression model
linear_model = LinearRegression()

# Train the Linear Regression model on the training data 
linear_model.fit(training_data, training_result)

# Predict the labels for the testing data
linear_pred = linear_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
linear_r2 = r2_score(testing_result, linear_pred)
linear_mse = mean_squared_error(testing_result, linear_pred)

# Print the performance metrics of the Linear Regression model
print("R^2 Score: ", linear_r2)
print("Mean Squared Error: ", linear_mse)

R^2 Score:  0.8922543914580489
Mean Squared Error:  6621330.103166812


In [30]:
testing_result.shape

testing_result_matrix = testing_result.values.reshape(-1, 1)

testing_result_matrix.shape

(23, 1)

In [31]:
comparison_df = pd.DataFrame({'Actual': testing_result_matrix.flatten(), 'Predicted': linear_pred.flatten()})

comparison_df.head(10)

Unnamed: 0,Actual,Predicted
0,7738.0,6010.179108
1,8495.0,10142.265042
2,8845.0,10114.534755
3,9298.0,11952.287775
4,7603.0,9491.793638
5,11245.0,10047.698547
6,18420.0,15389.202657
7,16503.0,18053.456807
8,17669.0,14525.835187
9,17199.0,23586.10653


## K-Nearest Neighbors (KNN) Model

In [33]:
# Initialize the K-Nearest Neighbors (KNN) Regressor with  neighbors
knn_model = KNeighborsRegressor(n_neighbors=3)

# Train the KNN model on the training data and labels
knn_model.fit(training_data, training_result)

# Predict the labels for the testing data
knn_pred = knn_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
knn_r2 = r2_score(testing_result, knn_pred)
knn_mse = mean_squared_error(testing_result, knn_pred)

# Print the performance metrics of the KNN model
print("R^2 Score: ", knn_r2)
print("Mean Squared Error: ", knn_mse)

R^2 Score:  0.9169860420418339
Mean Squared Error:  5101486.977053139


## Decision Tree Model

In [35]:
# Initialize the Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42, max_depth=4, min_samples_leaf=5)

# Train the model
dt_model.fit(training_data, training_result)

# Predict on the testing data
dt_pred = dt_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
dt_r2 = r2_score(testing_result, dt_pred)
dt_mse = mean_squared_error(testing_result, dt_pred)

# Print the performance metrics of the KNN model
print("R^2 Score: ", dt_r2)
print("Mean Squared Error: ", dt_mse)

R^2 Score:  0.8700540790489801
Mean Squared Error:  7985613.982974692


## Adaboost Regressor

In [37]:
# Initialize and train the Adaboost Regressor model
adb_model = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
adb_model.fit(training_data, training_result)

# Make predictions
adb_pred = adb_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
adb_r2 = r2_score(testing_result, adb_pred)
adb_mse = mean_squared_error(testing_result, adb_pred)

# Print the performance metrics of the Rnadom Forest model
print("R^2 Score: ", adb_r2)
print("Mean Squared Error: ", adb_mse)

R^2 Score:  0.9541454429201923
Mean Squared Error:  2817916.7881510104


## Gradient Boosting Regressor

In [39]:
# Initialize and train the Gradient Boosting Regressor model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_model.fit(training_data, training_result)

# Make predictions
gb_pred = gb_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
gb_r2 = r2_score(testing_result, gb_pred)
gb_mse = mean_squared_error(testing_result, gb_pred)

# Print the performance metrics of the Rnadom Forest model
print("R^2 Score: ", gb_r2)
print("Mean Squared Error: ", gb_mse)

R^2 Score:  0.9437870684821542
Mean Squared Error:  3454473.7431358756


## XGBoost Regressor

In [41]:
# Initialize and train the XGBoost Regressor model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(training_data, training_result)

# Make predictions
xgb_pred = xgb_model.predict(testing_data)

# Calculate the R² score & Mean Squared Error (MSE) for the testing dataset
xgb_r2 = r2_score(testing_result, xgb_pred)
xgb_mse = mean_squared_error(testing_result, xgb_pred)

# Print the performance metrics of the Rnadom Forest model
print("R^2 Score: ", xgb_r2)
print("Mean Squared Error: ", xgb_mse)

R^2 Score:  0.9441874441659198
Mean Squared Error:  3429869.310497108
