In [1]:
# Importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings

In [3]:
# Reading the dataset
df = pd.read_csv("car_output.csv")

In [4]:
# Handling missing values if any
df = df.dropna()
df

Unnamed: 0,Car Count,Vehicle ID,Name,Model,Year,Stock Number,Mileage,Fuel Type,MPG City,MPG Highway,Vehicle Type,Price
0,1,3N1CN8EV5ML819115,Nissan,Versa,2021,7VCF97,62756,Gasoline,32,40,Sedan,17499
1,2,3N1CP5CUXKL552490,Nissan,Kicks,2019,7S3KF0,64076,Gasoline,31,36,Crossover,18499
2,3,3KPF24AD2ME267604,Kia,Forte,2021,7T0PL0,65183,Gasoline,29,40,Sedan,18999
3,4,5YFEPMAE1MP203462,Toyota,Corolla,2021,8CHTD5,70576,Gasoline,30,38,Sedan,19799
4,5,KNDPMCAC9M7863633,Kia,Sportage,2021,7T7LRJ,66055,Gasoline,21,25,Crossover,19999
...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2396,1C4RJFAG3MC527425,Jeep,Grand Cherokee,2021,8DYXTS,20860,Gasoline,18,25,Crossover,32499
2396,2397,1GCGTCE37G1331477,Chevrolet,Colorado,2016,8F396V,13948,Gasoline,17,24,Truck,33999
2397,2398,JTEAAAAH7NJ092488,Toyota,Venza,2022,8DF1JM,27175,Hybrid,40,37,Sport Utility,33999
2398,2399,1GNERJKX8KJ170798,Chevrolet,Traverse,2019,8FXL3S,27680,Gasoline,20,26,Sport Utility,33999


In [5]:
# Dropping unnecessary columns
df = df.drop(['Car Count', 'Vehicle ID', 'Stock Number', 'Fuel Type', 'MPG City', 'MPG Highway', 'Vehicle Type'], axis = 1)

In [6]:
# Convert 'Price' column to numerical values
df['Price'] = df['Price'].str.replace(',', '').astype(float)

In [7]:
# Convert categorical features into numerical representations
label_encoder = LabelEncoder()
df['Name'] = label_encoder.fit_transform(df['Name'])
df['Model'] = label_encoder.fit_transform(df['Model'])

In [8]:
# Split the data into features (X) and target (y)
X = df.drop('Price', axis=1)
y = df['Price']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Function to evaluate the performance of a model and print the metrics
def evaluate_model(model_name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{model_name} Model Evaluation:')
    print(f'Mean Squared Error: {mse:.2f}')
    print(f'Mean Absolute Error: {mae:.2f}')
    print(f'R-squared: {r2:.2f}\n')

In [11]:
# Initialize and train different machine learning models
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)

k_neighbors = KNeighborsRegressor()
k_neighbors.fit(X_train, y_train)

# Suppress warnings during SVR model fitting
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    svr = SVR()
    svr.fit(X_train, y_train)

In [12]:
# Evaluate the performance of each model
evaluate_model('Linear Regression', linear_regression, X_test, y_test)
evaluate_model('Random Forest', random_forest, X_test, y_test)
evaluate_model('K-Neighbors', k_neighbors, X_test, y_test)
evaluate_model('Support Vector Regression', svr, X_test, y_test)

Linear Regression Model Evaluation:
Mean Squared Error: 40513203.66
Mean Absolute Error: 4660.14
R-squared: 0.11

Random Forest Model Evaluation:
Mean Squared Error: 9350622.81
Mean Absolute Error: 1818.47
R-squared: 0.79

K-Neighbors Model Evaluation:
Mean Squared Error: 50770571.45
Mean Absolute Error: 5060.15
R-squared: -0.11

Support Vector Regression Model Evaluation:
Mean Squared Error: 47116090.99
Mean Absolute Error: 4692.73
R-squared: -0.03

