In [2]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

cleaned_data_path = os.path.abspath(os.path.join('..', 'data', 'cleaned', 'car_prices_cleaned.csv'))
car_df = pd.read_csv(cleaned_data_path)
car_df.head(5)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,2014-12-16
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,2014-12-16
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,2015-01-14
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,2015-01-28
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,2014-12-18


In [3]:
# Feature and target selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
#Split the data 80/20 for train and test
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(
    car_df[['odometer', 'condition']],
    car_df['sellingprice'],
    test_size=0.2
)

#Split the train data 80/20 for tain and valid
x_train_df, x_valid_df, y_train_df, y_valid_df = train_test_split(
    x_train_df,
    y_train_df,
    test_size=0.2
)

print("Train: ", x_train_df.shape, y_train_df.shape)
print("Valid: ", x_valid_df.shape, y_valid_df.shape)
print("Test: ", x_test_df.shape, y_test_df.shape)

# Convert the dataframes to numpy array
x_train, y_train = x_train_df.values, y_train_df.values
x_valid, y_valid = x_valid_df.values, y_valid_df.values
x_test, y_test = x_test_df.values, y_test_df.values

Train:  (302294, 2) (302294,)
Valid:  (75574, 2) (75574,)
Test:  (94468, 2) (94468,)


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# First 3 experiments
def regression_analysis(model, model_name):
    model.fit(x_train, y_train)

# Predict on validation and testing set
    y_valid_prediction = model.predict(x_valid)
    y_test_prediction = model.predict(x_test)

    # Check the performance of the model
    valid_mse = root_mean_squared_error(y_valid, y_valid_prediction)
    test_mse = root_mean_squared_error(y_test,y_test_prediction)
    valid_r2 = r2_score(y_valid, y_valid_prediction)
    test_r2 = r2_score(y_test,y_test_prediction)

    print(model_name + " Performance:")
    print(f"Validation RMSE: {valid_mse}, Validation R²: {valid_r2}")
    print(f"Test RMSE: {test_mse}, Test R²: {test_r2}")


# Intialize the model
for i in range(3):
    if i == 0:
        print("\nExperiment 1: Linear Regression:\n")
        model = LinearRegression()
        model_name = "Linear Regression"
        regression_analysis(model,model_name)
    elif i == 1:
        print("\nExperiment 2: Random Forest Regressor:\n")
        model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
        model_name = "Random Forest Regressor"
        regression_analysis(model,model_name)
    else:
        print("\n Experiment 3: Decision Tree Regressor:\n")
        model = DecisionTreeRegressor(max_depth=7, random_state=42)
        model_name= "Decision Tree Regressor"
        regression_analysis(model,model_name)





Experiment 1: Linear Regression:

Linear Regression Performance:
Validation RMSE: 7385.217326394457, Validation R²: 0.4057324918418722
Test RMSE: 7338.631017020563, Test R²: 0.4092453745172516

Experiment 2: Random Forest Regressor:

Random Forest Regressor Performance:
Validation RMSE: 7168.964695419425, Validation R²: 0.44002542459522176
Test RMSE: 7132.715043442208, Test R²: 0.4419324445279105

 Experiment 3: Decision Tree Regressor:

Decision Tree Regressor Performance:
Validation RMSE: 7181.627675852437, Validation R²: 0.4380454428323166
Test RMSE: 7149.294021878799, Test R²: 0.4393351326741032
