In [4]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

cleaned_data_path = os.path.abspath(os.path.join('..', 'data', 'cleaned', 'car_prices_cleaned.csv'))
car_df = pd.read_csv(cleaned_data_path)
car_df.head(5)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,2014-12-16
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,2014-12-16
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,2015-01-14
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,2015-01-28
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,2014-12-18


In [13]:
# Feature and target selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
#Split the data 80/20 for train and test
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(
    car_df[['year', 'odometer', 'condition', 'mmr']],
    car_df['sellingprice'],
    test_size=0.2
)

#Split the train data 80/20 for tain and valid
x_train_df, x_valid_df, y_train_df, y_valid_df = train_test_split(
    x_train_df,
    y_train_df,
    test_size=0.2
)

print("Train: ", x_train_df.shape, y_train_df.shape)
print("Valid: ", x_valid_df.shape, y_valid_df.shape)
print("Test: ", x_test_df.shape, y_test_df.shape)

# Convert the dataframes to numpy array
x_train, y_train = x_train_df.values, y_train_df.values
x_valid, y_valid = x_valid_df.values, y_valid_df.values
x_test, y_test = x_test_df.values, y_test_df.values

Train:  (302294, 4) (302294,)
Valid:  (75574, 4) (75574,)
Test:  (94468, 4) (94468,)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# First 3 experiments
def regression_analysis(model, model_name):
    model.fit(x_train, y_train)

# Predict on validation and testing set
    y_valid_prediction = model.predict(x_valid)
    y_test_prediction = model.predict(x_test)

    # Check the performance of the model
    valid_mse = root_mean_squared_error(y_valid, y_valid_prediction)
    test_mse = root_mean_squared_error(y_test,y_test_prediction)
    valid_r2 = r2_score(y_valid, y_valid_prediction)
    test_r2 = r2_score(y_test,y_test_prediction)

    print(model_name + " Performance:")
    print(f"Validation RMSE: {valid_mse:.2f}, Validation R²: {valid_r2:.2f}")
    print(f"Test RMSE: {test_mse:.2f}, Test R²: {test_r2:.2f}")



# Intialize the model
for i in range(3):
    if i == 0:
        
        print("\nExperiment 1: Linear Regression:\n")
        model = LinearRegression()
        model_name = "Linear Regression"
        regression_analysis(model,model_name)
    elif i == 1:
        print("\nExperiment 2: Random Forest Regressor:\n")
        model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=50)
        model_name = "Random Forest Regressor"
        regression_analysis(model,model_name)
    else:
        print("\nExperiment 3: Decision Tree Regressor:\n")
        model = DecisionTreeRegressor(max_depth=7, random_state=50)
        model_name= "Decision Tree Regressor"
        regression_analysis(model,model_name)





Experiment 1: Linear Regression:

Linear Regression Performance:
Validation RMSE: 1550.07, Validation R²: 0.97
Test RMSE: 1563.32, Test R²: 0.97

Experiment 2: Random Forest Regressor:

Random Forest Regressor Performance:
Validation RMSE: 1500.37, Validation R²: 0.98
Test RMSE: 1530.58, Test R²: 0.97

Experiment 3: Decision Tree Regressor:

Decision Tree Regressor Performance:
Validation RMSE: 1582.18, Validation R²: 0.97
Test RMSE: 1603.22, Test R²: 0.97


## For each experiment, answer the following:

### Experiment 1: Linear Regression
1. What input data and target (output) data did you use for the prediction task
The input data that was initally used for this was the odometer column and the condition column. However this was later modified to include year and mmr for improved performance.

2. How did your model perform on the train and test set?  
The performance for linear regression was:
* Validation RMSE: 1550.07, Validation R²: 0.97
* Test RMSE: 1563.32, Test R²: 0.97
* This showcases that the accuracy is pretty good.

3. From the low R² score and low RMSE score (low given the size of the dataset), it seems the model is neither overfitting or underfitting.

4. No changes required.

5. What can you potentially do on the data side to increase the performance further (find more data, more of a specific type of data etc.)
* To potentially increase performance it is possible to include data points from another source, that has more infomation like gas milage information, or engine size.

## Experiment 2: Random Forest Regressor
1. What input data and target (output) data did you use for the prediction task
* The input data that was initally used for this was the odometer column and the condition column. However this was later modified to include year and mmr for improved performance.

2. How did your model perform on the train and test set? 
The performance for random forest regressor was:
* Validation RMSE: 1500.37, Validation R²: 0.98
* Test RMSE: 1530.58, Test R²: 0.97
* This showcases that the accuracy is pretty good.

3. From the low R² score and low RMSE score (low given the size of the dataset), it seems the model is neither overfitting or underfitting.

4. No changes required.

5. What can you potentially do on the data side to increase the performance further (find more data, more of a specific type of data etc.)
* To potentially increase performance it is possible to include data points from another source, that has more infomation like gas milage information, or engine size.

## Experiment 3: Decision Tree Regressor
1. What input data and target (output) data did you use for the prediction task.
* The input data that was initally used for this was the odometer column and the condition column. However this was later modified to include year and mmr for improved performance.

2. How did your model perform on the train and test set? 
The performance for decision tree regressor was:
* Validation RMSE: 1582.18, Validation R²: 0.97
* Test RMSE: 1603.22, Test R²: 0.97
* This showcases that the accuracy is pretty good.
3. Did your model overfit or underfit? 

* The accuracy is overall good just like the other models, however the RMSE score is higher in this model showing that it is close to overfitting.

4. If so, try addressing this problem either by modifying the model or modifying the data. This counts as another iteration within the same experiment. What changes did you make and how much did they help?
* Some changes to further improve the possible overfitting issue is adjesting the depth for instance.

5. What can you potentially do on the data side to increase the performance further (find more data, more of a specific type of data etc.)
* To potentially increase performance it is possible to include data points from another source, that has more infomation like gas milage information, or engine size. Possibly exploring feature transformations could also help improve the model.
