Step 1: Data Preparation and Feature Engineering

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder

# Loading the cleaned dataset
df = pd.read_csv('car_prices_cleaned.csv')

# Defining features and target variable.
# 'vin' and 'saledate' are dropped together with the target, so they are never used as features.
X = df.drop(columns=['sellingprice', 'vin', 'saledate'])
y = df['sellingprice']

# Identifying categorical columns.
# 'string' and 'category' are listed next to 'object' so that text columns are still
# selected if pandas loads them with a dedicated string dtype instead of object
# (as newer pandas versions may do). With all-object text columns the selection
# is the same as include=['object'].
categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns

# Applying one-hot encoding to categorical features.
# NOTE(review): the encoder is fitted on every row before the train/test split, so its
# category vocabulary also covers rows that end up in the test set. Consider fitting it
# on the training rows only (e.g. inside a Pipeline/ColumnTransformer).
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # Unknown categories encode as all zeros
X_encoded = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    # Reuse X's row labels: join() below aligns on the index, and a fresh 0..n-1
    # index would only line up with X while X itself has the default RangeIndex.
    index=X.index,
)

# Combining the untouched (non-categorical) columns with the encoded ones
X = X.drop(columns=categorical_cols).join(X_encoded)

# Splitting the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Step 2: Training the Model

In [6]:
# Hyperparameters for the forest: 100 trees, with a fixed seed so repeated runs match
rf_params = {'n_estimators': 100, 'random_state': 42}

# Building the regressor from those settings
rf_model = RandomForestRegressor(**rf_params)

# Fitting on the training split only; the test split stays unseen by the model
rf_model.fit(X_train, y_train)


Step 3: Evaluating the Model

In [8]:
# Generating predictions for the held-out test features
y_pred = rf_model.predict(X_test)

# Scoring the predictions against the true test targets:
# first the Mean Squared Error, then the R-squared coefficient
mse, r2 = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

# Reporting both metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2) Score: {r2}')


Mean Squared Error (MSE): 3305479.344459166
R-squared (R2) Score: 0.9569964196101743


In [9]:
# Collecting the true and predicted test values side by side
# (row labels come from y_test)
pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})

# Adding the per-row error: actual minus predicted
pred_df['Difference'] = pred_df['Actual Value'] - pred_df['Predicted Value']

# Showing the comparison table
print(pred_df)

      Actual Value  Predicted Value  Difference
4362       14300.0          15778.5     -1478.5
1805       24250.0          23568.0       682.0
5458       14500.0          13739.5       760.5
6797        8200.0           9793.0     -1593.0
9541       10400.0          10812.5      -412.5
...            ...              ...         ...
3282       13250.0          12054.5      1195.5
6109       28600.0          27959.4       640.6
3265        5800.0           4411.0      1389.0
3464       19000.0          20072.5     -1072.5
3528        3100.0           2262.0       838.0

[1942 rows x 3 columns]


Checking the Model Performance

In [10]:
def _mse_and_r2(y_true, y_hat):
    """Return the (MSE, R2) pair for one set of true targets and predictions."""
    return mean_squared_error(y_true, y_hat), r2_score(y_true, y_hat)


# Scoring the model on the same rows it was fitted on
y_train_pred = rf_model.predict(X_train)
train_mse, train_r2 = _mse_and_r2(y_train, y_train_pred)

# Scoring the test-set predictions (y_pred comes from the evaluation cell above)
test_mse, test_r2 = _mse_and_r2(y_test, y_pred)

# Reporting training metrics first, then testing metrics
for report_line in (
    f'Training Mean Squared Error (MSE): {train_mse}',
    f'Training R-squared (R2) Score: {train_r2}',
    f'Testing Mean Squared Error (MSE): {test_mse}',
    f'Testing R-squared (R2) Score: {test_r2}',
):
    print(report_line)

Training Mean Squared Error (MSE): 371479.6875209299
Training R-squared (R2) Score: 0.9951548841773439
Testing Mean Squared Error (MSE): 3305479.344459166
Testing R-squared (R2) Score: 0.9569964196101743


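On the held-out test set the model reaches R² ≈ 0.957, i.e. it explains roughly 96% of the variance in selling price (R² is a goodness-of-fit measure for regression, not a classification accuracy). The test RMSE is about 1,818 (√3,305,479) in the units of `sellingprice`. The training scores are noticeably better (R² ≈ 0.995, RMSE ≈ 609), which indicates some overfitting to the training data.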