# Phase 4: Improved Modeling – Craigslist Cars & Trucks Dataset


This notebook builds and evaluates predictive models for car price estimation using cleaned vehicle data.
We use features such as odometer, car age, and categorical attributes like fuel type, transmission, etc.
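Four models are trained and compared:

- Linear Regression
- Random Forest Regressor
- Gradient Boosting Regressor (scikit-learn `GradientBoostingRegressor`, tuned with a grid search)
- Feed-forward neural network (Keras)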

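The target variable (`price`) is log-transformed with `log1p` to reduce skew; evaluation metrics are computed after converting predictions back to the original price scale.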


## 1. Load Data

In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Read the cleaned vehicle listings CSV into a DataFrame
DATA_PATH = "../../downloads/vehicles_cleaned.csv"
df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check on the load
df.head()


Unnamed: 0,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,...,VIN,drive,type,paint_color,state,posting_date,car_age,odometer_scaled,car_age_scaled,price_category
0,auburn,https://auburn.craigslist.org,33590,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,...,3GTP1VEC4EG551563,,pickup,white,al,2021-05-04T12:31:18-0500,11.0,0.193075,0.065574,expensive
1,auburn,https://auburn.craigslist.org,22590,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,...,1GCSCSE06AZ123805,,pickup,blue,al,2021-05-04T12:31:08-0500,15.0,0.237428,0.098361,expensive
2,auburn,https://auburn.craigslist.org,39590,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,...,3GCPWCED5LG130317,,pickup,red,al,2021-05-04T12:31:25-0500,5.0,0.063864,0.016393,expensive
3,auburn,https://auburn.craigslist.org,30990,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,...,5TFRM5F17HX120972,,pickup,red,al,2021-05-04T10:41:31-0500,8.0,0.137078,0.040984,expensive
4,auburn,https://auburn.craigslist.org,15000,2013.0,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,...,,rwd,truck,black,al,2021-05-03T14:02:03-0500,12.0,0.426666,0.07377,cheap


## 2. Feature Selection and Preprocessing

In [3]:

# Candidate predictors and the column to be predicted
target = 'price'
features = [
    'year', 'odometer', 'car_age', 'condition', 'manufacturer',
    'fuel', 'type', 'transmission', 'drive', 'paint_color'
]

# Keep only the candidates that are actually present in the loaded frame
available_features = [name for name in features if name in df.columns]

# Restrict to predictors + target, drop rows with any missing value, then
# one-hot encode the object/category columns (first level of each is dropped)
df_model = (
    df[available_features + [target]]
    .dropna()
    .copy()
    .pipe(pd.get_dummies, drop_first=True)
)

# Separate the target from the design matrix; the models are fitted on log1p(price)
y = df_model[target]
X = df_model.drop(columns=[target])
y_log = np.log1p(y)

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)


## 3. Scale Features

In [4]:

# Fit the min-max scaler on the training split only and reuse the fitted
# transform for the test split, so no test-set statistics leak into training.
scaler = MinMaxScaler()

# Build fresh DataFrames (same index and column labels) instead of assigning
# into the frames returned by train_test_split: writing into those slices
# mutates them in place and can raise pandas' SettingWithCopyWarning.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)


## 4. Evaluation Function

In [5]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(y_true_log, y_pred_log, name):
    """Print MAE, RMSE and R2 for one model, measured on the original scale.

    Parameters
    ----------
    y_true_log : array-like
        Observed targets on the log1p scale.
    y_pred_log : array-like
        Predicted targets on the log1p scale.
    name : str
        Label shown in the report header.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Invert the log1p transform so the errors are in the target's own units
    actual = np.expm1(y_true_log)
    predicted = np.expm1(y_pred_log)

    print(f"Model: {name}")
    report = (
        ("MAE:", mean_absolute_error(actual, predicted)),
        ("RMSE:", np.sqrt(mean_squared_error(actual, predicted))),
        ("R2:", r2_score(actual, predicted)),
    )
    for label, value in report:
        print(label, value)
    print("-" * 30)


## 5. Linear Regression

In [6]:

from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline fitted on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr = LinearRegression().fit(X_train, y_train)

# Predict log-scale targets for the held-out rows and print the metric report
y_pred_lr = lr.predict(X_test)
evaluate_model(y_test, y_pred_lr, "Linear Regression")


Model: Linear Regression
MAE: 5057.708188818847
RMSE: 7798.591638465047
R2: 0.6958482418858496
------------------------------


## 6. Random Forest Regressor

In [7]:

from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed seed. n_jobs=-1 builds the trees on all
# available cores (as the grid search in this notebook already does); the seed
# is unchanged, so the fitted forest is the same as in a single-threaded run.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Predict log-scale targets for the held-out rows and print the metric report
y_pred_rf = rf.predict(X_test)
evaluate_model(y_test, y_pred_rf, "Random Forest")


Model: Random Forest
MAE: 2103.237028858954
RMSE: 4474.194006165087
R2: 0.8998876640465445
------------------------------


## 7. Gradient Boosting Regressor

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# 1. Param grid (trimmed for speed + valid values only)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']  # 'auto' is invalid for GBRT
}

# 2. Model and grid search (5-fold CV, scored by MSE on the log-scale target)
gb_reg = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=gb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# 3. Best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# 4. Train best model with early stopping
# NOTE(review): the grid search above tuned these parameters WITHOUT early
# stopping. This refit holds out 10% of the training rows and stops once the
# validation score has not improved by `tol` for 5 iterations, so it may use
# fewer trees than the tuned `n_estimators`. Worth comparing against
# `grid_search.best_estimator_` before drawing conclusions.
gb_model = GradientBoostingRegressor(
    **best_params,
    random_state=42,
    validation_fraction=0.1,
    n_iter_no_change=5,
    tol=1e-4
)
gb_model.fit(X_train, y_train)

# 5. Predict and evaluate
gb_predictions = gb_model.predict(X_test)

# Log-scale diagnostics (computed on the log1p targets, not on prices)
gb_mse = mean_squared_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

# Report on the original scale through the shared helper so the output has the
# same format and labels as the Linear Regression and Random Forest sections.
evaluate_model(y_test, gb_predictions, "Gradient Boosting")


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 3910.2636665085734
RMSE: 6344.5398145901345
R²: 0.7986933017017985


In [11]:
# Additional import for deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Cell-local imports: `Input` declares the feature dimension explicitly, and
# `set_random_seed` seeds the Python, NumPy and TensorFlow random generators.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Use the same seed as the other models so weight initialisation and batch
# shuffling are repeatable across runs.
set_random_seed(42)

# Build the deep learning model: three ReLU hidden layers and one linear output.
# An explicit Input layer replaces the deprecated `input_dim` argument on the
# first Dense layer.
dl_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

# Compile the model (MSE loss on the log-scale target)
dl_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the model; 20% of the training rows are held out for validation loss
history = dl_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Predictions (flattened from shape (n, 1) to a 1-D array)
y_pred_dl = dl_model.predict(X_test).flatten()

# Convert log-scale targets and predictions back to the original scale
y_test_exp = np.expm1(y_test)
y_pred_dl_exp = np.expm1(y_pred_dl)

print("Deep Learning Model Evaluation:")
print("MAE:", mean_absolute_error(y_test_exp, y_pred_dl_exp))
print("RMSE:", np.sqrt(mean_squared_error(y_test_exp, y_pred_dl_exp)))
print("R2:", r2_score(y_test_exp, y_pred_dl_exp))




Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 809us/step - loss: 0.6958 - val_loss: 0.1589
Epoch 2/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 817us/step - loss: 0.1580 - val_loss: 0.1605
Epoch 3/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 797us/step - loss: 0.1504 - val_loss: 0.1457
Epoch 4/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 784us/step - loss: 0.1437 - val_loss: 0.1508
Epoch 5/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 775us/step - loss: 0.1408 - val_loss: 0.1390
Epoch 6/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 788us/step - loss: 0.1371 - val_loss: 0.1384
Epoch 7/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 794us/step - loss: 0.1356 - val_loss: 0.1341
Epoch 8/50
[1m3828/3828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 774us/step - loss: 0.1322 - val_loss: 0.1362
Epoch 9/50
[1m3828