# Phase 4: Modeling – Craigslist Cars & Trucks Dataset

## 1. Load and Prepare Data

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned listings (edit the path if your copy lives elsewhere)
df = pd.read_csv("vehicles_cleaned.csv")

# The same two scaled predictors feed both tasks; only the target differs:
# numeric `price` for regression, `price_category` for classification.
features = ['odometer_scaled', 'car_age_scaled']
X, y, y_class = df[features], df['price'], df['price_category']

# One shared set of split options (20% held out, fixed seed) for both tasks
split_options = dict(test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y, **split_options
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_class, **split_options
)


## 2. Regression Modeling (Predict Price)

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline: linear regression fitted on the training split, scored on the test split
lr = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_lr = lr.predict(X_test_reg)

# Ensemble: 100-tree random forest with a fixed seed
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_reg, y_train_reg
)
y_pred_rf = rf.predict(X_test_reg)

# Evaluation
def evaluate_model(y_true, y_pred, name):
    """Print MAE, RMSE and R2 for one set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, in the same order as ``y_true``.
        name: Label printed in the report header.

    Returns:
        None. The report is written to stdout, ending with a 30-dash rule.
    """
    # RMSE is taken as sqrt(MSE) instead of passing ``squared=False``: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
    # raises TypeError. For a single target column the two forms are equal.
    rmse = mean_squared_error(y_true, y_pred) ** 0.5

    print(f"Model: {name}")
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("RMSE:", rmse)
    print("R2:", r2_score(y_true, y_pred))
    print("-" * 30)

# Report held-out metrics for each fitted regressor, in training order
for model_label, test_predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(y_test_reg, test_predictions, model_label)


## 3. Classification Modeling (Cheap vs Expensive)

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline classifier: logistic regression with library defaults
clf_lr = LogisticRegression().fit(X_train_clf, y_train_clf)
y_pred_clf_lr = clf_lr.predict(X_test_clf)

# Ensemble classifier: 100-tree random forest with a fixed seed
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_clf, y_train_clf
)
y_pred_clf_rf = clf_rf.predict(X_test_clf)

# Evaluation: for each classifier print a header, the per-class report,
# then the confusion matrix, all against the same held-out labels.
for report_header, predicted_labels in (
    ("Logistic Regression Report:", y_pred_clf_lr),
    ("\nRandom Forest Classifier Report:", y_pred_clf_rf),
):
    print(report_header)
    print(classification_report(y_test_clf, predicted_labels))
    print(confusion_matrix(y_test_clf, predicted_labels))


## 4. Visualize Regression Results

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest predictions plotted against the true prices of the test split
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test_reg, y=y_pred_rf, ax=ax)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Random Forest: Actual vs Predicted Price")

# Dashed red diagonal from (min, min) to (max, max) of the actual prices;
# points lying on it are predictions that match the actual value.
price_lo, price_hi = y_test_reg.min(), y_test_reg.max()
ax.plot([price_lo, price_hi], [price_lo, price_hi], '--r')
plt.show()


## 5. Hyperparameter Tuning Example

In [None]:

from sklearn.model_selection import GridSearchCV

# Search space for the Random Forest regressor: two tree counts x three depth limits
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
)

# 3-fold cross-validated search scored by negated mean squared error
grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_rf.fit(X_train_reg, y_train_reg)

print("Best Parameters:", grid_rf.best_params_)

# Score the winning configuration on the held-out test split
best_rf = grid_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_reg)
evaluate_model(y_test_reg, y_pred_best_rf, "Tuned Random Forest")
