In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Read the feature table written by the previous (data preparation) stage.
df_model = pd.read_pickle('data/02_data_prepared.pkl')

# --- 1. Separate the target column (y) from the predictors (X) ---
y = df_model['price']
X = df_model.drop(columns='price')

# --- 2. Hold out 20% of the rows as a test set (seeded for repeatability) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}\n")

# --- 3. Train and evaluate models ---


def _fit_and_report(header, estimator):
    """Fit *estimator* on the training split and report hold-out metrics.

    Prints *header*, fits on ``X_train``/``y_train``, predicts ``X_test`` and
    prints the MAE and R² computed against ``y_test``.

    Returns:
        A ``(predictions, mae, r2)`` tuple for the test split.
    """
    print(header)
    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)
    mae = mean_absolute_error(y_test, test_predictions)
    r2 = r2_score(y_test, test_predictions)
    print(f"Mean Absolute Error (MAE): ${mae:,.0f}")
    print(f"R-squared (R²): {r2:.2f}\n")
    return test_predictions, mae, r2


# --- Model 1: Linear Regression (Baseline) ---
model_lr = LinearRegression()
predictions_lr, mae_lr, r2_lr = _fit_and_report(
    "--- Training Linear Regression ---", model_lr
)

# --- Model 2: Random Forest ---
model_rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
predictions_rf, mae_rf, r2_rf = _fit_and_report(
    "--- Training Random Forest ---", model_rf
)

# --- Model 3: Gradient Boosting ---
model_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
predictions_gb, mae_gb, r2_gb = _fit_and_report(
    "--- Training Gradient Boosting ---", model_gb
)

# --- 4. Cross-Validation (Fairer evaluation) ---
# Each model is scored with 5-fold CV on the training split only. The scorer
# returns negated MAE, so the fold average is negated back before printing.
print("--- Model Evaluation with Cross-Validation (average MAE) ---")
_cv_kwargs = {'cv': 5, 'scoring': 'neg_mean_absolute_error'}

_fold_scores_lr = cross_val_score(model_lr, X_train, y_train, **_cv_kwargs)
score_lr = -_fold_scores_lr.mean()
print(f"Linear Regression CV MAE: ${score_lr:,.0f}")

_fold_scores_rf = cross_val_score(model_rf, X_train, y_train, **_cv_kwargs)
score_rf = -_fold_scores_rf.mean()
print(f"Random Forest CV MAE: ${score_rf:,.0f}")

_fold_scores_gb = cross_val_score(model_gb, X_train, y_train, **_cv_kwargs)
score_gb = -_fold_scores_gb.mean()
print(f"Gradient Boosting CV MAE: ${score_gb:,.0f}\n")

# --- 5. Feature Importance Analysis ---
# Pair every predictor column with the importance reported by the fitted
# Random Forest, then keep the 15 highest-scoring rows for plotting.
importances = model_rf.feature_importances_
feature_importances = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': importances,
    }
)
_ranked_features = feature_importances.sort_values(by='importance', ascending=False)
top_features = _ranked_features.head(15)

# Horizontal bar chart: one bar per feature, bar length = importance.
plt.figure(figsize=(10, 8))
sns.barplot(data=top_features, y='feature', x='importance')
plt.title('Top 15 Most Important Features (Random Forest)')
plt.show()

In [None]:
import joblib
import os

# Persist model_rf, the fitted Random Forest from the evaluation steps.
# The file name 'random_forest_v1.joblib' is an example version label.
model_path = 'models/random_forest_v1.joblib'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; exist_ok=True makes re-running this cell safe.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_rf, model_path)

print("Model successfully saved to the /models folder")