In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the remainder of the run. This keeps the
# output clean, but it also hides deprecation and other warnings emitted by
# the libraries imported above.
warnings.filterwarnings('ignore')
def save_results(model_name, rmsle_cv, rmsle_train, mae_train,
                 results_path='model_comparison_results.csv'):
    """Record one model's metrics in the shared comparison CSV.

    The CSV holds one row per model. If a row for ``model_name`` already
    exists it is replaced, and the new row is appended at the end.

    Args:
        model_name: Label stored in the ``Model`` column; also the key used
            to replace an earlier row for the same model.
        rmsle_cv: Value stored in the ``CV_RMSLE_Mean`` column.
        rmsle_train: Value stored in the ``Train_RMSLE`` column.
        mae_train: Value stored in the ``Train_MAE`` column.
        results_path: CSV file to read and rewrite. Defaults to
            ``'model_comparison_results.csv'`` in the working directory.
    """
    columns = ['Model', 'CV_RMSLE_Mean', 'Train_RMSLE', 'Train_MAE']
    try:
        results_df = pd.read_csv(results_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing file and a zero-byte file both mean "no results yet".
        results_df = pd.DataFrame(columns=columns)

    new_result = pd.DataFrame({
        'Model': [model_name],
        'CV_RMSLE_Mean': [rmsle_cv],
        'Train_RMSLE': [rmsle_train],
        'Train_MAE': [mae_train],
    })

    # Drop any earlier row for this model; this is a no-op when none exists.
    kept_rows = results_df[results_df['Model'] != model_name]

    # Concatenating an empty frame is deprecated in recent pandas and can
    # change the resulting dtypes, so skip it when nothing is kept.
    if kept_rows.empty:
        results_df = new_result
    else:
        results_df = pd.concat([kept_rows, new_result], ignore_index=True)

    results_df.to_csv(results_path, index=False)
# Load the preprocessed feature matrices and target produced by the
# preprocessing notebook, plus the Id column of the raw test set, which is
# used later to label the exported predictions.
try:
    X_train = pd.read_csv('X_train_processed.csv')
    X_test = pd.read_csv('X_test_processed.csv')
    y_train = np.load('y_train_processed.npy')
    # Only the Id column is needed, so skip parsing the rest of test.csv.
    X_test_ID = pd.read_csv('test.csv', usecols=['Id'])['Id']
except FileNotFoundError as err:
    print("Error: Processed data files not found. Ensure 'main_data_preprocessing.ipynb' was run first.")
    print(f"Missing file: {err.filename}")
    # Exit with a non-zero status so callers can detect the failure; the
    # bare exit() helper comes from the site module and reports success.
    raise SystemExit(1)
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y``.

    No log transform is applied inside this function, so the result is an
    RMSLE only when the caller passes values that are already on a log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
# ---------------------------------------------------------------------------
# Random Forest pipeline: grid search, evaluation, diagnostic plots, test-set
# predictions and metric logging.
# NOTE(review): predictions are mapped back with expm1 below, so y_train is
# presumably log1p(SalePrice) -- confirm against the preprocessing notebook.
# ---------------------------------------------------------------------------
print("--- Implementing Random Forest Regressor ---")

# A single seeded splitter, so every candidate is scored on identical folds.
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, None],
    'min_samples_split': [5, 10],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=kfolds,
                           n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the full training set; fitting it again would repeat that work.
best_rf = grid_search.best_estimator_
print(f"\nBest Hyperparameters found: {grid_search.best_params_}")
y_train_pred = best_rf.predict(X_train)

# Reuse the per-fold scores the search recorded for the winning candidate.
# The splitter and the forest are both seeded, so re-running cross-validation
# on the same folds would only reproduce these numbers at the cost of five
# extra fits.
cv_scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
    for fold in range(grid_search.n_splits_)
])
# Scores are negated MSE values; flip the sign before taking the root.
cv_rmse_scores = np.sqrt(-cv_scores)

final_cv_rmsle = cv_rmse_scores.mean()
final_train_rmsle = rmsle(y_train, y_train_pred)
final_train_mae = mean_absolute_error(y_train, y_train_pred)

print("\n--- Evaluation on Training Data (Log-Transformed) ---")
print(f"Random Forest CV Mean RMSE: {final_cv_rmsle:.4f} (Std: {cv_rmse_scores.std():.4f})")
print(f"Random Forest Training RMSLE: {final_train_rmsle:.4f}")
print(f"Random Forest Training MAE: {final_train_mae:.4f}")

# Feature importances of the refit model, largest first; plot the top 15.
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 8))
feature_importances.head(15).plot(kind='barh', title='Top 15 Feature Importances (Random Forest)')
plt.show()

# Residuals (actual - predicted) against predictions on the training data.
plt.figure(figsize=(10, 6))
plt.scatter(y_train_pred, y_train - y_train_pred, c='green', marker='o', alpha=0.5)
plt.hlines(y=0, xmin=y_train_pred.min(), xmax=y_train_pred.max(), color='red')
plt.title('Residual Plot (Random Forest Regressor)')
plt.xlabel('Predicted Log(SalePrice)')
plt.ylabel('Residuals')
plt.show()

# Predict on the test features and undo the log scale with expm1 before export.
y_test_pred_log = best_rf.predict(X_test)
y_test_pred_price = np.expm1(y_test_pred_log)
rf_results = pd.DataFrame({'Id': X_test_ID, 'SalePrice_Predicted': y_test_pred_price})
rf_results.to_csv('random_forest_predictions.csv', index=False)

# Log this model's metrics in the shared comparison file.
save_results(
    model_name='Random Forest',
    rmsle_cv=final_cv_rmsle,
    rmsle_train=final_train_rmsle,
    mae_train=final_train_mae
)

print("\nRandom Forest Regressor Model implementation complete. Results saved.")