<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/house_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Step 2: Load the California Housing dataset (used here as an example)
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
# Scale the target by 100,000 so that prices read as dollar amounts
# NOTE(review): relies on the dataset's target being expressed in units of $100,000 — confirm against the dataset docs
y = 100_000 * data.target

# Step 3: Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Fit an untuned random forest (only the seed is set) to serve as the baseline
baseline_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Baseline price predictions for the held-out test rows
baseline_predictions = baseline_model.predict(X_test)

# Step 5: Score the baseline on the test set (mean squared error and R²)
mse_baseline = mean_squared_error(y_true=y_test, y_pred=baseline_predictions)
r2_baseline = r2_score(y_true=y_test, y_pred=baseline_predictions)

print(f"Before Tuning - MSE: {mse_baseline:.2f}, R²: {r2_baseline:.2f}")

# Step 6: Hyperparameter tuning with an exhaustive grid search
# 3 x 3 x 3 = 27 candidate combinations, each evaluated with 5-fold cross-validation
param_grid = {
    'n_estimators': [50, 100, 200],   # how many trees the forest grows
    'max_depth': [5, 10, 20],         # maximum depth allowed for each tree
    'min_samples_split': [2, 5, 10],  # smallest node size that may still be split
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # negated MSE, so a larger score means a better fit
    cv=5,
    n_jobs=-1,  # run the fits in parallel
    verbose=2,
)

# The search only ever sees the training split; the test set stays untouched
grid_search.fit(X_train, y_train)

# Step 7: Retrieve the tuned model found by the grid search
# No fit call is made here; the model is read straight from the finished search.
# NOTE(review): best_estimator_ is only populated when GridSearchCV's refit option is
# enabled (left at its default above) — confirm the default for the installed version
best_model = grid_search.best_estimator_

# Tuned-model price predictions for the same held-out test rows
optimized_predictions = best_model.predict(X_test)

# Step 8: Score the tuned model with the same metrics used for the baseline
mse_optimized = mean_squared_error(y_true=y_test, y_pred=optimized_predictions)
r2_optimized = r2_score(y_true=y_test, y_pred=optimized_predictions)

print(f"After Tuning - MSE: {mse_optimized:.2f}, R²: {r2_optimized:.2f}")
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Step 9: Show Real Predictions vs Actual Prices
# Side-by-side table of the first 10 test rows: the true price next to what each
# model predicted. Both prediction arrays come from predict(X_test), so row i of
# every column refers to the same test sample.
comparison_df = pd.DataFrame({
    "Actual Price ($)": y_test[:10],
    "Baseline Prediction ($)": baseline_predictions[:10],
    "Optimized Prediction ($)": optimized_predictions[:10]
})

# Rich (HTML) rendering of the table; `display` is imported from IPython.display
# together with the other libraries at the top of the cell.
display(comparison_df)


Before Tuning - MSE: 2571542838.41, R²: 0.80
Fitting 5 folds for each of 27 candidates, totalling 135 fits
After Tuning - MSE: 2550367336.58, R²: 0.81
Best Hyperparameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


Unnamed: 0,Actual Price ($),Baseline Prediction ($),Optimized Prediction ($)
0,47700.0,51208.0,49598.223141
1,45800.0,73834.0,73931.692969
2,500001.0,495177.75,487723.175
3,218600.0,253689.0,254186.251948
4,278000.0,225876.0,226611.252942
5,158700.0,165728.0,165144.591908
6,198200.0,235075.0,236846.836061
7,157500.0,166960.0,166428.775316
8,340000.0,279674.07,264395.717026
9,446600.0,491775.88,489612.87
