In [2]:
# Import necessary libraries
import pandas as pd  # For handling tabular data
from sklearn.datasets import fetch_california_housing  # Built-in dataset loader
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.ensemble import RandomForestRegressor  # Random Forest model for regression
from sklearn.metrics import mean_squared_error, r2_score  # Model evaluation metrics


# ---------------------------------------------------------------------------
# Data: fetch the California Housing bunch and assemble a single DataFrame
# ---------------------------------------------------------------------------
california_housing = fetch_california_housing()

# Feature columns come from the bunch; the target is appended as a trailing
# 'MEDV' column (median house value) in the same expression.
california_data = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
).assign(MEDV=california_housing.target)


# ---------------------------------------------------------------------------
# Features / target
# ---------------------------------------------------------------------------
y = california_data['MEDV']               # value the model learns to predict
X = california_data.drop(columns='MEDV')  # every remaining column is a feature


# ---------------------------------------------------------------------------
# Hold-out split: 20% of the rows are reserved for testing.
# The fixed random_state makes the split repeatable between runs.
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# ---------------------------------------------------------------------------
# Model: a forest of 100 trees, seeded so the fitted model is repeatable.
# fit() returns the estimator itself, so construction and training are chained.
# ---------------------------------------------------------------------------
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)


# ---------------------------------------------------------------------------
# Evaluation on the held-out rows
# ---------------------------------------------------------------------------
y_pred = rf_regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)             # share of target variance explained; closer to 1 is better
mse = mean_squared_error(y_test, y_pred)  # mean of squared residuals; lower is better (0 = perfect)


# ---------------------------------------------------------------------------
# Spot check: predict for the first test row only
# ---------------------------------------------------------------------------
# The list index [[0]] keeps the row as a one-row DataFrame (2-D), which is
# the input shape predict() was given above.
single_data = X_test.iloc[[0]]
predicted_value = rf_regressor.predict(single_data)


# ---------------------------------------------------------------------------
# Report: one line each for the spot check and the two test-set metrics
# ---------------------------------------------------------------------------
print(
    f"Predicted Value: {predicted_value[0]:.2f}",
    f"Actual Value: {y_test.iloc[0]:.2f}",
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)


Predicted Value: 0.51
Actual Value: 0.48
Mean Squared Error: 0.26
R-squared Score: 0.80


In [None]:
# MODEL RESULTS EXPLANATION — CALIFORNIA HOUSING RANDOM FOREST REGRESSION

# Predicted Value: 0.51
# The model estimated the median house value for one district to be 0.51 × $100,000 = $51,000.
# This is the model's prediction for that specific test data sample.

# Actual Value: 0.48
# The real (true) median house value for the same district is 0.48 × $100,000 = $48,000.
# The prediction is very close — only about $3,000 off — showing good accuracy on this example.

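# Mean Squared Error (MSE): 0.26
# MSE measures the average squared difference between predicted and actual values.
# The lower this value, the better the model performance (0 = perfect).
# Because the target is expressed in units of $100,000, MSE is in squared units and is hard to read directly.
# Its square root (RMSE) is √0.26 ≈ 0.51, i.e. a typical prediction error of roughly $51,000.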

# R-squared (R²) Score: 0.80
# R² measures how well the model explains the variation in house prices.
# Its maximum is 1.0, and on held-out data it can be negative:
#   1.0 → perfect prediction
#   0.0 → no better than always predicting the mean house value
#   < 0 → worse than always predicting the mean house value
# An R² of 0.80 means the model explains about 80% of the variation in test-set house values — strong performance.

# Summary:
# - The Random Forest Regressor captures most of the important patterns in the data (R² ≈ 0.80 on the test set).
# - Its typical error is moderate (RMSE = √MSE ≈ 0.51, about $51,000); the single example above is closer than average.
# - Random forests are well suited to regression tasks involving non-linear relationships between features.
