# In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Steps 1-5: data preparation.
#
# Fetch the California Housing data and gather the feature columns and the
# target into one DataFrame, with the target stored under the name 'PRICE'.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    PRICE=housing.target
)

# Separate the predictors from the value being predicted.
X = df.drop(columns='PRICE')
y = df['PRICE']

# Model the target on a log scale. log1p computes log(1 + x), so a zero
# target would not produce -inf; np.expm1 undoes it later.
y_log = np.log1p(y)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both splits, so no
# information from the test set leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 6: Train the Random Forest model on the scaled training features.
# y_train holds log1p-transformed prices, so the forest learns in log space.
# random_state fixes the forest's randomness so results are reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 7: Make predictions on the held-out test set (still in log space)
y_pred_log = rf_model.predict(X_test)

# Step 8: Reverse the log transformation (expm1 is the inverse of log1p).
# The true test values are inverted once here and reused for both the
# metrics and the plot instead of being recomputed at every use.
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

# Step 9: Evaluate the model on the original (un-logged) target scale
mse = mean_squared_error(y_test_actual, y_pred)
r2 = r2_score(y_test_actual, y_pred)

print(f"Random Forest Mean Squared Error (MSE): {mse}")
# The superscript two is written as an escape so the label cannot be
# garbled by a file-encoding mismatch (it previously printed as "RÂ²").
print(f"Random Forest R-squared (R\u00b2) Score: {r2}")

# Step 10: Visualize the predictions vs actual values
plt.scatter(y_test_actual, y_pred)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Random Forest Actual vs Predicted House Prices')
plt.show()

# Step 11: Example of making a prediction for a new data point.
#
# The sample is built as a one-row DataFrame with every feature named, and
# its columns are taken from X so the order always matches the training
# data. The scaler was fitted on a DataFrame, so passing a bare positional
# array makes scikit-learn warn about missing feature names and would let a
# column-order mistake go unnoticed.
#
# The values below describe a plausible California block group. The earlier
# example used values outside the dataset's domain (e.g. Latitude 1000).
new_data = pd.DataFrame(
    [
        {
            'MedInc': 3.5,         # median income of the block group
            'HouseAge': 25.0,      # median house age
            'AveRooms': 5.0,       # average rooms per household
            'AveBedrms': 1.0,      # average bedrooms per household
            'Population': 1200.0,  # block group population
            'AveOccup': 3.0,       # average household members
            'Latitude': 34.05,
            'Longitude': -118.25,
        }
    ],
    columns=X.columns,
)

# Apply the same preprocessing as in training, predict in log space, then
# invert the log1p transform to get back to the target's original scale.
new_data_scaled = scaler.transform(new_data)
predicted_price = rf_model.predict(new_data_scaled)
predicted_price = np.expm1(predicted_price)  # Reverse log transformation

# The dataset's target is expressed in units of $100,000, hence the factor.
print(f"Predicted House Price for new data: {predicted_price[0]*100000}")
