# 🏠 California Housing Price Prediction
Using Linear Regression and Random Forest

In [None]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# 📥 Load the California housing data into one DataFrame
# The features are requested as a DataFrame (as_frame=True); `assign` builds a
# new frame, so `data.data` is left untouched and `target` is the last column.
data = fetch_california_housing(as_frame=True)
df = data.data.assign(target=data.target)
df.head()


## 🔍 EDA - Heatmap & Distribution

In [None]:
# Count the missing values in each column
df.isna().sum()


In [None]:
# Correlation heatmap: pairwise correlations of every column, target included
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Histogram of the target column (40 bins) with a KDE curve overlaid
hist_ax = sns.histplot(data=df, x='target', bins=40, kde=True)
hist_ax.set_title("Distribution of House Values")
plt.show()


## 📊 Linear Regression Model

In [None]:
# Separate the features from the target and hold out 20% of the rows for
# evaluation; the fixed seed makes the split reproducible.
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training split and predict the held-out rows
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt works on every version.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print(f"Linear Regression R²: {r2_score(y_test, y_pred_lr):.2f}")
print(f"Linear Regression RMSE: {rmse_lr:.2f}")


In [None]:
# Linear regression: actual vs predicted on the held-out test split
sns.scatterplot(x=y_test, y=y_pred_lr, alpha=0.4)

# Dashed y = x reference line (where a perfect prediction would fall), so this
# chart can be read the same way as the Random Forest one. The line spans the
# observed range of the actual values.
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, 'r--')

plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Linear Regression: Actual vs Predicted")
plt.show()


## 🌲 Random Forest Regressor

In [None]:
# Fit a 100-tree random forest (fixed seed for reproducibility) on the same
# training split and predict the held-out rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt works on every version.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print(f"Random Forest R²: {r2_score(y_test, y_pred_rf):.2f}")
print(f"Random Forest RMSE: {rmse_rf:.2f}")


In [None]:
# Random forest: actual vs predicted on the held-out test split
sns.scatterplot(x=y_test, y=y_pred_rf, alpha=0.4)

# Dashed y = x reference line (where a perfect prediction would fall). Its
# endpoints come from the observed actual values rather than a hard-coded
# [0, 5], so the line stays correct if the target range changes.
diag = [y_test.min(), y_test.max()]
plt.plot(diag, diag, 'r--')

plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Random Forest: Actual vs Predicted")
plt.show()


In [None]:
# Feature importance: pair each feature name with the fitted forest's
# importance score and rank them from most to least important.
features = X.columns
importances = rf.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bars, one per feature, in ranked order
importance_ax = sns.barplot(data=importance_df, x='Importance', y='Feature')
importance_ax.set_title("Feature Importance from Random Forest")
plt.show()


## ✅ Summary
- Random Forest outperformed Linear Regression
- `MedInc` was the most important feature
- Random Forest predictions were more closely aligned with the actual values than those of Linear Regression
- Further improvement is possible through hyperparameter tuning and more advanced feature engineering.