# California Housing Price Prediction (No-Terminal Notebook)

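Run **Cell → Run All** to execute every step in order: load the California housing dataset, train a scaled RandomForest pipeline, save it under `models/`, evaluate it on a held-out test split, plot feature importances, and make an example prediction.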

In [None]:

# 1) Imports & setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import joblib

# Trained artifacts go in ./models, resolved against the kernel's working
# directory (printed below so the location is unambiguous).
OUT_DIR = Path("models")
MODEL_PATH = OUT_DIR.joinpath("rf_cali_housing.joblib")

# Create the folder up front; a pre-existing folder is not an error.
OUT_DIR.mkdir(exist_ok=True, parents=True)

print("Notebook working directory:", Path.cwd())
print("Model will be saved to:", MODEL_PATH)


In [None]:

# 2) Load dataset
# as_frame=True returns pandas objects; `.frame` is copied so later edits
# never touch the object held by the loader's return value.
data = fetch_california_housing(as_frame=True)
df = data.frame.copy()

# Split the frame into the target column and the remaining predictors.
y = df["MedHouseVal"]
X = df.drop(columns=["MedHouseVal"])

print("Features:", X.columns.tolist())
print("Shapes -> X:", X.shape, "| y:", y.shape)
X.head()


In [None]:

# 3) Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


In [None]:

# 4) Pipeline: scale numeric + RandomForest
# Every column of X is treated as numeric and standardized before the forest.
numeric_features = list(X.columns)

pre = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric_features)],
    remainder="drop",
)
model = RandomForestRegressor(
    n_estimators=300,
    n_jobs=-1,  # use all available cores
    random_state=42,
)
pipe = Pipeline(steps=[("pre", pre), ("model", model)])

# Fit on the training split only, then persist the whole fitted pipeline
# (preprocessing + forest) as a single artifact.
pipe.fit(X_train, y_train)
joblib.dump(pipe, MODEL_PATH)
print("✅ Trained and saved model to:", MODEL_PATH)


In [None]:

# 5) Evaluate
# Score the fitted pipeline on the held-out test split.
preds = pipe.predict(X_test)
mae = mean_absolute_error(y_test, preds)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises TypeError. The sqrt form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
r2 = r2_score(y_test, preds)

print(f"MAE : {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R^2 : {r2:.3f}")


In [None]:

# 6) Feature importances
# Pull the fitted forest out of the pipeline and rank its importances.
# (pandas is already imported in the first cell; no re-import needed here.)
rf = pipe.named_steps["model"]
importances = rf.feature_importances_

# The transformer scales `numeric_features` in the listed order and drops
# everything else, so importances line up one-to-one with that list.
fi = (
    pd.DataFrame({"feature": numeric_features, "importance": importances})
    .sort_values("importance", ascending=False)
)

# sep="" avoids the stray space print() would otherwise insert after "\n",
# which shifts the table header out of alignment with its rows.
print("Top features:\n", fi.head(10), sep="")

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(fi["feature"], fi["importance"])
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Feature Importances (RandomForest)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


In [None]:

# 7) Quick prediction helper
# Column names the pipeline was trained on, in training order.
FEATURE_ORDER = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude",
]

# The dataset's target is expressed in units of $100,000.
DOLLARS_PER_TARGET_UNIT = 100_000


def predict_price(medinc, house_age, ave_rooms, ave_bedrms, population, ave_occup,
                  latitude, longitude, model=None):
    """Predict the median house value, in dollars, for a single district.

    Parameters
    ----------
    medinc, house_age, ave_rooms, ave_bedrms, population, ave_occup, latitude, longitude
        Feature values for one district, matching ``FEATURE_ORDER``.
    model : object with a ``predict`` method, optional
        Fitted pipeline to use. When omitted, the pipeline saved at
        ``MODEL_PATH`` is loaded from disk.

    Returns
    -------
    float
        The model's prediction multiplied by ``DOLLARS_PER_TARGET_UNIT``.
    """
    # The saved pipeline's ColumnTransformer selects columns by *name*, which
    # only works on a DataFrame; a bare NumPy array raises ValueError
    # ("Specifying the columns using strings is only supported for dataframes").
    row = pd.DataFrame(
        [[medinc, house_age, ave_rooms, ave_bedrms, population, ave_occup, latitude, longitude]],
        columns=FEATURE_ORDER,
        dtype=float,
    )
    if model is None:
        # joblib.load unpickles the file: only load artifacts you trust
        # (here, the one written by the training cell of this notebook).
        model = joblib.load(MODEL_PATH)
    return float(model.predict(row)[0]) * DOLLARS_PER_TARGET_UNIT

# Try the helper on one hand-written district; keyword arguments make each
# value's meaning explicit.
example = predict_price(
    medinc=5.0,
    house_age=25,
    ave_rooms=6.0,
    ave_bedrms=1.0,
    population=1000,
    ave_occup=3.0,
    latitude=34.2,
    longitude=-118.3,
)
print(f"Example predicted price: ${example:,.0f}")
