In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# =======================
# 1️⃣ Load California Dataset
# =======================
california = fetch_california_housing(as_frame=True)

# Work on a copy: `california.frame` is the Bunch's own DataFrame, so adding a
# column through a plain alias would silently modify the loaded dataset object.
df = california.frame.copy()

# The frame already contains the target as 'MedHouseVal'; this only exposes it
# under a second, more descriptive name (both columns hold identical values).
df['median_house_value'] = df['MedHouseVal']

# =======================
# 2️⃣ Display first few rows
# =======================
# Printed before the rename below, so the original sklearn column names show.
print("Sample data:\n", df.head(), "\n")

# =======================
# 3️⃣ Select features
# =======================
# sklearn's version of the dataset has no 'ocean_proximity' column and no raw
# room/bedroom totals. Its columns are:
# ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
#  'Latitude', 'Longitude'] plus the target 'MedHouseVal'.
#
# NOTE: 'AveRooms' and 'AveBedrms' are per-household averages. After the rename
# the columns called 'total_rooms' / 'total_bedrooms' therefore still hold
# AVERAGES, not block totals. The names are kept only to line up with the
# requested feature list; do not interpret them as counts.
df = df.rename(columns={
    'Longitude': 'longitude',
    'Latitude': 'latitude',
    'HouseAge': 'housing_median_age',
    'AveRooms': 'total_rooms',          # actually average rooms per household
    'AveBedrms': 'total_bedrooms',      # actually average bedrooms per household
    'Population': 'population',
    'MedInc': 'median_income'
})

# 'AveOccup' is intentionally left out of the model, and 'MedHouseVal' must
# never be added here: it is the target itself and would leak the answer.
features = ['longitude', 'latitude', 'housing_median_age',
            'total_rooms', 'total_bedrooms', 'population', 'median_income']

X = df[features]
y = df['median_house_value']

# =======================
# 4️⃣ Split into train and test sets
# =======================
# 80/20 split; the fixed random_state keeps the split (and metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =======================
# 5️⃣ Train Linear Regression model
# =======================
# Ordinary least squares on the unscaled features.
model = LinearRegression()
model.fit(X_train, y_train)

# =======================
# 6️⃣ Make predictions
# =======================
y_pred = model.predict(X_test)

# =======================
# 7️⃣ Evaluate model
# =======================
# RMSE is in the same units as the target column; R² is unitless.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
print(f"R² Score: {r2:.3f}")


Sample data:
    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  median_house_value  
0    -122.23        4.526               4.526  
1    -122.22        3.585               3.585  
2    -122.24        3.521               3.521  
3    -122.25        3.413               3.413  
4    -122.25        3.422               3.422   

Root Mean Squared Error (RMSE): 0.746
R² Score: 0.575
