In [7]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Add target column (house prices)
df['MedHouseVal'] = data.target

# Show first few rows
print(df.head())



   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [9]:
# Check for missing values (important!)
print(df.isnull().sum())  # Good news: all should be 0

# Separate input (X) and output (y)
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# Standardize the input features (important for some models like SVR)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [13]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


In [15]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)


In [17]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [18]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)


In [21]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)


In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def print_scores(y_true, y_pred, model_name):
    print(f"----- {model_name} -----")
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("R2 Score:", r2_score(y_true, y_pred))
    print("\n")

# Call the function for all models
print_scores(y_test, y_pred_lr, "Linear Regression")
print_scores(y_test, y_pred_dt, "Decision Tree")
print_scores(y_test, y_pred_rf, "Random Forest")
print_scores(y_test, y_pred_gb, "Gradient Boosting")
print_scores(y_test, y_pred_svr, "SVR")


----- Linear Regression -----
MSE: 0.5558915986952441
MAE: 0.5332001304956565
R2 Score: 0.575787706032451


----- Decision Tree -----
MSE: 0.4942716777366763
MAE: 0.4537843265503876
R2 Score: 0.6228111330554302


----- Random Forest -----
MSE: 0.25549776668540763
MAE: 0.32761306601259704
R2 Score: 0.805024407701793


----- Gradient Boosting -----
MSE: 0.29399901242474274
MAE: 0.37165044848436773
R2 Score: 0.7756433164710084


----- SVR -----
MSE: 0.3551984619989429
MAE: 0.397763096343787
R2 Score: 0.7289407597956454




## 🔚 Conclusion

After applying five regression algorithms to the California Housing dataset:

- The **Random Forest Regressor** performed the best with an R2 score of **0.805**, the **lowest MSE** and **MAE**. This shows it can explain the most variation in house prices and makes smaller prediction errors.
  
- The **Linear Regression** model was the worst performer, with an R2 score of only **0.576**, suggesting that the linear assumption does not capture the complexity of the data well.

Overall, **ensemble models** like Random Forest and Gradient Boosting worked better on this dataset compared to simple models like Linear Regression or SVR.
