In [2]:
# 1. Loading and Preprocessing

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# Fetch the California Housing bunch and assemble one DataFrame:
# the feature matrix under its published column names, plus the
# regression target appended as a final 'Target' column.
california = fetch_california_housing()
df = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(Target=california.target)

In [4]:
# Count NaN entries per column; the printed Series shows one count per column.
missing_per_column = df.isna().sum()
print("Missing values in dataset:\n", missing_per_column)


Missing values in dataset:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


In [6]:
# Separate the predictors from the response column.
features = df.drop(columns='Target')
y = df['Target']

# Standardize every predictor column with a freshly fitted StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

In [8]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only. Fitting StandardScaler on the full dataset before splitting lets
# the test rows' means/variances leak into the training features, which makes
# the held-out scores slightly optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df.drop('Target', axis=1), y, test_size=0.2, random_state=42
)

# A fresh scaler, so no statistics from an earlier full-data fit are reused.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics as-is


In [10]:
# Preprocessing Summary (Markdown-style comment):
"""
Preprocessing Steps:
- Loaded the California Housing dataset using `fetch_california_housing`.
- Converted to pandas DataFrame for clarity.
- Checked for and confirmed no missing values.
- Applied `StandardScaler` for feature scaling, essential for scale-sensitive algorithms like SVR; tree-based models (Decision Tree, Random Forest, Gradient Boosting) are largely unaffected by feature magnitude.
"""

'\nPreprocessing Steps:\n- Loaded the California Housing dataset using `fetch_california_housing`.\n- Converted to pandas DataFrame for clarity.\n- Checked for and confirmed no missing values.\n- Applied `StandardScaler` for feature scaling, essential for scale-sensitive algorithms like SVR; tree-based models (Decision Tree, Random Forest, Gradient Boosting) are largely unaffected by feature magnitude.\n'

In [15]:
# 2. Regression Algorithm Implementation

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Candidate regressors, keyed by the label that appears in the results table.
# The tree-based estimators share one seed so repeated runs give the same fits.
model_seed = 42
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=model_seed)),
    ("Random Forest", RandomForestRegressor(random_state=model_seed)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=model_seed)),
    ("Support Vector Regressor", SVR()),
])

# One metrics dict per model gets appended here by the evaluation loop.
results = []

In [17]:
# Train and evaluate each model
#
# `results` is reset here, in the same cell as the loop, so that re-running
# this cell rebuilds the list instead of appending a second copy of every
# model's row (which would duplicate entries in the comparison table).
results = []
for name, model in models.items():
    # Fit on the training split, then score on the held-out split only.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    # One row per model; the keys become the results table's column names.
    results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R²": r2
    })

In [18]:
# 3. Model Evaluation and Comparison

# Tabulate the collected metrics and rank the models by R², highest first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="R²", ascending=False)

print("\nModel Comparison:\n")
print(results_df)

# With the table sorted descending, the first row holds the highest R²
# and the last row the lowest.
best_model, worst_model = results_df.iloc[0], results_df.iloc[-1]

print(f"\nBest Model: {best_model['Model']} with R² = {best_model['R²']:.4f}")
print(f"Worst Model: {worst_model['Model']} with R² = {worst_model['R²']:.4f}")


Model Comparison:

                      Model       MSE       MAE        R²
2             Random Forest  0.255498  0.327613  0.805024
3         Gradient Boosting  0.293999  0.371650  0.775643
4  Support Vector Regressor  0.355198  0.397763  0.728941
1             Decision Tree  0.494272  0.453784  0.622811
0         Linear Regression  0.555892  0.533200  0.575788

Best Model: Random Forest with R² = 0.8050
Worst Model: Linear Regression with R² = 0.5758


In [None]:
# Summary:
"""
Model Insights:
- Linear Regression: Assumes a linear relationship. Useful as a baseline (R² ≈ 0.58), but it misses the non-linear structure in housing prices.
- Decision Tree: Handles non-linear relationships well but prone to overfitting.
- Random Forest: Ensemble method, good generalization and robust to overfitting.
- Gradient Boosting: Boosts weak learners sequentially. High accuracy and works well with clean data.
- SVR: Effective in high-dimensional space but sensitive to feature scaling and parameters.

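Best Performer: Random Forest Regressor (R² ≈ 0.805) — averaging many decorrelated trees lowers variance while still capturing non-linear effects.
Worst Performer: Linear Regression (R² ≈ 0.576) — a purely linear fit cannot capture the non-linear relationships and feature interactions in this dataset.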
"""
