In [40]:
# Importing Required Libraries

import pandas as pd  # For Data Handling
import numpy as np
import matplotlib.pyplot as plt  # For Visualization
import seaborn as sns
from sklearn.datasets import fetch_california_housing  # For Loading the Dataset
from sklearn.preprocessing import StandardScaler  # For Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression  # Regression Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Loading and Preprocessing

In [13]:
# Loading the Dataset
# `fetch_california_housing` is imported in the notebook's top import cell so
# that every dependency is visible in one place.
# The returned object exposes `.data`, `.feature_names` and `.target`, which
# the next cell uses to build the DataFrame.
data = fetch_california_housing()


In [21]:
# Build the working DataFrame: one column per feature, with the target
# appended as the last column under the name 'MedHouseValue'.

df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(MedHouseValue=data.target)
)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseValue
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [24]:
# Count the missing (NaN) entries in each column

df.isna().sum()

MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64

In [28]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
duplicates = df.duplicated(keep='first').sum()
print(f"Total Duplicate Rows: {duplicates}")

Total Duplicate Rows: 0


In [68]:
# Feature Scaling Using Standard Scaler
# Separate the feature columns from the target column.
X = df.drop(columns='MedHouseValue')
y = df['MedHouseValue']

# StandardScaler transforms each feature to have mean = 0 and standard deviation = 1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The train/test split cell reads `X_scaled`; the lowercase spellings are kept
# as aliases so both names refer to the same objects.
x, x_scaled = X, X_scaled

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Consider fitting it on the
# training split only.

# 2. Regression Algorithm Implementation

In [72]:
# Using Linear Regression

# Split the scaled features and the target: 80% train / 20% test.
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predict on the held-out split and compute the evaluation metrics
y_pred = lr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Result:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-Squared: {r2:.2f}")

Linear Regression Result:
Mean Squared Error: 0.56
Mean Absolut Error: 0.53
R-Squared: 0.58


# What is Linear Regression? 
Linear Regression is one of the simplest and most commonly used algorithms in regression tasks. It models the relationship between one or more independent variables (features) and a dependent variable (target) by fitting a straight line (or hyperplane in multiple dimensions) to the data. The model learns the best values for these coefficients by minimizing the mean squared error between the predicted values and target values.

# Why is it used for this regression task?
- The California Housing dataset is a regression problem, where the target is a continuous value (median house price), making Linear Regression a valid choice.
- It works well as a baseline model, helping to set a reference point for comparing more complex models.
- It is easy to interpret, and we can understand the impact of each feature on the price.
- The dataset features (like average income, number of rooms, etc.) often have linear relationships with housing prices, making Linear Regression potentially effective.

In [78]:
# Using Decision Tree Regressor
# Create the tree with a fixed seed and fit it on the training split
# (fit() hands back the estimator, so construction and training are chained).
dt_model = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)

# Predictions for the held-out test split
y_pred_dt = dt_model.predict(x_test)

# Score the predictions: MSE, MAE and R², in that order
mse_dt, mae_dt, r2_dt = (
    metric(y_test, y_pred_dt)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)


In [80]:
# Report the Decision Tree metrics, one per line, rounded to two decimals
print(
    "Decision Tree Regressor Results:",
    f"Mean Squared Error: {mse_dt:.2f}",
    f"Mean Absolute Error: {mae_dt:.2f}",
    f"R-squared Score: {r2_dt:.2f}",
    sep="\n",
)

Decision Tree Regressor Results:
Mean Squared Error: 0.49
Mean Absolute Error: 0.45
R-squared Score: 0.62


# Decision Tree Regressor:
A Decision Tree Regressor splits the data into smaller and smaller subsets by asking yes/no questions based on feature values. It keeps doing this until it reaches a leaf node, where it assigns a prediction (average of target values in that node). It works non-linearly, unlike linear regression, so it's great at capturing complex patterns.

# Why It's Suitable?
- It does not require feature scaling, which simplifies preprocessing.
- It can automatically handle feature interactions and identify important features.
- It is easy to visualize and interpret for small trees.
- Housing prices can depend on complex combinations of features, and Decision Trees capture such non-linear relationships well.

In [82]:
# Using Random Forest Regressor

# Create a 100-tree forest with a fixed seed and fit it on the training split
# (fit() hands back the estimator, so construction and training are chained).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)

# Predictions for the held-out test split
y_pred_rf = rf_model.predict(x_test)

# Score the predictions: MSE, MAE and R², in that order
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [83]:
# Report the Random Forest metrics, one per line, rounded to two decimals
print(
    "Random Forest Regressor Results:",
    f"Mean Squared Error: {mse_rf:.2f}",
    f"Mean Absolute Error: {mae_rf:.2f}",
    f"R-squared Score: {r2_rf:.2f}",
    sep="\n",
)

Random Forest Regressor Results:
Mean Squared Error: 0.26
Mean Absolute Error: 0.33
R-squared Score: 0.81


#  How Random Forest Regressor Works
A Random Forest Regressor is an ensemble model that builds many decision trees and averages their predictions. It creates multiple decision trees using random subsets of the data (both rows and columns). Each tree learns differently because of the randomness. For regression, it averages the predictions of all the trees to give the final result. This reduces the chance of overfitting that a single Decision Tree might face.

# Why It’s Suitable? 
- The dataset has complex, non-linear relationships — Random Forest can handle that easily.
- It’s robust against noise and overfitting due to averaging.
- It can handle a mix of features and capture important interactions.
- No need for scaling — it's scale-invariant.
- Very accurate and stable compared to individual models.

In [86]:
# Using Gradient Boosting Regressor

# Create the booster (100 stages, learning rate 0.1, fixed seed) and fit it on
# the training split; fit() hands back the estimator, so the calls are chained.
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(x_train, y_train)

# Predictions for the held-out test split
y_pred_gb = gb_model.predict(x_test)

# Score the predictions: MSE, MAE and R², in that order
mse_gb, mae_gb, r2_gb = (
    metric(y_test, y_pred_gb)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [88]:
# Report the Gradient Boosting metrics, one per line, rounded to two decimals
print(
    "Gradient Boosting Regressor Results:",
    f"Mean Squared Error: {mse_gb:.2f}",
    f"Mean Absolute Error: {mae_gb:.2f}",
    f"R-squared Score: {r2_gb:.2f}",
    sep="\n",
)

Gradient Boosting Regressor Results:
Mean Squared Error: 0.29
Mean Absolute Error: 0.37
R-squared Score: 0.78


# Gradient Boosting Regressor
It starts with a simple model (usually a Decision Tree). Then it calculates the error (called residuals). It builds a new tree to predict the residuals (the mistakes). It repeats this process multiple times, and the final prediction is the sum of the initial prediction and all the corrections. This step-by-step correction is called boosting.

# Why It’s Suitable 
- It captures complex, non-linear relationships very well.
- Usually gives high accuracy for structured/tabular data like this one.
- Can handle outliers better than some other models.

In [90]:
# Using Support Vector Regressor (SVR)

# Create the regressor with the RBF kernel (the default, suited to non-linear
# data) and fit it on the training split; fit() hands back the estimator.
svr_model = SVR(kernel='rbf').fit(x_train, y_train)

# Predictions for the held-out test split
y_pred_svr = svr_model.predict(x_test)

# Score the predictions: MSE, MAE and R², in that order
mse_svr, mae_svr, r2_svr = (
    metric(y_test, y_pred_svr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [91]:
# Report the SVR metrics, one per line, rounded to two decimals
print(
    "Support Vector Regressor (SVR) Results:",
    f"Mean Squared Error: {mse_svr:.2f}",
    f"Mean Absolute Error: {mae_svr:.2f}",
    f"R-squared Score: {r2_svr:.2f}",
    sep="\n",
)

Support Vector Regressor (SVR) Results:
Mean Squared Error: 0.36
Mean Absolute Error: 0.40
R-squared Score: 0.73


# Support Vector Regressor (SVR)
SVR (Support Vector Regression) is the regression version of SVM (Support Vector Machine). It tries to find a hyperplane (or curve) that predicts continuous values. The goal is to fit the data within a margin of tolerance (epsilon) — instead of minimizing the error for every point, it ignores small errors and focuses on larger ones. Only data points on or outside this margin (the support vectors) are used to define the model.

# Why It’s Suitable? 
- If relationships between features and target are non-linear, SVR with RBF kernel can perform well.
- Works well on smaller datasets – this dataset is moderate in size, so it can handle it.
- Can model complex boundaries without overfitting (with proper tuning).

## Model Evaluation Summary

# How to Choose the Best?

1. R² Score: Higher is better → closer to 1.
2. MSE & MAE: Lower is better → less error.

In [94]:
# Collect every model's metrics into a single comparison table

# One (name, MSE, MAE, R²) tuple per model, in the order the models were trained
comparison_rows = [
    ('Linear Regression', mse, mae, r2),
    ('Decision Tree', mse_dt, mae_dt, r2_dt),
    ('Random Forest', mse_rf, mae_rf, r2_rf),
    ('Gradient Boosting', mse_gb, mae_gb, r2_gb),
    ('SVR', mse_svr, mae_svr, r2_svr),
]
column_names = ['Model', 'MSE', 'MAE', 'R2 Score']

# Transpose the rows into a column-name -> list-of-values dictionary
results = {
    name: list(values)
    for name, values in zip(column_names, zip(*comparison_rows))
}

# Turn the dictionary into a DataFrame
results_df = pd.DataFrame(results)

# Show the heading followed by the table
print("Model Performance-Comparison:", results_df, sep="\n")

Model Performance-Comparison:
               Model       MSE       MAE  R2 Score
0  Linear Regression  0.555892  0.533200  0.575788
1      Decision Tree  0.494272  0.453784  0.622811
2      Random Forest  0.255498  0.327613  0.805024
3  Gradient Boosting  0.293999  0.371650  0.775643
4                SVR  0.355198  0.397763  0.728941


# Best-Performing Algorithm:
- Random Forest Regressor performed the best, achieving the lowest MSE and MAE, and the highest R² score (0.81).
- It works well for this dataset because it combines multiple decision trees to reduce variance and improve generalization, handling both linear and non-linear patterns effectively.

# Worst-Performing Algorithm:
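- Linear Regression and the Decision Tree Regressor had the lowest R² scores (0.58 and 0.62) and the highest error rates.
- Linear Regression assumes linearity, which may not fully capture complex patterns in housing data.
- A single, unconstrained Decision Tree likely overfits the training data, which limits how well it generalizes to the test set.
- SVR (R² ≈ 0.73) landed in the middle, behind both ensemble models, likely due to sensitivity to hyperparameters and the dataset size; it requires careful tuning to be effective.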