In [13]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [11]:
# Step 2: Load the California Housing dataset
# The returned object is kept whole; later cells read its .data, .feature_names
# and .target attributes to build the DataFrame.

housing = fetch_california_housing()


In [15]:
# Step 3: Convert to pandas DataFrame
# Feature matrix becomes labelled columns; the regression target is attached
# as an extra 'MedHouseVal' column in the same expression.

df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    MedHouseVal=housing.target
)


In [23]:
# Step 4: Check for missing values
# Count NaN entries per column (features and target alike) and report them.

missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)



Missing values in each column:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [None]:
# There are no missing values in the California Housing dataset.

In [17]:
# Step 5: Feature Scaling (Standardization)
# Separate the predictors from the target, then standardize every predictor column.
# Note: the scaler is fitted on all rows of df (no train/test split exists yet here).

features = df.drop(columns='MedHouseVal')
scaler = StandardScaler()
scaled_features = scaler.fit(features).transform(features)

In [19]:
# Rebuild a labelled DataFrame from the scaled array and re-attach the
# (unscaled) target column so features and target travel together.
scaled_df = pd.DataFrame(scaled_features, columns=features.columns).assign(
    MedHouseVal=df['MedHouseVal'].to_numpy()
)

In [25]:
# Preview the first rows of the scaled DataFrame (rendered as the cell output)
scaled_df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,4.526
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,3.585
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,3.521
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,3.413
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,3.422


In [None]:
# 1. Loading the dataset
# fetch_california_housing() retrieves a clean and well-structured dataset from sklearn.datasets.


# 2. Conversion to a DataFrame
# A pandas DataFrame is easier to work with because pandas offers powerful functions for data exploration, visualization, and manipulation.

# 3. Handling missing values
# Checking for missing values is a best practice, even when (as here) none are found.

# 4. Feature scaling (standardization)
# Many machine learning algorithms (e.g., k-NN, SVM, and regularized or gradient-descent-based linear models) are sensitive to the scale of the features.
# Standardization ((x - mean) / std) centers each feature around zero with unit variance, improving model performance and convergence.

In [None]:
# Qn 2

In [27]:
# Step 1: Import regression models and metrics

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [29]:
# Step 2: Split the data, then scale with training-set statistics only
#
# scaled_df was standardized with a scaler fitted on every row, so splitting it
# directly would let test-set means/variances influence the training features
# (data leakage). Instead, split the raw features and fit a scaler on the
# training rows alone.

# X and y keep their original definitions so any cell that reads them still works.
X = scaled_df.drop('MedHouseVal', axis=1)
y = scaled_df['MedHouseVal']

# df and scaled_df share the same row order and index, so y lines up with the raw features.
X_raw = df.drop(columns='MedHouseVal')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Fit on the training rows only, then apply that same transform to both splits.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_raw.columns,
    index=X_test_raw.index,
)


In [31]:
# Step 3: Define models
# Candidate regressors keyed by display name; the evaluation loops iterate
# over them in the order listed here.

models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR()),
])

In [33]:
# Step 4: Train and evaluate each model
# Each estimator is fitted on the training split and scored on the held-out
# split; MSE and R^2 are printed per model.

for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    preds = model.fit(X_train, y_train).predict(X_test)
    mse, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)
    print(
        f"\n{name}",
        f"Mean Squared Error: {mse:.4f}",
        f"R^2 Score: {r2:.4f}",
        sep="\n",
    )


Linear Regression
Mean Squared Error: 0.5559
R^2 Score: 0.5758

Decision Tree
Mean Squared Error: 0.4943
R^2 Score: 0.6228

Random Forest
Mean Squared Error: 0.2555
R^2 Score: 0.8050

Gradient Boosting
Mean Squared Error: 0.2940
R^2 Score: 0.7756

Support Vector Regressor
Mean Squared Error: 0.3552
R^2 Score: 0.7289


In [35]:
# EXPLANATIONS
#  Linear Regression
#How it works: Fits a straight line that minimizes the sum of squared differences between actual and predicted values.

#Suitability: Good baseline for regression tasks. Simple and interpretable. May underperform on complex relationships.


#Decision Tree Regressor
#How it works: Splits the data into regions using decision rules (based on feature thresholds), fitting a constant value in each region.

#Suitability: Handles non-linear relationships and doesn’t require feature scaling. Prone to overfitting when grown to full depth; the tree here uses default settings, so tuning (e.g., limiting max_depth) could improve it.


#Random Forest Regressor
#How it works: An ensemble of decision trees trained on different random subsets of the data. Final prediction is the average of all tree predictions.

#Suitability: Reduces overfitting, handles non-linearities and interactions well. Great general-purpose model for structured/tabular data like housing prices.


# Gradient Boosting Regressor
#How it works: Builds an ensemble of trees sequentially, where each new tree tries to correct the errors of the previous one.

#Suitability: More accurate than random forests in many cases but also more sensitive to parameters. Ideal for capturing complex patterns.


# Support Vector Regressor (SVR)
#How it works: Tries to fit the best line (or curve) such that most data points lie within a certain margin (epsilon). Uses kernel tricks for non-linearity.

#Suitability: Works well with small-to-medium datasets and high-dimensional feature spaces. Needs feature scaling, which we've already done.





In [None]:
# Qn 3

In [37]:
from sklearn.metrics import mean_absolute_error

In [45]:
# Collect MSE / MAE / R² for every model, keyed by model name
results = {}

# Refit each estimator on the training split and score it on the test split
for name, model in models.items():
    preds = model.fit(X_train, y_train).predict(X_test)

    mse, mae, r2 = (
        mean_squared_error(y_test, preds),
        mean_absolute_error(y_test, preds),
        r2_score(y_test, preds),
    )

    results[name] = dict(zip(('MSE', 'MAE', 'R²'), (mse, mae, r2)))
    print(
        f"\n{name}",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"R² Score: {r2:.4f}",
        sep="\n",
    )


Linear Regression
Mean Squared Error (MSE): 0.5559
Mean Absolute Error (MAE): 0.5332
R² Score: 0.5758

Decision Tree
Mean Squared Error (MSE): 0.4943
Mean Absolute Error (MAE): 0.4538
R² Score: 0.6228

Random Forest
Mean Squared Error (MSE): 0.2555
Mean Absolute Error (MAE): 0.3276
R² Score: 0.8050

Gradient Boosting
Mean Squared Error (MSE): 0.2940
Mean Absolute Error (MAE): 0.3717
R² Score: 0.7756

Support Vector Regressor
Mean Squared Error (MSE): 0.3552
Mean Absolute Error (MAE): 0.3978
R² Score: 0.7289


In [47]:
# Best-Performing Algorithm: Random Forest Regressor
# Justification:
#  Achieves the lowest MSE and MAE, and highest R² score.

 # Robust to overfitting due to ensemble averaging.

# Captures complex, non-linear relationships in the housing data effectively.



 # Worst-Performing Algorithm: Linear Regression
# Reasoning:
# Highest MSE and MAE, and lowest R² score.

# Assumes a linear relationship between features and target, which is too simplistic for housing data.

# Doesn’t capture interactions or non-linearities well.

