In [3]:

from sklearn.datasets import fetch_california_housing


In [11]:

california_housing = fetch_california_housing(as_frame=True)
data = california_housing.data
target = california_housing.target

data.head(), target.head()




(   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 
    Longitude  
 0    -122.23  
 1    -122.22  
 2    -122.24  
 3    -122.25  
 4    -122.25  ,
 0    4.526
 1    3.585
 2    3.521
 3    3.413
 4    3.422
 Name: MedHouseVal, dtype: float64)

In [15]:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer




In [17]:
columns = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude"
]
import numpy as np

np.random.seed(42)
data_demo = pd.DataFrame(
    np.random.rand(100, len(columns)) * 100,  
    columns=columns
)

target_demo = pd.Series(np.random.rand(100) * 5, name="MedianHouseValue")

missing_data = data_demo.isnull().sum()

imputer = SimpleImputer(strategy="mean")
data_cleaned = pd.DataFrame(imputer.fit_transform(data_demo), columns=data_demo.columns)

scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data_cleaned), columns=data_cleaned.columns)

data_scaled.head(), missing_data

(     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0 -0.320886  1.533118  1.035234   0.202727   -1.086062 -1.164789 -1.491266   
 1  0.416124  0.666156 -1.449925   1.446933    1.169332 -0.971025 -1.072888   
 2 -0.549554  0.011165 -0.012924  -0.827586    0.433821 -1.221534 -0.699889   
 3 -0.055683  0.941648 -0.824315  -0.080210    0.369008 -1.541503  0.366501   
 4 -1.327601  1.526584  1.851400   0.905643   -0.590603 -1.365356  0.625789   
 
    Longitude  
 0   1.223034  
 1  -1.271519  
 2  -0.603072  
 3  -1.318578  
 4  -0.333473  ,
 MedInc        0
 HouseAge      0
 AveRooms      0
 AveBedrms     0
 Population    0
 AveOccup      0
 Latitude      0
 Longitude     0
 dtype: int64)

In [19]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error




In [21]:

X_train, X_test, y_train, y_test = train_test_split(
    data_scaled, target_demo, test_size=0.2, random_state=42
)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

results

{'Linear Regression': 2.2636633900516943,
 'Decision Tree Regressor': 3.3926284958553254,
 'Random Forest Regressor': 2.1779378611614177,
 'Gradient Boosting Regressor': 2.0584577622895797,
 'Support Vector Regressor': 1.7650153658143346}

In [None]:
#this values indicates the average error of the models predictions. also it indicates that models predictions are closer to actual values
#support vector regression has the lowest error ,that is it is the best model here.
#decision  tree regression has the highest error ,that is it is the worst model.

In [23]:

from sklearn.metrics import mean_absolute_error, r2_score


In [25]:

evaluation_metrics = {
    "Model": [],
    "MSE": [],
    "MAE": [],
    "R2": []
}


for name, model in models.items():

    y_pred = model.predict(X_test)
        


In [27]:

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    evaluation_metrics["Model"].append(name)
    evaluation_metrics["MSE"].append(mse)
    evaluation_metrics["MAE"].append(mae)
    evaluation_metrics["R2"].append(r2)


In [31]:
# Convert results into a DataFrame for comparison
evaluation_df = pd.DataFrame(evaluation_metrics)
evaluation_df.sort_values(by="R2", ascending=False)


Unnamed: 0,Model,MSE,MAE,R2
0,Support Vector Regressor,1.765015,1.154086,0.020597


In [None]:
# low R2 value indicates that the model does not explain much varience in the target variable