In [75]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import SGDRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


# Fetch the California Housing data and assemble one DataFrame holding the
# feature columns plus the target, stored under the name 'HousingPrice'.
dataset = fetch_california_housing()
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(HousingPrice=dataset.target)

# Preview the first five rows.
print(df.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  HousingPrice  
0    -122.23         4.526  
1    -122.22         3.585  
2    -122.24         3.521  
3    -122.25         3.413  
4    -122.25         3.422  


In [92]:
# Two regression targets: 'AveOccup' and 'HousingPrice'.
# Every other column of df is used as an input feature.
target_columns = ['AveOccup', 'HousingPrice']
X = df.drop(columns=target_columns)
Y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Standardize features and targets. Both scalers are fitted on the training
# rows only; the test features are transformed with the training statistics.
scaler_X = StandardScaler().fit(X_train)
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

scaler_Y = StandardScaler().fit(Y_train)
Y_train = scaler_Y.transform(Y_train)

# Y_test is deliberately left in its original units: it is only used for
# evaluation, where predictions are inverse-transformed before comparison.



In [77]:
# Base learner: a linear regressor trained with stochastic gradient descent.
# random_state pins the estimator's internal randomness (e.g. data shuffling)
# so that re-running the notebook reproduces the same fitted model and metrics;
# the value matches the seed used for the train/test split.
sgd = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

# Wrap the base learner in MultiOutputRegressor to handle the two target columns.
multi_output_sgd = MultiOutputRegressor(sgd)

# Train on the standardized training features and targets.
multi_output_sgd.fit(X_train, Y_train)


In [90]:
# Predict in the standardized target space, then map the predictions back to
# the original units using the scaler that was fitted on Y_train.
Y_pred_scaled = multi_output_sgd.predict(X_test)
Y_pred = scaler_Y.inverse_transform(Y_pred_scaled)

# Y_test was never scaled, so it is compared with Y_pred directly.
# The call below yields one scalar covering both target columns.
mse = mean_squared_error(Y_test, Y_pred)
print("Mean Squared Error:", mse, "\n\n")

# Show the actual targets next to the model's predictions.
print("Actual test values:\n", Y_test)
print("Predicted test value\n", Y_pred)

Mean Squared Error: 1.9591635399451306 


Actual test values:
        AveOccup  HousingPrice
20046  3.877437       0.47700
3024   2.679795       0.45800
15663  1.360332       5.00001
20484  3.444444       2.18600
9814   2.483645       2.78000
...         ...           ...
15362  2.988938       2.63300
16623  2.340426       2.66800
18086  2.790493       5.00001
2144   2.588608       0.72300
3665   3.729911       1.51500

[4128 rows x 2 columns]
Predicted test value
 [[3.28344128 0.74165914]
 [2.76890557 1.77882556]
 [3.60564421 2.70637856]
 ...
 [2.80227215 4.42976508]
 [3.32356765 1.19689101]
 [2.98660151 2.02029349]]
