In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Load dataset
housing = fetch_california_housing()
df_x = pd.DataFrame(housing.data, columns=housing.feature_names)
df_y = pd.DataFrame(housing.target, columns=["Target"])

# Split dataset (67% train, 33% test)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42)

# Train Linear Regression model
reg = linear_model.LinearRegression()
reg.fit(x_train, y_train)

# Coefficients
print("Coefficients:", reg.coef_)

# Predict on test data
y_pred = reg.predict(x_test)
print("Predictions:", y_pred)

# Compare prediction vs actual
print("Predicted:", y_pred[2])
print("Actual:", y_test.iloc[2])

# Model evaluation - Mean Squared Error
print("MSE (numpy):", np.mean((y_pred - y_test.values.flatten()) ** 2))
print("MSE (sklearn):", mean_squared_error(y_test, y_pred))


Coefficients: [[ 4.44870466e-01  9.55004561e-03 -1.21991503e-01  7.79144696e-01
  -7.68990808e-08 -3.29948505e-03 -4.19131153e-01 -4.34103468e-01]]
Predictions: [[0.72563462]
 [1.76650223]
 [2.70545812]
 ...
 [1.25803135]
 [1.66673014]
 [2.25826279]]
Predicted: [2.70545812]
Actual: Target    5.00001
Name: 15663, dtype: float64
MSE (numpy): 2.1456921610957296
MSE (sklearn): 0.5369686543372468
