In [1]:
from sklearn.datasets import fetch_california_housing

In [3]:
df = fetch_california_housing(as_frame=True)

In [4]:
data = df.frame

In [5]:
data.isna().sum().sum()

0

In [7]:
data.duplicated().sum().sum()

0

In [10]:
X = data.drop(columns=df.target_names)
y = data[df.target_names]

In [9]:
X.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [11]:
y.head(5)

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 42
)

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge

In [21]:
OLS = Pipeline([
    ("Scaler", StandardScaler()),
    ("LinReg", LinearRegression())
])

In [23]:
Ridge = Pipeline([
    ("Scaler", StandardScaler()),
    ("Ridge", Ridge(alpha=100))
])

In [31]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
pd.set_option("display.float_format", "{:.4f}".format)

In [38]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):

    model.fit(X_train, y_train) 

    y_test_pred = model.predict(X_test)
    
    #Testing Metrics
    sse_test = np.sum((y_test.to_numpy() - y_test_pred)**2)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)
    
    results = pd.DataFrame([
        {"Model":name, "SSE":sse_test, "MSE":mse_test, "RMSE": rmse_test, "R^2": r2_test}
    ]).set_index(["Model"])
    
    return results;

In [39]:
evaluate_model("OLS", OLS, X_train, y_train, X_test,y_test)

Unnamed: 0_level_0,SSE,MSE,RMSE,R^2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OLS,2294.7205,0.5559,0.7456,0.5758


In [40]:
evaluate_model("Ridge", Ridge, X_train, y_train, X_test,y_test)

Unnamed: 0_level_0,SSE,MSE,RMSE,R^2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ridge,36097087.8082,0.5533,0.7438,0.5778


In [41]:
pd.concat([
    evaluate_model("OLS", OLS, X_train, y_train, X_test,y_test),
    evaluate_model("Ridge", Ridge, X_train, y_train, X_test,y_test)
])

Unnamed: 0_level_0,SSE,MSE,RMSE,R^2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OLS,2294.7205,0.5559,0.7456,0.5758
Ridge,36097087.8082,0.5533,0.7438,0.5778
