In [195]:
# https://www.youtube.com/watch?v=vh2smjQyhp8&ab_channel=machinelearrrning
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset through scikit-learn's fetcher and
# unpack the pieces used below: feature matrix, target vector, column names.
data = fetch_california_housing()
X, y = data.data, data.target
features = data.feature_names
# Print the dataset's own description (attributes, units of the target).
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [134]:
# Put features and target side by side in one frame for inspection.
df = pd.DataFrame(X, columns=features).assign(target=y)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [135]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the full dataset (no train/test split here),
# so the metrics computed from its predictions are in-sample metrics.
lr = LinearRegression()
lr.fit(X=X, y=y)

In [136]:
# Predict on the same rows the model was fitted on.
pred = lr.predict(X=X)
pred

array([4.13164983, 3.97660644, 3.67657094, ..., 0.17125141, 0.31910524,
       0.51580363])

In [137]:
# Working frame for the metric walkthrough: one row per sample, with the
# true target next to the model's prediction.
pred_df = pd.DataFrame(data={'ground truth': y, 'prediction': pred})

pred_df

Unnamed: 0,ground truth,prediction
0,4.526,4.131650
1,3.585,3.976606
2,3.521,3.676571
3,3.413,3.241598
4,3.422,2.413587
...,...,...
20635,0.781,0.133656
20636,0.771,0.553528
20637,0.923,0.171251
20638,0.847,0.319105


In [138]:
# Signed residual per row: prediction minus ground truth
# (negative = the model under-predicted, positive = it over-predicted).
pred_df['difference'] = pred_df['prediction'].sub(pred_df['ground truth'])
pred_df

Unnamed: 0,ground truth,prediction,difference
0,4.526,4.131650,-0.394350
1,3.585,3.976606,0.391606
2,3.521,3.676571,0.155571
3,3.413,3.241598,-0.171402
4,3.422,2.413587,-1.008413
...,...,...,...
20635,0.781,0.133656,-0.647344
20636,0.771,0.553528,-0.217472
20637,0.923,0.171251,-0.751749
20638,0.847,0.319105,-0.527895


In [144]:
# Wrong way of calculating the average difference: averaging the *signed*
# residuals lets positive and negative errors cancel each other out, so the
# result is ~0 no matter how large the individual errors are.
pred_df["difference"].sum() / len(pred_df)

np.float64(4.040179044031267e-15)

In [149]:
# Absolute residual per row: dropping the sign stops errors from cancelling.
pred_df["abs"] = pred_df["difference"].abs()
pred_df

Unnamed: 0,ground truth,prediction,difference,abs
0,4.526,4.131650,-0.394350,0.394350
1,3.585,3.976606,0.391606,0.391606
2,3.521,3.676571,0.155571,0.155571
3,3.413,3.241598,-0.171402,0.171402
4,3.422,2.413587,-1.008413,1.008413
...,...,...,...,...
20635,0.781,0.133656,-0.647344,0.647344
20636,0.771,0.553528,-0.217472,0.217472
20637,0.923,0.171251,-0.751749,0.751749
20638,0.847,0.319105,-0.527895,0.527895


In [158]:
# Mean absolute error (MAE): the average of the absolute residuals.
print("MAE:", pred_df["abs"].mean())
# Same metric via scikit-learn; both lines should print the same number.
from sklearn.metrics import mean_absolute_error
print("MAE:", mean_absolute_error(y_true=pred_df["ground truth"], y_pred=pred_df["prediction"]))

MAE: 0.5311643817546465
MAE: 0.5311643817546465


In [165]:
# Mean squared error (MSE): the average of the squared residuals.
# Squaring removes the sign and weights large errors more heavily.
pred_df["diffSquared"] = pred_df["difference"]**2
diffSquaredMean = pred_df["diffSquared"].mean()
print("MSE:", diffSquaredMean)
# Same metric via scikit-learn; it must agree with the manual value above.
from sklearn.metrics import mean_squared_error
print("MSE:", mean_squared_error(pred_df["ground truth"], pred_df["prediction"]))

MSE: 0.5243209861846071
MSE: 0.5243209861846071


In [166]:
# Show the frame again, now including the diffSquared column from the MSE cell.
pred_df

Unnamed: 0,ground truth,prediction,difference,abs,diffSquared
0,4.526,4.131650,-0.394350,0.394350,0.155512
1,3.585,3.976606,0.391606,0.391606,0.153356
2,3.521,3.676571,0.155571,0.155571,0.024202
3,3.413,3.241598,-0.171402,0.171402,0.029378
4,3.422,2.413587,-1.008413,1.008413,1.016896
...,...,...,...,...,...
20635,0.781,0.133656,-0.647344,0.647344,0.419055
20636,0.771,0.553528,-0.217472,0.217472,0.047294
20637,0.923,0.171251,-0.751749,0.751749,0.565126
20638,0.847,0.319105,-0.527895,0.527895,0.278673


In [169]:
# RMSE (root mean squared error): the square root of the MSE, which brings
# the error back to the same units as the target.
pow(diffSquaredMean, 0.5)

np.float64(0.7241001216576387)

In [172]:
# Constant baseline: a "model" that predicts the mean of the ground truth
# for every row. It is the reference the regression is compared against.
target_mean = pred_df["ground truth"].mean()
pred_df["constant"] = target_mean
pred_df

Unnamed: 0,ground truth,prediction,difference,abs,diffSquared,constant
0,4.526,4.131650,-0.394350,0.394350,0.155512,2.068558
1,3.585,3.976606,0.391606,0.391606,0.153356,2.068558
2,3.521,3.676571,0.155571,0.155571,0.024202,2.068558
3,3.413,3.241598,-0.171402,0.171402,0.029378,2.068558
4,3.422,2.413587,-1.008413,1.008413,1.016896,2.068558
...,...,...,...,...,...,...
20635,0.781,0.133656,-0.647344,0.647344,0.419055,2.068558
20636,0.771,0.553528,-0.217472,0.217472,0.047294,2.068558
20637,0.923,0.171251,-0.751749,0.751749,0.565126,2.068558
20638,0.847,0.319105,-0.527895,0.527895,0.278673,2.068558


In [176]:
# MSE of the constant-mean baseline. Because the baseline is the target's
# mean, this equals the population variance of the target.
mseConst = mean_squared_error(y_true=pred_df["ground truth"], y_pred=pred_df["constant"])
mseConst

np.float64(1.3315503000818076)

In [182]:
# R2 (coefficient of determination), see
# https://en.wikipedia.org/wiki/Coefficient_of_determination
# Manual form: one minus the model's MSE relative to the baseline's MSE.
print("R2: ", 1 - diffSquaredMean / mseConst)
# Cross-check with scikit-learn; both lines should print the same number.
from sklearn.metrics import r2_score
print("R2: ", r2_score(y_true=pred_df["ground truth"], y_pred=pred_df["prediction"]))

R2:  0.6062326851998051
R2:  0.6062326851998051


In [189]:
# Toy example with deliberately terrible predictions, used to show that R2
# can be negative when a model does worse than predicting the mean.
y_true = list(range(5))
y_pred = [10, -20, -30, 40, -50]

df = pd.DataFrame(dict(y_true=y_true, y_pred=y_pred))

# "Dispersion" here is the population variance of the targets, i.e. the MSE
# of always predicting their mean.
dispersion = df["y_true"].sub(df["y_true"].mean()).pow(2).mean()
print("Dispersion:", dispersion)
# MSE of the toy predictions against the targets.
mse = df["y_true"].sub(df["y_pred"]).pow(2).mean()
print("MSE:", mse)

Dispersion: 2.0
MSE: 1170.0


In [193]:
# Manual R2 for the toy example: the predictions' MSE is far larger than the
# dispersion of the targets, so the score comes out strongly negative.
1 - mse / dispersion

np.float64(-584.0)

In [194]:
# scikit-learn's R2 on the same toy data, as a cross-check of the manual value.
r2_score(y_true=y_true, y_pred=y_pred)

-584.0