In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the house sales data set; the file is tab-delimited despite its .csv extension.
url = (
    'https://raw.githubusercontent.com/gedeck/'
    'practical-statistics-for-data-scientists/master/data/house_sales.csv'
)
df = pd.read_csv(url, delimiter='\t')
# Bare last expression so the notebook renders the frame as a rich table.
df

Unnamed: 0,DocumentDate,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,2014-09-16,280000,1000102,Multiplex,2014-09-01,405100,0.930836,300805.0,2,9373,...,3.00,6,7,1991,0,0,70000,229000,98002,False
2,2006-06-16,1000000,1200013,Single Family,2006-06-01,404400,0.929228,1076162.0,1,20156,...,3.75,4,10,2005,0,0,203000,590000,98166,True
3,2007-01-29,745000,1200019,Single Family,2007-01-01,425600,0.977941,761805.0,1,26036,...,1.75,4,8,1947,0,0,183000,275000,98166,False
4,2008-02-25,425000,2800016,Single Family,2008-02-01,418400,0.961397,442065.0,1,8618,...,3.75,5,7,1966,0,0,104000,229000,98168,False
5,2013-03-29,240000,2800024,Single Family,2013-03-01,351600,0.807904,297065.0,1,8620,...,1.75,4,7,1948,0,0,104000,205000,98168,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27057,2011-04-08,325000,9842300710,Single Family,2011-04-01,318700,0.732307,443803.0,1,5468,...,1.75,3,7,1951,0,0,201000,172000,98126,False
27058,2007-09-28,1580000,9845500010,Single Family,2007-09-01,433500,0.996094,1586196.0,1,23914,...,4.50,4,11,2000,0,1,703000,951000,98040,False
27061,2012-07-09,165000,9899200010,Single Family,2012-07-01,325300,0.747472,220744.0,1,11170,...,1.00,4,6,1971,0,0,92000,130000,98055,False
27062,2006-05-26,315000,9900000355,Single Family,2006-05-01,400600,0.920496,342207.0,1,6223,...,2.00,3,7,1939,0,0,103000,212000,98166,False


In [3]:
# Model inputs: the target column and the feature columns used as regressors.
outcome = 'AdjSalePrice'
predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
x, y = df[predictors], df[outcome]
# Display both shapes to confirm the feature matrix and the target have matching row counts.
(x.shape, y.shape)

((22687, 5), (22687,))

In [4]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of y on the predictor columns in x. fit() returns the
# estimator itself, so construction and fitting are chained and the
# assignment produces no cell output.
house_lm = LinearRegression().fit(x, y)

In [5]:
# Summarise the fitted model: the intercept first, then one aligned row per predictor.
summary_lines = [f'Intercept: {house_lm.intercept_:.3f}', 'Coefficients:']
for feature, weight in zip(predictors, house_lm.coef_):
    # Left-justify the name and right-justify the value so the columns line up.
    summary_lines.append(f'    {feature:<15s}: {weight:>12.5f}')
print('\n'.join(summary_lines))

Intercept: -521871.368
Coefficients:
    SqFtTotLiving  :    228.83060
    SqFtLot        :     -0.06047
    Bathrooms      : -19442.84040
    Bedrooms       : -47769.95519
    BldgGrade      : 106106.96308


In [6]:
# In-sample predictions: the model is applied to the same x it was fitted on,
# so the metrics computed from y_pred describe training fit, not held-out error.
y_pred = house_lm.predict(x)

$$
R^2 = \frac{\text{Variance Explained by Model}}{\text{Total Variance}}=1-\frac{SSE}{SST}
$$
where
$$\begin{array}{lll}
SST&=&\sum(y_i-\bar{y})^2\\
SSE&=&\sum(y_i-\hat{y}_i)^2\\
\end{array}$$

See the Wikipedia article on the [coefficient of determination](https://en.wikipedia.org/wiki/Coefficient_of_determination).

In [None]:
# Import only what this cell uses; mean_squared_error is imported by the RMSE cell itself.
from sklearn.metrics import r2_score

# Coefficient of determination of the fitted model, computed from the
# observed outcomes y and the model's predictions y_pred.
r2 = r2_score(y, y_pred)
print(f'r2   : {r2:>6.4f}')

$$
\begin{array}{lll}
\mbox{MAE}&=&\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|\\
\mbox{MSE}&=&\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2\\
\mbox{RMSE}&=&\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}
\end{array}
$$

In [7]:
# Import only what this cell uses; r2_score is imported by the R^2 cell itself.
from sklearn.metrics import mean_squared_error

# Root mean squared error: the square root of the MSE, which puts the error
# back in the same units as the outcome y.
RMSE = np.sqrt(mean_squared_error(y, y_pred))
print(f'RMSE : {RMSE:>6.0f}')

RMSE : 261220
r2   : 0.5406


# Reference

[practical-statistics-for-data-scientists](https://github.com/gedeck/practical-statistics-for-data-scientists)