In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import datasets

In [2]:
data = sklearn.datasets.fetch_california_housing(as_frame=True).data
target = sklearn.datasets.fetch_california_housing(as_frame=True).target

In [3]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
target.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

# Statsmodels

In [5]:
import statsmodels.api as sm

In [6]:
modelo = sm.OLS(target,data)
res = modelo.fit()

In [7]:
res.summary()

0,1,2,3
Dep. Variable:,MedHouseVal,R-squared (uncentered):,0.892
Model:,OLS,Adj. R-squared (uncentered):,0.892
Method:,Least Squares,F-statistic:,21370.0
Date:,"Sun, 29 Oct 2023",Prob (F-statistic):,0.0
Time:,21:53:01,Log-Likelihood:,-24087.0
No. Observations:,20640,AIC:,48190.0
Df Residuals:,20632,BIC:,48250.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.5135,0.004,120.594,0.000,0.505,0.522
HouseAge,0.0157,0.000,33.727,0.000,0.015,0.017
AveRooms,-0.1825,0.006,-29.673,0.000,-0.195,-0.170
AveBedrms,0.8651,0.030,28.927,0.000,0.806,0.924
Population,7.792e-06,5.09e-06,1.530,0.126,-2.19e-06,1.78e-05
AveOccup,-0.0047,0.001,-8.987,0.000,-0.006,-0.004
Latitude,-0.0639,0.004,-17.826,0.000,-0.071,-0.057
Longitude,-0.0164,0.001,-14.381,0.000,-0.019,-0.014

0,1,2,3
Omnibus:,4353.392,Durbin-Watson:,0.909
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.489
Skew:,1.069,Prob(JB):,0.0
Kurtosis:,6.436,Cond. No.,10300.0


In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data)
normalized_data = pd.DataFrame(normalized_data, columns = data.columns)
normalized_data["intercept"] = 1

modelo = sm.OLS(target,normalized_data)
res = modelo.fit()
res.summary()

0,1,2,3
Dep. Variable:,MedHouseVal,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,3970.0
Date:,"Sun, 29 Oct 2023",Prob (F-statistic):,0.0
Time:,21:53:01,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45270.0
Df Residuals:,20631,BIC:,45340.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.8296,0.008,104.054,0.000,0.814,0.845
HouseAge,0.1188,0.006,21.143,0.000,0.108,0.130
AveRooms,-0.2655,0.015,-18.235,0.000,-0.294,-0.237
AveBedrms,0.3057,0.013,22.928,0.000,0.280,0.332
Population,-0.0045,0.005,-0.837,0.402,-0.015,0.006
AveOccup,-0.0393,0.005,-7.769,0.000,-0.049,-0.029
Latitude,-0.8999,0.015,-58.541,0.000,-0.930,-0.870
Longitude,-0.8705,0.015,-57.682,0.000,-0.900,-0.841
intercept,2.0686,0.005,410.326,0.000,2.059,2.078

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,6.67


# Scikit-learn

In [10]:
data.shape

(20640, 8)

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Xtrain, Xval, Ytrain, Yval = train_test_split(data,target, test_size= 0.3, random_state=0)
print(Xtrain.shape, Xval.shape, Ytrain.shape, Yval.shape)

modelo = LinearRegression()
modelo.fit(Xtrain, Ytrain)

p = modelo.predict(Xval)

(14448, 8) (6192, 8) (14448,) (6192,)


In [12]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Yval,p))

0.7369864089681186

In [13]:
modelo.coef_

array([ 4.46773975e-01,  9.18409990e-03, -1.18116775e-01,  6.42290879e-01,
       -9.37026507e-06, -4.08535934e-03, -4.09023312e-01, -4.23419564e-01])

# Scikit-learn standardizado

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Xtrain, Xval, Ytrain, Yval = train_test_split(data,target, test_size= 0.3, random_state=0)
print(Xtrain.shape, Xval.shape, Ytrain.shape, Yval.shape)

scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xval_scaled = scaler.fit_transform(Xval)

modelo = LinearRegression()
modelo.fit(Xtrain_scaled, Ytrain)

p = modelo.predict(Xval_scaled)

(14448, 8) (6192, 8) (14448,) (6192,)


In [15]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Yval,p))

0.7366446723197566

In [16]:
modelo.coef_

array([ 0.84489085,  0.1156853 , -0.27019687,  0.29078838, -0.0107715 ,
       -0.02805796, -0.8753289 , -0.84959869])