In [66]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.metrics import r2_score

In [67]:
# Load the California housing dataset.
data_set = fetch_california_housing()

# Wrap the raw arrays in pandas objects: predictors become a DataFrame labelled
# with the dataset's own feature names, the response becomes a Series.
features = pd.DataFrame(data=data_set.data, columns=data_set.feature_names)
target = pd.Series(data=data_set.target)

# Quick look at the first rows of each object and at the available column names.
(features.head(), target.head(), features.columns)

(   MedInc  HouseAge  AveRooms  ...  AveOccup  Latitude  Longitude
 0  8.3252      41.0  6.984127  ...  2.555556     37.88    -122.23
 1  8.3014      21.0  6.238137  ...  2.109842     37.86    -122.22
 2  7.2574      52.0  8.288136  ...  2.802260     37.85    -122.24
 3  5.6431      52.0  5.817352  ...  2.547945     37.85    -122.25
 4  3.8462      52.0  6.281853  ...  2.181467     37.85    -122.25
 
 [5 rows x 8 columns],
 0    4.526
 1    3.585
 2    3.521
 3    3.413
 4    3.422
 dtype: float64,
 Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
        'Latitude', 'Longitude'],
       dtype='object'))

In [68]:
# Feature selection: pairwise Pearson correlation between the predictors,
# used to spot strongly related (redundant) columns.
features.corr(method="pearson")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0


In [69]:
# Combine predictors and response into one frame; the response column is
# labelled "target" so it can be referenced by name later.
df = pd.concat(objs=[features, target.rename("target")], axis="columns")
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [70]:
# Correlation of every column with the response ("target").
df.corr().loc[:, "target"]

MedInc        0.688075
HouseAge      0.105623
AveRooms      0.151948
AveBedrms    -0.046701
Population   -0.024650
AveOccup     -0.023737
Latitude     -0.144160
Longitude    -0.045967
target        1.000000
Name: target, dtype: float64

In [87]:
# Select the independent variables (predictors) and the dependent variable
# (response) to use for the regression.
#
# Columns are picked by name instead of by position: the previous positional
# selection ([0, 1, 2, 3, 4, -3] and -1) depended on the exact column order of
# `df` (including the appended target column) and would silently pick different
# columns if that order ever changed. The names below are the same columns the
# positional indices resolved to.
feature_columns = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "Latitude"]
x = df.loc[:, feature_columns].to_numpy()
y = df.loc[:, "target"].to_numpy()

# Show the first predictor row and the first five response values.
x[:1], y[:5]

(array([[  8.3252    ,  41.        ,   6.98412698,   1.02380952,
         322.        ,  37.88      ]]),
 array([4.526, 3.585, 3.521, 3.413, 3.422]))

In [88]:
# Split the data: 20% of the rows are held out for testing, and the fixed
# random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)


In [89]:
# Train the model: fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [90]:
# Inspect the held-out feature matrix that the predictions below are made on.
x_test

array([[5.27830000e+00, 1.20000000e+01, 4.46601942e+00, 9.80582524e-01,
        4.05000000e+02, 3.41600000e+01],
       [3.97220000e+00, 2.70000000e+01, 4.61927145e+00, 1.09635723e+00,
        1.87700000e+03, 3.75500000e+01],
       [4.50940000e+00, 1.20000000e+01, 4.42638037e+00, 1.09202454e+00,
        1.91300000e+03, 3.36700000e+01],
       ...,
       [3.97730000e+00, 3.30000000e+01, 4.71662763e+00, 9.60187354e-01,
        1.35400000e+03, 3.79800000e+01],
       [9.33700000e-01, 4.50000000e+01, 4.71232877e+00, 1.13972603e+00,
        1.38200000e+03, 3.39800000e+01],
       [7.83750000e+00, 4.00000000e+01, 6.59375000e+00, 1.01785714e+00,
        6.09000000e+02, 3.42100000e+01]])

In [93]:
# Predict the response for the held-out rows.
y_pred = model.predict(X=x_test)

In [94]:
# Residuals: actual minus predicted, so a positive value means the model
# under-predicted.
error = y_test - y_pred

# Side-by-side comparison table. The columns get descriptive labels; building
# the table from three unnamed frames left every column labelled 0, which made
# the output hard to read and the columns impossible to address by name.
pd.DataFrame({"predicted": y_pred, "actual": y_test, "error": error})

Unnamed: 0,0,0.1,0.2
0,2.641693,1.67400,-0.967693
1,2.194279,3.54100,1.346721
2,2.386370,1.60200,-0.784370
3,1.833158,1.40800,-0.425158
4,1.955666,1.07800,-0.877666
...,...,...,...
4123,0.948856,1.06300,0.114144
4124,3.325331,3.93700,0.611669
4125,2.130771,1.31300,-0.817771
4126,1.025095,0.92300,-0.102095


In [95]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.5417943779002213