In [3]:
import numpy as np
import pandas as pd
import sklearn


In [6]:
#he Boston housing prices dataset has an ethical problem. You can refer to
#the documentation of this function for further details.
#The scikit-learn maintainers therefore strongly discourage the use of this
#dataset unless the purpose of the code is to study and educate about
#ethical issues in data science and machine learning.

from sklearn.datasets import fetch_california_housing
df = fetch_california_housing()

In [7]:
df.keys()


dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [10]:
print(df.feature_names) #info about dataset

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [11]:
print(df.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [13]:
print(df.target)

[4.526 3.585 3.521 ... 0.923 0.847 0.894]


In [18]:
california = pd.DataFrame(df.data, columns=df.feature_names)
fetch_california_housing()

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [19]:
california['MedInc'] = df.target
california.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,4.526,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,3.585,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,3.521,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,3.413,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.422,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [20]:
california.isnull()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
20635,False,False,False,False,False,False,False,False
20636,False,False,False,False,False,False,False,False
20637,False,False,False,False,False,False,False,False
20638,False,False,False,False,False,False,False,False


In [21]:
california.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [23]:
from sklearn.model_selection import train_test_split
X = california.drop('MedInc', axis=1)
Y = california['MedInc']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(17544, 7)
(3096, 7)
(17544,)
(3096,)


**Now let us import the linear regression model from sklearn and train it on the training dataset**


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error





In [27]:
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

LinearRegression()

In [30]:
y_train_predict = lin_model.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))

print('The model performance for training set')
print('RMSE is {}'.format(rmse))
print('\n')

#on testing set

y_test_predict = lin_model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

print("the model performance fortesting set")
print('RMSE is {}'.format(rmse))

The model performance for training set
RMSE is 0.8914607591901219


the model performance fortesting set
RMSE is 0.9098098485839422
