In [2]:
import numpy as np 
import pandas as pd

In [3]:
from sklearn.datasets import load_boston #import dataset

In [9]:
boston_dataset = load_boston() #instantiate the dataset

In [11]:
print(boston_dataset['DESCR']) #inbuilt description function to describe the dataset

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [12]:
#inbuilt feature names
print(boston_dataset['feature_names'])

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [14]:
#store data in a dataframe
df_boston = pd.DataFrame(boston_dataset.data)

#set features as columns 
df_boston.columns = boston_dataset.feature_names

In [15]:
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [16]:
df_boston.shape

(506, 13)

In [19]:
df_boston['Price']= boston_dataset.target

In [21]:
#set x and y
X = boston_dataset.data
Y = boston_dataset.target

In [22]:
#import linear regression
from sklearn.linear_model import LinearRegression

In [23]:
linReg = LinearRegression()

In [25]:
#fit data
linReg.fit(X, Y)

LinearRegression()

In [27]:
print('The estimated intercept is: %.2f' %linReg.intercept_)

The estimated intercept is: 36.46


In [33]:
print('The estimated coefficient is: %d' %len(linReg.coef_))

The estimated coefficient is: 13


In [34]:
#train model
from sklearn.model_selection import train_test_split

In [35]:
#split into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [38]:
print(X_train.shape,Y_train.shape,X_test.shape, Y_test.shape)


(379, 13) (379,) (127, 13) (127,)


In [39]:
#fit model
linReg.fit(X_train, Y_train)

#predict
linReg.predict(X_test)

array([34.36918006, 14.40746385, 16.93225233, 16.66710533, 20.5068027 ,
       20.02809279, 35.03640119, 21.14307023, 25.40181745, 35.37747644,
       13.08846079, 20.94037373, 13.62442185, 13.24217382, 20.57303394,
       13.41368263, 34.61749136, 33.68420556, 24.55025485, 15.27970428,
       38.17762828, 20.88468293, 17.56092436, 16.90364109, 23.51946767,
       31.6722121 , 22.61546571, 17.33200237, 24.24881083, 18.73167672,
       21.11873431, 20.11626171, 13.21275638, 13.87323721, 19.64830759,
       44.39068379, 17.98296072, 20.9455033 , 24.53245569, 16.88888501,
       24.89878025, 32.15675031, 18.70422287,  9.79720008, 26.37138132,
        7.70320893, 10.43463953, 35.49318953, 28.46587206, 32.34249502,
       29.07487802,  6.17782034, 15.42512792, 23.57074747, 18.64827096,
       27.97373986, 22.59923717, 15.00151856, 22.27024406, 16.49973152,
       35.47908409, 25.93072437, 35.35940153, 14.97837902, 22.5137652 ,
       20.19432523, 26.80761275,  8.22210534,  7.42719064, 25.66

In [50]:
# calculate MSE
print('Mean Squared Error %.2f' % np.mean((linReg.predict(X_test)-Y_test)**2))

Mean Squared Error 21.43


In [52]:
#calculate variance (closer to 1 the better)
print('Variance Score %.2f' % linReg.score(X_test, Y_test))

Variance Score 0.77
