## import required packages

In [1]:
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

  from pandas.core import datetools


## import the Boston house-pricing dataset, which ships with the sklearn package

In [2]:
# Load the Boston house-price dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older release before re-running this cell.
from sklearn.datasets import load_boston
boston=load_boston()
# Display the type of the container returned by the loader.
type(boston)

sklearn.datasets.base.Bunch

In [3]:
# Display the feature names that accompany the data array.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='<U7')

In [4]:
# Raw predictors and target straight from the loaded dataset object.
X, Y = boston.data, boston.target

In [5]:
# Wrap the raw predictor array in a DataFrame; no column labels are passed here,
# so the columns get default integer labels.
bos=pd.DataFrame(boston.data)

In [6]:
# Preview the first rows; columns still carry the default integer labels.
print(bos.head())

        0     1     2    3      4      5     6       7    8      9     10  \
0  0.00632  18.0  2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
1  0.02731   0.0  7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
2  0.02729   0.0  7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8   
3  0.03237   0.0  2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.7   
4  0.06905   0.0  2.18  0.0  0.458  7.147  54.2  6.0622  3.0  222.0  18.7   

       11    12  
0  396.90  4.98  
1  396.90  9.14  
2  392.83  4.03  
3  394.63  2.94  
4  396.90  5.33  


In [7]:
# Replace the integer column labels with the dataset's feature names.
bos.columns=boston.feature_names

In [8]:
# Append the regression target as a PRICE column alongside the predictors.
bos['PRICE']=boston.target

In [9]:
# Preview the first rows again, now with named columns and the PRICE target.
print(bos.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  PRICE  
0     15.3  396.90   4.98   24.0  
1     17.8  396.90   9.14   21.6  
2     17.8  392.83   4.03   34.7  
3     18.7  394.63   2.94   33.4  
4     18.7  396.90   5.33   36.2  


## remove some features based on their Pearson correlation (as done in the previous ipynb notebook)

In [10]:
# Discard the five predictors that were ruled out; all are removed in one call.
bos = bos.drop(['ZN', 'CHAS', 'RM', 'DIS', 'B'], axis=1)


In [11]:
# Predictors are every remaining column except PRICE; PRICE itself is the target.
X, Y = bos.drop('PRICE', axis=1), bos['PRICE']

## split the full dataset into training and test sets; 33% of the rows are held out for testing

In [12]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20.
# Prefer its replacement, `sklearn.model_selection`, and fall back to the old module
# only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=5)

# Report the shape of each partition as a sanity check.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

(339, 8)
(167, 8)
(339,)
(167,)




In [13]:
# Preview the frame after the feature drops: the remaining predictors plus PRICE.
bos.head()

Unnamed: 0,CRIM,INDUS,NOX,AGE,RAD,TAX,PTRATIO,LSTAT,PRICE
0,0.00632,2.31,0.538,65.2,1.0,296.0,15.3,4.98,24.0
1,0.02731,7.07,0.469,78.9,2.0,242.0,17.8,9.14,21.6
2,0.02729,7.07,0.469,61.1,2.0,242.0,17.8,4.03,34.7
3,0.03237,2.18,0.458,45.8,3.0,222.0,18.7,2.94,33.4
4,0.06905,2.18,0.458,54.2,3.0,222.0,18.7,5.33,36.2


In [14]:
# Number of training rows.
X_train.shape[0]

339

## form the x (design) matrix for training the equation y=xb, to obtain good b values (a good model)

In [15]:
# Start the training design matrix with the CRIM column as an (n_rows, 1) matrix.
# reshape(-1, 1) infers the row count from the data, so the cell keeps working if the
# split proportions change (a hard-coded 339 only fits the current 339-row training set).
x = np.matrix(X_train['CRIM'].values.reshape(-1, 1))

In [16]:
# Build the complete training feature block (CRIM first, then the other seven
# predictors, in the same column order as before) directly from X_train.
# Selecting the columns in one step removes the hard-coded row count of 339 and makes
# the cell idempotent: re-running it no longer appends duplicate columns onto x.
feature_cols = ['CRIM', 'INDUS', 'NOX', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
x = np.matrix(X_train[feature_cols].values)


In [17]:
# Prepend a column of ones so the first coefficient of b acts as the intercept.
z = np.matrix(np.ones(len(X_train), dtype=int)).reshape(-1, 1)
x = np.concatenate((z, x), axis=1)
# Show the first four rows of the finished training design matrix.
x[:4]

matrix([[  1.00000000e+00,   1.11604000e+01,   1.81000000e+01,
           7.40000000e-01,   9.46000000e+01,   2.40000000e+01,
           6.66000000e+02,   2.02000000e+01,   2.32700000e+01],
        [  1.00000000e+00,   5.66000000e-02,   3.41000000e+00,
           4.89000000e-01,   8.63000000e+01,   2.00000000e+00,
           2.70000000e+02,   1.78000000e+01,   5.50000000e+00],
        [  1.00000000e+00,   4.55587000e+00,   1.81000000e+01,
           7.18000000e-01,   8.79000000e+01,   2.40000000e+01,
           6.66000000e+02,   2.02000000e+01,   7.12000000e+00],
        [  1.00000000e+00,   1.02900000e-01,   4.93000000e+00,
           4.28000000e-01,   5.29000000e+01,   6.00000000e+00,
           3.00000000e+02,   1.66000000e+01,   1.12200000e+01]])

## form a proper y matrix for training of equation y=xb to get proper b values(good model)

In [18]:
# Training targets as a single-column matrix (one row per training sample).
y = np.matrix(Y_train.values.reshape(-1, 1))

In [19]:
# Show the first four training targets.
y[:4]

matrix([[ 13.4],
        [ 23.6],
        [ 27.5],
        [ 22.2]])

## $b = (x^{T} x)^{-1} x^{T} y$

In [20]:
# Transpose of the training design matrix.
x_transp = x.T

In [21]:
# Matrix product of the transposed design matrix with the design matrix.
x_transp_x = np.dot(x_transp, x)

In [22]:
# Invert the square matrix x_transp_x; inv raises LinAlgError if it is singular.
from numpy.linalg import inv
x_transp_x_inv=inv(x_transp_x)

In [23]:
# Least-squares coefficients from the normal equations (x'x) b = x'y.
# Solving the linear system is mathematically equal to (x'x)^-1 x'y, but it avoids
# multiplying by an explicitly inverted matrix, which is slower and loses accuracy
# when x'x is ill-conditioned. The result is a single-column matrix of coefficients.
b = np.linalg.solve(x_transp_x, x_transp * y)

## preparing x matrix for testing of y=xb

In [24]:
# Rebuild x for the held-out rows: the same eight predictors, in the same column
# order used for training, taken from X_test in a single selection.
x = np.matrix(
    X_test[['CRIM', 'INDUS', 'NOX', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']].values
)

# Prepend the intercept column of ones, mirroring the training design matrix.
z = np.matrix(np.ones(len(X_test), dtype=int)).reshape(-1, 1)
x = np.concatenate((z, x), axis=1)
# Show the first four rows of the test design matrix.
x[:4]


matrix([[  1.00000000e+00,   3.82140000e-01,   6.20000000e+00,
           5.04000000e-01,   8.65000000e+01,   8.00000000e+00,
           3.07000000e+02,   1.74000000e+01,   3.13000000e+00],
        [  1.00000000e+00,   3.61500000e-02,   4.95000000e+00,
           4.11000000e-01,   2.34000000e+01,   4.00000000e+00,
           2.45000000e+02,   1.92000000e+01,   4.70000000e+00],
        [  1.00000000e+00,   4.68400000e-02,   3.41000000e+00,
           4.89000000e-01,   6.61000000e+01,   2.00000000e+00,
           2.70000000e+02,   1.78000000e+01,   8.81000000e+00],
        [  1.00000000e+00,   1.11081000e+01,   1.81000000e+01,
           6.68000000e-01,   1.00000000e+02,   2.40000000e+01,
           6.66000000e+02,   2.02000000e+01,   3.47700000e+01]])

In [25]:
## preparing y matrix for testing of y=xb

In [26]:
# Held-out targets as a single-column matrix (one row per test sample).
y = np.matrix(Y_test.values.reshape(-1, 1))

In [27]:
# Number of test targets.
y.shape[0]

167

In [28]:
# Residuals on the test set: actual targets minus the model's predictions x·b.
e = y - np.dot(x, b)

In [29]:
# Number of residuals (one per test sample).
e.shape[0]

167

## finally, calculate the root-mean-square error (RMSE) on the test set

In [30]:
# Root-mean-square error of the test residuals, computed in one vectorized step.
# np.asarray drops the matrix type so `** 2` is an elementwise square; on np.matrix
# `**` means matrix power, which only worked in a per-row loop because each slice was 1x1.
# `error` holds the sum of squared residuals as a plain float, so the printed value is
# a scalar rather than a nested [[...]] matrix.
error = float(np.sum(np.asarray(e) ** 2))
rmse = np.sqrt(error / len(e))
print("error is ", rmse)

error is  [[ 6.12067267]]
