In [4]:
## Loading the required packages
import statsmodels.api as sm #for fitting machine learning models
import sklearn #for fitting machine learning models
from sklearn import datasets #dataset to be used in this example
import numpy as np #for mathematical formulations
import pandas as pd
from sklearn import linear_model

In [5]:
## Loading the Boston dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 --
# confirm the pinned scikit-learn version before re-running this cell on a fresh kernel.
boston = datasets.load_boston()

## Examining the description of the data.
# DESCR is a plain-text description, so print() is used to render its line breaks.
print(boston.DESCR)


Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [6]:
## Wrap the raw arrays in DataFrames: the explanatory variables in `df`, the response in `target`
target = pd.DataFrame(data=boston.target, columns=["MEDV"])  # single-column frame holding the target

df = pd.DataFrame(data=boston.data, columns=boston.feature_names)  # one column per feature name

In [7]:
## OLS with a single explanatory variable (RM, average number of rooms) and no constant
y = target["MEDV"]  # response variable
x = df["RM"]  # sole regressor; no constant column is added to it

# NOTE(review): with no constant in the model, statsmodels reports the uncentered
# R-squared, so this value is not directly comparable with the R-squared of a fit
# that includes an intercept -- confirm against the statsmodels documentation.
modelSM1 = sm.OLS(y, x).fit()  # least-squares estimation
modelSM1.summary()  # last expression of the cell, so the summary is rendered as its output

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Sat, 30 Jun 2018",Prob (F-statistic):,3.7399999999999996e-256
Time:,17:52:46,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [8]:
## OLS with a single explanatory variable (RM, average number of rooms) plus a constant
y = target["MEDV"]  # response variable
x = sm.add_constant(df["RM"])  # RM together with a constant column, so an intercept is estimated

modelSM2 = sm.OLS(y, x).fit()  # least-squares estimation
modelSM2.summary()  # last expression of the cell, so the summary is rendered as its output


0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,471.8
Date:,"Sat, 30 Jun 2018",Prob (F-statistic):,2.49e-74
Time:,17:52:47,Log-Likelihood:,-1673.1
No. Observations:,506,AIC:,3350.0
Df Residuals:,504,BIC:,3359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6706,2.650,-13.084,0.000,-39.877,-29.465
RM,9.1021,0.419,21.722,0.000,8.279,9.925

0,1,2,3
Omnibus:,102.585,Durbin-Watson:,0.684
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.449
Skew:,0.726,Prob(JB):,1.02e-133
Kurtosis:,8.19,Cond. No.,58.4


In [9]:
## OLS with two explanatory variables (RM and LSTAT) plus a constant
x = sm.add_constant(df[["RM", "LSTAT"]])  # two regressors and a constant column
y = target["MEDV"]  # response variable

modelSM3 = sm.OLS(y, x).fit()  # least-squares estimation
modelSM3.summary()  # last expression of the cell, so the summary is rendered as its output



0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.639
Model:,OLS,Adj. R-squared:,0.637
Method:,Least Squares,F-statistic:,444.3
Date:,"Sat, 30 Jun 2018",Prob (F-statistic):,7.0099999999999995e-112
Time:,17:52:48,Log-Likelihood:,-1582.8
No. Observations:,506,AIC:,3172.0
Df Residuals:,503,BIC:,3184.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.3583,3.173,-0.428,0.669,-7.592,4.875
RM,5.0948,0.444,11.463,0.000,4.222,5.968
LSTAT,-0.6424,0.044,-14.689,0.000,-0.728,-0.556

0,1,2,3
Omnibus:,145.712,Durbin-Watson:,0.834
Prob(Omnibus):,0.0,Jarque-Bera (JB):,457.69
Skew:,1.343,Prob(JB):,4.1100000000000003e-100
Kurtosis:,6.807,Cond. No.,202.0


In [10]:
## OLS on every explanatory variable in df, plus a constant
x = sm.add_constant(df)  # result is bound to x; df itself is not reassigned
y = target["MEDV"]  # response variable

model4 = sm.OLS(y, x).fit()  # least-squares estimation
model4.summary()  # last expression of the cell, so the summary is rendered as its output

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Sat, 30 Jun 2018",Prob (F-statistic):,6.95e-135
Time:,17:52:49,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4911,5.104,7.149,0.000,26.462,46.520
CRIM,-0.1072,0.033,-3.276,0.001,-0.171,-0.043
ZN,0.0464,0.014,3.380,0.001,0.019,0.073
INDUS,0.0209,0.061,0.339,0.735,-0.100,0.142
CHAS,2.6886,0.862,3.120,0.002,0.996,4.381
NOX,-17.7958,3.821,-4.658,0.000,-25.302,-10.289
RM,3.8048,0.418,9.102,0.000,2.983,4.626
AGE,0.0008,0.013,0.057,0.955,-0.025,0.027
DIS,-1.4758,0.199,-7.398,0.000,-1.868,-1.084

0,1,2,3
Omnibus:,178.029,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,782.015
Skew:,1.521,Prob(JB):,1.54e-170
Kurtosis:,8.276,Cond. No.,15100.0


In [11]:
## Fitting a two-variable model (RM and LSTAT) using sklearn
# Same regressors as modelSM3. No constant column is added here;
# NOTE(review): LinearRegression presumably fits the intercept itself by default -- confirm.
x = df[["RM","LSTAT"]]
y = target["MEDV"]

lm = linear_model.LinearRegression().fit(x,y)


In [12]:
## R-squared of the sklearn fit, evaluated on the same x and y the model was trained on
lm.score(X=x, y=y)


0.6385616062603403

In [13]:
## Obtain the fitted slope coefficients, in the column order of x (RM, then LSTAT)
lm.coef_


array([ 5.09478798, -0.64235833])

In [14]:
## Obtain the fitted intercept (the sklearn counterpart of the statsmodels `const` term)
lm.intercept_

-1.3582728118745315