In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm

# Formulation

## First Stage

$\text{avexpr} = \beta_0 + \beta_1 \, \text{logem4} + \epsilon$

Recall that the **instrument** is `logem4`.

In [2]:
# Import the data and keep only the base sample (baseco == 1); per the
# original note this is meant to ensure we have no missing values.
# .copy() makes df4 an independent frame, so the column assignments below
# write to df4 itself rather than to a slice of the frame that was read in
# (avoids pandas' SettingWithCopyWarning).
df4 = pd.read_stata('https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta')
df4 = df4[df4['baseco'] == 1].copy()

# Add a constant column so the regressions include an intercept
df4['const'] = 1  # the scalar is broadcast to every row

# First stage: regress the endogenous variable (avexpr) on the instrument
# (logem4), then print the summary
results_fs = sm.OLS(df4['avexpr'],              # y: the endogenous regressor
                    df4[['const', 'logem4']],  # X: intercept + instrument
                    missing='drop').fit()
print(results_fs.summary())

                            OLS Regression Results                            
Dep. Variable:                 avexpr   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     22.95
Date:                Sun, 04 Mar 2018   Prob (F-statistic):           1.08e-05
Time:                        18:41:06   Log-Likelihood:                -104.83
No. Observations:                  64   AIC:                             213.7
Df Residuals:                      62   BIC:                             218.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.3414      0.611     15.296      0.0

# Second Stage

In [3]:
# In-sample predictions from the first stage (the same numbers as
# results_fs.fittedvalues); stored as a column for use in the second stage.
avexpr_hat = results_fs.predict()
df4['predicted_avexpr'] = avexpr_hat

In [4]:
# Sanity check: the stored predictions equal the model's fittedvalues element by element.
bool((df4['predicted_avexpr'].values == results_fs.fittedvalues).all())

True

In [5]:
# Second stage: regress the outcome (logpgp95) on an intercept and the
# first-stage fitted values of avexpr.
results_ss = sm.OLS(
    endog=df4['logpgp95'],
    exog=df4[['const', 'predicted_avexpr']],
).fit()
print(results_ss.summary())

                            OLS Regression Results                            
Dep. Variable:               logpgp95   R-squared:                       0.477
Model:                            OLS   Adj. R-squared:                  0.469
Method:                 Least Squares   F-statistic:                     56.60
Date:                Sun, 04 Mar 2018   Prob (F-statistic):           2.66e-10
Time:                        18:41:06   Log-Likelihood:                -72.268
No. Observations:                  64   AIC:                             148.5
Df Residuals:                      62   BIC:                             152.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                1.9097      0.823  

## Warning

The second-stage regression results give us an unbiased and consistent estimate of the effect of institutions on economic outcomes

The result suggests a stronger positive relationship than what the OLS results indicated

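Note that while our parameter estimates are correct, our standard errors are not: the second-stage OLS computes its residuals from `predicted_avexpr` rather than from the actual `avexpr`. For this reason, computing 2SLS ‘manually’ (in stages with OLS) is not recommended.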


## Note: this requires installing the `linearmodels` package, which extends `statsmodels`.

https://bashtage.github.io/linearmodels/doc/

In [6]:
from linearmodels.iv import IV2SLS

In [7]:
# Specify the IV model: logpgp95 is the outcome, const is the only exogenous
# regressor, avexpr is the endogenous regressor and logem4 its instrument.
iv_model = IV2SLS(
    dependent=df4['logpgp95'],
    exog=df4['const'],
    endog=df4['avexpr'],
    instruments=df4['logem4'],
)

# Fit with the 'unadjusted' covariance estimator and show the results table.
iv = iv_model.fit(cov_type='unadjusted')
print(iv.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:               logpgp95   R-squared:                      0.1870
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1739
No. Observations:                  64   F-statistic:                    37.568
Date:                Sun, Mar 04 2018   P-value (F-stat)                0.0000
Time:                        18:41:06   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.9097     1.0106     1.8897     0.0588     -0.0710      3.8903
avexpr         0.9443     0.1541     6.1293     0.00

## Notice how the estimate is identical; however, the standard error is now correct. Never manually calculate 2SLS.

In [8]:
# Coefficient on the endogenous regressor. iv.params is a label-indexed
# Series (const, avexpr), so look the value up by name: an integer key such
# as [-1] relies on pandas' deprecated positional fallback.
iv.params['avexpr']

0.94427938515480037

## Here is how to do the same thing in statsmodels!

http://www.statsmodels.org/stable/generated/statsmodels.sandbox.regression.gmm.IV2SLS.html?highlight=2sls#statsmodels.sandbox.regression.gmm.IV2SLS

In [9]:
from statsmodels.sandbox.regression.gmm import IV2SLS as IV

In [34]:
# Same model with statsmodels' sandbox IV2SLS. Here `exog` carries every
# regressor (const and the endogenous avexpr) and `instrument` carries the
# matching instrument matrix (const and logem4), all as plain NumPy arrays.
test = IV(
    endog=df4['logpgp95'].to_numpy(),
    exog=df4[['const', 'avexpr']].to_numpy(),
    instrument=df4[['const', 'logem4']].to_numpy(),
).fit()

In [35]:
# Show the results table. Names appear as generic labels (y, x1) because the
# model was given NumPy arrays rather than labelled pandas objects.
test.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.187
Model:,IV2SLS,Adj. R-squared:,0.174
Method:,Two Stage,F-statistic:,36.39
,Least Squares,Prob (F-statistic):,9.8e-08
Date:,"Sun, 04 Mar 2018",,
Time:,18:49:08,,
No. Observations:,64,,
Df Residuals:,62,,
Df Model:,1,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.9097,1.027,1.860,0.068,-0.143,3.962
x1,0.9443,0.157,6.033,0.000,0.631,1.257

0,1,2,3
Omnibus:,1.738,Durbin-Watson:,2.038
Prob(Omnibus):,0.419,Jarque-Bera (JB):,1.491
Skew:,-0.372,Prob(JB):,0.475
Kurtosis:,2.922,Cond. No.,31.2


In [38]:
# Last coefficient = the one on avexpr (exog columns were passed as const, avexpr).
# Presumably params is a plain NumPy array here, so positional indexing is appropriate.
test.params[-1]

0.94427938515483767