# Instrumental Variables

This notebook defines a function for 2SLS and illustrates it by redoing an example from Ch 10.3.3 in "Principles of Econometrics", 3rd edition (Hill, Griffiths and Lim).

## Load Packages and Extra Functions

In [1]:
using Printf, DelimitedFiles, LinearAlgebra, Statistics

include("jlFiles/printmat.jl")
include("jlFiles/Ols.jl")
include("jlFiles/CovNWFn.jl")

CovNWFn

## Loading Data

In [2]:
# Read the data file: the first row holds the variable names, the rest the numbers
(x, header) = readdlm("Data/mrozEd.txt"; header=true)

# Variable names as a tuple of Symbols, used as the field names of N
namesB = Tuple(Symbol(h) for h in header)

# One field per column of x, so a variable is reached as N.<name>
N = NamedTuple{namesB}(Tuple(x[:, j] for j in axes(x, 2)))

# Vector of ones (the constant in the regressions), one element per row of x
c = fill(1.0, size(x, 1))

println("The variables in N (use as N.x): ")
printmat(keys(N))

The variables in N (use as N.x): 
(Symbol("%taxableinc"), :federaltax, :hsiblings, :hfathereduc, :hmothereduc, :siblings, :lfp, :hours, :kidsl6, :kids618, :age, :educ, :wage, :wage76, :hhours, :hage, :heduc, :hwage, :faminc, :mtr, :mothereduc, :fathereduc, :unemployment, :bigcity, :exper)



# A Function for IV & 2SLS


In [3]:
"""
    TwoSLSFn(y,x,z,NWQ=false,m=0)

Estimate `y = x*b + u` by two-stage least squares (2SLS/IV), using `z` as instruments.

The first stage regresses each column of `x` on `z`. The second stage regresses `y` on the
first-stage fitted values. The reported fitted values and residuals are based on the
original regressors `x` (that is, `yhat = x*b`), not on the first-stage fitted values.

# Input
- `y::VecOrMat`:      Tx1 or T-vector of the dependent variable
- `x::Matrix`:        TxK matrix of regressors
- `z::VecOrMat`:      TxL matrix of instruments, L >= K is required
- `NWQ::Bool`:        if true, then Newey-West's covariance matrix is used, otherwise iid
- `m::Int`:           scalar, bandwidth in Newey-West (passed on to `CovNWFn`)

# Output
- `b`:                     K regression coefficients (Kx1 if `y` is Tx1)
- `fnOutput::NamedTuple`:  with
  - res                T residuals, y - yhat
  - yhat               T fitted values, x*b
  - Covb               KxK matrix, covariance matrix of b
  - R2                 squared correlation of y and yhat
  - R2_stage1          K-vector, squared correlation of x[:,i] and its first-stage fit
  - δ_stage1           LxK matrix, coeffs from 1st stage x = z*δ + resx
  - Stdδ_stage1        LxK matrix, std of δ

# Throws
- `DimensionMismatch` if `y`, `x` and `z` do not have the same number of rows
- `ArgumentError` if there are fewer instruments than regressors (L < K)

# Remark
- The covariance and R2 calculations are written for a single dependent variable.
- A column of `x` without variation (for instance, a constant) gives NaN in `R2_stage1`.

# Requires
- Statistics, LinearAlgebra
- CovNWFn (only called when `NWQ` is true)

"""
function TwoSLSFn(y,x,z,NWQ=false,m=0)

    T      = size(y,1)
    (K,L)  = (size(x,2),size(z,2))

    #fail early with a clear message instead of a late error from \ or inv
    (size(x,1) == T && size(z,1) == T) || throw(DimensionMismatch(
        "y, x and z must have the same number of rows, got $T, $(size(x,1)) and $(size(z,1))"))
    L >= K || throw(ArgumentError(
        "need at least as many instruments as regressors, got L=$L and K=$K"))

    #1st stage: regress each column of x on z
    δ         = z\x             #LxK, one column for each regression
    xhat      = z*δ             #TxL * LxK - > TxK
    resx      = x - xhat        #TxK
    R2_stage1 = [cor(x[:,i],xhat[:,i])^2  for i=1:K]

    Szz_1 = inv(z'z)
    Stdδ  = similar(δ)           #LxK standard errors of δ
    for i = 1:K                  #loop over columns in x
        if NWQ                   #NW standard errors
            #NOTE(review): assumes CovNWFn gives the covariance of the sum of the moments
            S      = CovNWFn(resx[:,i].*z,m)
            Covδ_i = Szz_1*S*Szz_1
        else                     #standard errors assuming iid
            Covδ_i = Szz_1*var(resx[:,i])
        end
        Stdδ[:,i] = sqrt.(diag(Covδ_i))
    end

    #2nd stage: regress y on the fitted values from the 1st stage
    b    = xhat\y
    yhat = x*b               #notice: from y=x'b+u, not 2nd stage regression
    res  = y - yhat

    R2   = cor(y,yhat)^2
    Sxz  = x'z
    A_1  = inv(Sxz*Szz_1*Sxz')       #KxK, used in both covariance formulas
    if NWQ     #Cov(b) using Newey-West
        S    = CovNWFn(res.*z,m)
        B    = A_1*Sxz*Szz_1
        Covb = B*S*B'
    else       #Cov(b) assuming iid residuals, independent of z
        Covb = var(res)*A_1
    end

    fnOutput = (;res,yhat,Covb,R2,R2_stage1,δ_stage1=δ,Stdδ_stage1=Stdδ)

    return b, fnOutput

end

TwoSLSFn

In [4]:
# Keep only the observations with a positive wage, since log(wage) is the dependent variable
vv = N.wage .> 0

# OLS of log(wage) on a constant, educ, exper and exper^2 (wage>0 sample)
(b_OLS, _, _, Covb) = OlsGMFn(
    log.(N.wage[vv]),
    hcat(c, N.educ, N.exper, N.exper.^2)[vv, :],
)
Stdb_ols = sqrt.(diag(Covb))        #standard errors from the diagonal of Covb

# Labels for the table; also reused when printing the IV results
colNames = ["coef","std"]
rowNames = ["c","educ","exper","exper^2"]

println("OLS estimates")
printmat(b_OLS, Stdb_ols; colNames=colNames, rowNames=rowNames, prec=4)

OLS estimates
             coef       std
c         -0.5220    0.1979
educ       0.1075    0.0141
exper      0.0416    0.0131
exper^2   -0.0008    0.0004



In [5]:
# 2SLS on the wage>0 sample. Regressors: c, educ, exper, exper^2.
# Instruments: c, exper, exper^2 and mothereduc (which takes the place of educ).
(b_iv, fO2) = TwoSLSFn(
    log.(N.wage[vv]),
    hcat(c, N.educ, N.exper, N.exper.^2)[vv, :],
    hcat(c, N.exper, N.exper.^2, N.mothereduc)[vv, :],
)

# First stage: rows are the instruments, columns are the regressors
zNames = ["c","exper","exper^2","mothereduc"]

println("first-stage estimates: coeffs")
printmat(fO2.δ_stage1; colNames=rowNames, rowNames=zNames)

println("first-stage estimates: std errors")
printmat(fO2.Stdδ_stage1; colNames=rowNames, rowNames=zNames)

# Second stage: coefficients and their standard errors
Stdb_iv = sqrt.(diag(fO2.Covb))

println("IV estimates")
printmat(b_iv, Stdb_iv; colNames=colNames, rowNames=rowNames, prec=4)

printblue("The results should be very close to Hill et al, 10.3.3,
but with small differences due to how df adjustments are made to variances")

first-stage estimates: coeffs
                   c      educ     exper   exper^2
c              1.000     9.775    -0.000     0.000
exper          0.000     0.049     1.000    -0.000
exper^2       -0.000    -0.001     0.000     1.000
mothereduc     0.000     0.268    -0.000    -0.000

first-stage estimates: std errors
                   c      educ     exper   exper^2
c              0.000     0.422     0.000     0.000
exper          0.000     0.042     0.000     0.000
exper^2        0.000     0.001     0.000     0.000
mothereduc     0.000     0.031     0.000     0.000

IV estimates
             coef       std
c          0.1982    0.4712
educ       0.0493    0.0373
exper      0.0449    0.0135
exper^2   -0.0009    0.0004

[34m[1mThe results should be very close to Hill et al, 10.3.3,[22m[39m
[34m[1mbut with small differences due to how df adjustments are made to variances[22m[39m
