# Advanced Econometrics II
### Computer Assignment - Weak identification

In [None]:
import pandas as pd
import numpy as np
import statsmodels
import matplotlib.pyplot as plt
from numpy.linalg import inv as inv
import scipy.stats as stats

In [None]:
def projection_matrix(X):
    """Return the projection matrix ``X (X'X)^{-1} X'`` onto the columns of ``X``.

    A one-dimensional ``X`` is treated as a single column.
    """
    as_columns = X.reshape((X.shape[0], 1)) if X.ndim == 1 else X
    gram_inverse = np.linalg.inv(as_columns.T @ as_columns)
    return as_columns @ gram_inverse @ as_columns.T
        

def orthogonal_projection_matrix(X):
    """Return ``I - P_X``, where ``P_X`` is ``projection_matrix(X)``."""
    n_rows = X.shape[0]
    return np.eye(n_rows) - projection_matrix(X)

def pi_sim(y, X, betaH0, Z):
    """Estimate the first-stage coefficient and the error (co)variances under H0.

    With ``eps = y - X * betaH0`` and ``M_Z = I - P_Z``, the function computes
    the ``M_Z``-based moment estimates of ``eps`` and ``X`` (each divided by
    ``N - k``) and then regresses ``X - eps * rho_hat`` on ``Z``, where
    ``rho_hat = sigma_eps_V_hat / sigma_eps_hat``.

    Parameters
    ----------
    y, X : array
        Outcome and regressor; used with element-wise ``X * betaH0``.
    betaH0 : scalar
        Hypothesised coefficient on ``X``.
    Z : array
        Instruments; a one-dimensional ``Z`` is treated as a single column.

    Returns
    -------
    tuple
        ``(pi, sigma_eps_hat, sigma_eps_V_hat, sigma_V_eps_hat, sigma_V_hat)``.
    """
    if Z.ndim == 1:
        Z = Z.reshape((Z.shape[0], 1))

    N, k = Z.shape
    eps = y - X * betaH0

    # Build the N x N annihilator once and reuse it for all four moments;
    # rebuilding it for each moment is the dominant cost for large N.
    M_Z = orthogonal_projection_matrix(Z)
    scale = 1 / (N - k)

    sigma_eps_hat = scale * eps.T @ M_Z @ eps
    sigma_eps_V_hat = scale * eps.T @ M_Z @ X
    sigma_V_eps_hat = scale * X.T @ M_Z @ eps
    sigma_V_hat = scale * X.T @ M_Z @ X

    rho_hat = sigma_eps_V_hat / sigma_eps_hat
    # Named pi_tilde so the local does not shadow this function's own name.
    pi_tilde = inv(Z.T @ Z) @ Z.T @ (X - eps * rho_hat)

    return pi_tilde, sigma_eps_hat, sigma_eps_V_hat, sigma_V_eps_hat, sigma_V_hat
    
def k_AR_r_LM_beta(y, X, betaH0, Z):
    """Return ``(k_AR_beta, r_beta, LM_beta)`` evaluated at ``betaH0``.

    All three statistics are built from the estimates returned by ``pi_sim``
    and the residual ``y - X * betaH0``:

    * ``k_AR_beta`` – residual projected on ``Z``, scaled by ``sigma_eps_hat``;
    * ``r_beta``    – quadratic form of the first-stage estimate in ``Z'Z``,
      scaled by the variance of V conditional on eps;
    * ``LM_beta``   – residual projected on ``Z @ pi``, scaled by
      ``sigma_eps_hat``.
    """
    if Z.ndim == 1:
        Z = Z.reshape((Z.shape[0], 1))

    pi_tilde, s_ee, s_ev, s_ve, s_vv = pi_sim(y, X, betaH0, Z)
    resid = y - X * betaH0

    # Variance of V after removing the part explained by eps.
    s_vv_given_e = s_vv - (s_ve * s_ev) / s_ee

    r_stat = (1 / s_vv_given_e) * pi_tilde.T @ (Z.T @ Z) @ pi_tilde

    inv_s_ee = 1 / s_ee
    lm_stat = inv_s_ee * resid.T @ projection_matrix(Z @ pi_tilde) @ resid
    ar_stat = inv_s_ee * resid.T @ projection_matrix(Z) @ resid

    return ar_stat, r_stat, lm_stat

def LR_beta(y, X, betaH0, Z):
    """Return the LR statistic at ``betaH0``, combining k_AR, r and LM."""
    ar_stat, r_stat, lm_stat = k_AR_r_LM_beta(y, X, betaH0, Z)

    discriminant = (ar_stat + r_stat) ** 2 - 4 * r_stat * (ar_stat - lm_stat)
    return (0.5) * (ar_stat - r_stat + np.sqrt(discriminant))

## Question 1

In [None]:
# Monte Carlo study: rejection frequency of the 2SLS t-test of H0: beta = beta_0
# for every combination of instrument strength `a` and error correlation `rho`.
rho_list = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
a_list = [1, 0.6, 0.3, 0.15, 0.07, 0.04, 0.02, 0]
N = 100
k = 10
MC = 5000
beta_0 = 0

# Seed exactly once, before any draw.  Re-seeding inside the replication loop
# would make every replication use the same sample, so each rejection
# frequency would collapse to 0 or 1.
np.random.seed(1211)

# Instruments are held fixed across replications, so their projection matrix
# only has to be computed once.
Z = np.random.normal(loc=0, scale=1, size=(N, k))
Pz = Z @ np.linalg.inv(Z.T @ Z) @ Z.T

results = np.zeros(MC)
R_freq = np.zeros([len(rho_list), len(a_list)])

for a_idx, a in enumerate(a_list):
    # Only the first instrument is relevant, with strength a.
    pi = np.zeros(k)
    pi[0] = a

    for rho_idx, rho in enumerate(rho_list):
        sigma = np.array([[1, rho], [rho, 1]])

        for l in range(MC):
            eps_v = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=N)
            eps = eps_v[:, 0]
            v = eps_v[:, 1]

            X = Z @ pi + v
            Y = X * beta_0 + eps

            XPzX = X @ Pz @ X
            beta_2SLS = (X @ Pz @ Y) / XPzX
            residuals = Y - X * beta_2SLS
            beta_2SLS_var = (residuals @ residuals) / (N - 1) / XPzX

            tstat = (beta_2SLS - beta_0) / np.sqrt(beta_2SLS_var)
            # Two-sided test at the 5% level.
            results[l] = abs(tstat) > 1.96

        R_freq[rho_idx, a_idx] = round(np.mean(results), 3)

In [None]:
# Rejection frequencies as a table: rows are rho values, columns are a values.
df = pd.DataFrame(R_freq, index=rho_list, columns=a_list)
print(df)

# One figure per instrument strength; the loop bounds and tick positions are
# derived from the lists so they stay correct if the grids change.
for f, a_value in enumerate(a_list):
    plt.plot(R_freq[:, f])
    plt.ylabel('rej frequency')
    plt.title('a is ' + str(a_value))
    plt.xticks(np.arange(len(rho_list)), rho_list)
    plt.show()
    


## Question 2

In [None]:
# Simulated 95% critical values of the conditional LR statistic for k = 10,
# on a grid of r_beta values.
k = 10
MC = 5000

r_beta = np.arange(0, 200.25, 0.25)

# The same chi-square draws are used for every r (common random numbers), so
# they are drawn once instead of re-seeding and re-drawing inside the loop.
np.random.seed(1211)
vidle_1 = np.random.chisquare(df=1, size=([MC, 1]))
vidle_k = np.random.chisquare(df=k - 1, size=([MC, 1]))

# Position of the "higher" 95th percentile in the sorted sample: the smallest
# order statistic at or above rank 0.95 * (MC - 1).  Computing it directly
# avoids depending on the percentile keyword, which differs across NumPy
# versions.
higher_idx = int(np.ceil(0.95 * (MC - 1)))

crit_vals = []
for r in r_beta:
    LR = 0.5 * (vidle_k + vidle_1 - r + np.sqrt((vidle_k + vidle_1 + r) ** 2 - 4 * r * vidle_k))
    LR_crit_val = np.sort(LR, axis=None)[higher_idx]
    crit_vals.append(LR_crit_val)

# Build the frame in one go; DataFrame.append no longer exists in pandas 2.x.
crit_val_95 = pd.DataFrame({'r_beta': r_beta, 'crit_val': crit_vals})
    


In [None]:
# Plot the simulated critical value (column 'crit_val') against r_beta.
crit_val_95.plot(x='r_beta', y='crit_val')
plt.show()

## Question 4

In [None]:
# Simulated 95% critical values of the conditional LR statistic for k = 4,
# on a grid of r_beta values.
k = 4
MC = 5000

r_beta = np.arange(0, 200.25, 0.25)

# The same chi-square draws are used for every r (common random numbers), so
# they are drawn once instead of re-seeding and re-drawing inside the loop.
np.random.seed(1211)
vidle_1 = np.random.chisquare(df=1, size=([MC, 1]))
vidle_k = np.random.chisquare(df=k - 1, size=([MC, 1]))

# Position of the "higher" 95th percentile in the sorted sample: the smallest
# order statistic at or above rank 0.95 * (MC - 1).  Computing it directly
# avoids depending on the percentile keyword, which differs across NumPy
# versions.
higher_idx = int(np.ceil(0.95 * (MC - 1)))

crit_vals = []
for r in r_beta:
    LR = 0.5 * (vidle_k + vidle_1 - r + np.sqrt((vidle_k + vidle_1 + r) ** 2 - 4 * r * vidle_k))
    LR_crit_val = np.sort(LR, axis=None)[higher_idx]
    crit_vals.append(LR_crit_val)

# Build the frame in one go; DataFrame.append no longer exists in pandas 2.x.
crit_val_95 = pd.DataFrame({'r_beta': r_beta, 'crit_val': crit_vals})
    


In [None]:
# Plot the simulated critical value (column 'crit_val') against r_beta.
crit_val_95.plot(x='r_beta', y='crit_val')
plt.show()

## Question 5

In [None]:
import pandas as pd
import numpy as np
import statsmodels
import matplotlib.pyplot as plt
from numpy.linalg import inv as inv
import scipy.stats as stats

In [None]:
# The file has no header row, so the column labels are supplied here in
# file order.
column_names = [
    'age', 'age2', 'ed',
    'exper', 'exper2', 'nearc2',
    'nearc4', 'nearc4a', 'nearc4b',
    'race', 'smsa', 'south', 'wage',
]
data = pd.read_csv('dest.csv', header=None, names=column_names)

In [None]:
# Outcome (wage) and endogenous regressor (ed) as NumPy arrays.
ys = data['wage'].values
Xs = data['ed'].values

# Control variables, with a column of ones appended as the last column.
# (Instrument candidates in the data: nearc2, nearc4, nearc4a, nearc4b.)
W_noC = data.loc[:, ['exper', 'exper2', 'south', 'smsa', 'race']]
constant = pd.DataFrame({'constant': np.ones(W_noC.shape[0])})
W = pd.concat([W_noC, constant], axis=1).values

#### Subquestion a

In [None]:
# Single instrument nearc2, shaped as an (N, 1) column.
Zs = data['nearc2'].values.reshape(-1, 1)

# Remove the controls W from outcome, regressor and instrument by
# pre-multiplying each with M_W = I - P_W.
Mw = orthogonal_projection_matrix(W)
y, X, Z = Mw @ ys, Mw @ Xs, Mw @ Zs

In [None]:
%%time

# Evaluate the AR statistic and the 2SLS t-statistic on a grid of hypothesised
# beta values and report the share of grid points that are not rejected at 5%.
N = Z.shape[0]
k = Z.shape[1]
beta = [round(b, 1) for b in np.arange(-10, 10.1, 0.1)]

CV_chi2 = stats.chi2.ppf(q=0.95, df=k) / k
AR_stat = np.zeros(len(beta))
t_stat = np.zeros(len(beta))

# Both N x N matrices depend only on Z, so build them once rather than at
# every grid point.
P_Z = projection_matrix(Z)
M_Z = orthogonal_projection_matrix(Z)

XPzX = X.T @ P_Z @ X
beta_2SLS = (X.T @ P_Z @ y) / XPzX
resid_2SLS = y - X * beta_2SLS
beta_2SLS_var = (resid_2SLS.T @ resid_2SLS) / (N - 1) / XPzX
se_2SLS = np.sqrt(beta_2SLS_var)

for idx, b in enumerate(beta):
    eps = y - X * b
    AR_stat[idx] = ((eps.T @ P_Z @ eps) / k) / ((eps.T @ M_Z @ eps) / (N - k))
    t_stat[idx] = (beta_2SLS - b) / se_2SLS

print('Coverage frequency for AR: ', np.mean(AR_stat <= CV_chi2))
# A grid value b is not rejected by the t-test when |t(b)| < 1.96, which is the
# same as b lying inside beta_2SLS +/- 1.96 * se.  The statistic itself must
# not be compared with the interval end points.
print('Coverage frequency for t: ', np.mean(np.abs(t_stat) < 1.96))

In [None]:
# AR statistic over the beta grid together with its (constant) critical value.
n_grid = AR_stat.shape[0]
critical_line = np.full(n_grid, CV_chi2)

plt.plot(beta, critical_line, label='AR_critical')
plt.plot(beta, AR_stat, label='AR_stat')
plt.legend()
plt.show()

#### Subquestion c

In [None]:
%%time

# AR statistic on a grid of hypothesised beta values, together with the
# first-stage F statistic (which does not depend on beta).
# N and k are taken from the earlier cell that set them from Z.
beta = [round(b, 1) for b in np.arange(0, 150, 1)]

CV_chi2 = stats.chi2.ppf(q=0.95, df=k) / k

# Both N x N matrices depend only on Z, so build them once rather than at
# every grid point.
P_Z = projection_matrix(Z)
M_Z = orthogonal_projection_matrix(Z)

pi_hat = inv(Z.T @ Z) @ Z.T @ X
sigma_vv_hat = (X.T @ M_Z @ X) / (N - k)

# The F statistic is the same for every b; compute it once and repeat it so it
# can be plotted against the beta grid.
first_stage_F = (pi_hat @ (Z.T @ Z) @ pi_hat) / (k * sigma_vv_hat)
F_stat = np.full(len(beta), first_stage_F)

AR_stat = np.zeros(len(beta))
for idx, b in enumerate(beta):
    eps = y - X * b
    AR_stat[idx] = ((eps.T @ P_Z @ eps) / k) / ((eps.T @ M_Z @ eps) / (N - k))

In [None]:
# AR statistic, its critical value and the first-stage F over the beta grid.
# Each line carries a label; without labels plt.legend() has nothing to show.
plt.plot(beta, np.repeat(CV_chi2, AR_stat.shape[0]), label='AR_critical')
plt.plot(beta, AR_stat, label='AR_stat')
plt.plot(beta, F_stat, label='F_stat')
plt.legend()
plt.show()