# Advanced Econometrics II
### Computer Assignment - Weak identification

In [None]:
import pandas as pd
import numpy as np
import statsmodels
import matplotlib.pyplot as plt
from numpy.linalg import inv as inv
import scipy.stats as stats

In [None]:
def projection_matrix(X):
    """Return the projection matrix X (X'X)^{-1} X'.

    A one-dimensional X is treated as a single column.
    """
    columns = X[:, np.newaxis] if X.ndim == 1 else X
    gram_inverse = np.linalg.inv(columns.T @ columns)
    return columns @ gram_inverse @ columns.T
        

def orthogonal_projection_matrix(X):
    """Return I - projection_matrix(X), with I sized by the rows of X."""
    n_rows = X.shape[0]
    return np.eye(n_rows) - projection_matrix(X)

def pi_sim(y, X, betaH0, Z):
    """Estimate the first-stage coefficients and residual (co)variances under H0.

    Parameters
    ----------
    y, X : array
        Outcome and regressor. Assumes both are 1-D arrays with one entry per
        row of Z (X is multiplied element-wise by betaH0) -- TODO confirm.
    betaH0 : scalar
        Hypothesised value of the coefficient on X.
    Z : array
        Instruments; a 1-D array is treated as a single column.

    Returns
    -------
    tuple
        (pi_tilde, sigma_eps_hat, sigma_eps_V_hat, sigma_V_eps_hat,
        sigma_V_hat), where pi_tilde is the regression of
        X - eps * (sigma_eps_V_hat / sigma_eps_hat) on Z and the sigma terms
        are quadratic forms in M_Z scaled by 1 / (N - k).
    """
    if Z.shape == (Z.shape[0],):
        Z = Z.reshape((Z.shape[0], 1))

    N = Z.shape[0]
    k = Z.shape[1]

    eps = y - X * betaH0

    # M_Z is an N x N matrix; build it once instead of once per variance term.
    M_Z = orthogonal_projection_matrix(Z)

    sigma_eps_hat = (1 / (N - k)) * eps.T @ M_Z @ eps
    sigma_eps_V_hat = (1 / (N - k)) * eps.T @ M_Z @ X
    sigma_V_eps_hat = (1 / (N - k)) * X.T @ M_Z @ eps
    sigma_V_hat = (1 / (N - k)) * X.T @ M_Z @ X

    rho_hat = sigma_eps_V_hat / sigma_eps_hat
    # Named pi_tilde so the local no longer shadows this function's own name.
    pi_tilde = inv(Z.T @ Z) @ Z.T @ (X - eps * rho_hat)

    return pi_tilde, sigma_eps_hat, sigma_eps_V_hat, sigma_V_eps_hat, sigma_V_hat
    
def k_AR_r_LM_beta(y, X, betaH0, Z):
    """Return the tuple (k_AR_beta, r_beta, LM_beta) evaluated at betaH0.

    All three statistics are built from the output of pi_sim and from the
    residual y - X * betaH0. A 1-D Z is treated as a single column.
    """
    if Z.ndim == 1:
        Z = Z.reshape((Z.shape[0], 1))

    residual = y - X * betaH0
    pi, s_ee, s_eV, s_Ve, s_VV = pi_sim(y, X, betaH0, Z)

    # Variance of V net of its covariance with eps.
    s_VV_given_eps = s_VV - (s_Ve * s_eV) / s_ee

    inv_s_ee = 1 / s_ee
    k_AR_beta = inv_s_ee * residual.T @ projection_matrix(Z) @ residual
    LM_beta = inv_s_ee * residual.T @ projection_matrix(Z @ pi) @ residual
    r_beta = (1 / s_VV_given_eps) * pi.T @ (Z.T @ Z) @ pi

    return k_AR_beta, r_beta, LM_beta

def LR_beta(y, X, betaH0, Z):
    """Return the LR statistic at betaH0, combining k_AR, r and LM."""
    k_ar, r, lm = k_AR_r_LM_beta(y, X, betaH0, Z)

    discriminant = (k_ar + r) ** 2 - 4 * r * (k_ar - lm)
    return (0.5) * (k_ar - r + np.sqrt(discriminant))

## Question 1

In [None]:
# Question 1: Monte Carlo rejection frequency of the 2SLS t-test of
# H0: beta = beta_0 over a grid of a (first-stage coefficient) and rho
# (correlation between eps and v).
rho_list = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
a_list = [1, 0.6, 0.3, 0.15, 0.07, 0.04, 0.02, 0]
N = 100
k = 10
MC = 5000
beta_0 = 0

# Seed once, before the first draw. Re-seeding inside the replication loop
# makes every replication identical, so each entry of R_freq could only be
# 0 or 1; seeding before Z also makes the instruments reproducible.
np.random.seed(1211)
Z = np.random.normal(loc=0, scale=1, size=(N, k))
# Z is fixed across replications, so its projection matrix is built once.
Pz = projection_matrix(Z)

results = np.zeros(MC)
R_freq = np.zeros([len(rho_list), len(a_list)])

for col, a in enumerate(a_list):
    # Only the first instrument enters the first stage, with coefficient a.
    pi = np.zeros(k)
    pi[0] = a

    for row, rho in enumerate(rho_list):
        sigma = np.array([[1, rho], [rho, 1]])

        for l in range(MC):
            eps_v = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=(N))
            eps = eps_v[:, 0]
            v = eps_v[:, 1]

            X = Z @ pi + v
            Y = X * beta_0 + eps

            XPzX = X.T @ Pz @ X
            beta_2SLS = (X.T @ Pz @ Y) / XPzX
            residuals = Y - X * beta_2SLS
            beta_2SLS_var = (residuals @ residuals) / (N - 1) / XPzX

            tstat = (beta_2SLS - beta_0) / np.sqrt(beta_2SLS_var)
            # Two-sided test at the 5% level.
            results[l] = abs(tstat) > 1.96

        R_freq[row, col] = round(np.mean(results), 3)

In [None]:
# Table of rejection frequencies: one row per rho, one column per a.
df = pd.DataFrame(R_freq, index=rho_list, columns=a_list)
print(df)

# One figure per value of a. The number of figures and of x-ticks follows the
# grids instead of being hard-coded, so changing a_list/rho_list cannot
# silently drop columns or mislabel the axis.
for f, a in enumerate(a_list):
    plt.plot(R_freq[:, f])
    plt.ylabel('rej frequency')
    plt.title('a is ' + str(a))
    plt.xticks(np.arange(len(rho_list)), rho_list)
    plt.show()
    


## Question 2

In [None]:
# Question 2: simulated 95% critical value of the LR statistic as a function
# of r_beta, for k instruments.
k = 10
MC = 5000

r_beta = np.arange(0, 200.25, 0.25)

# The same chi-square draws are used for every r (common random numbers).
# Drawing them once gives exactly the values the per-iteration re-seeding
# produced, without regenerating them for each r.
np.random.seed(1211)
vidle_1 = np.random.chisquare(df=1, size=([MC, 1]))
vidle_k = np.random.chisquare(df=k - 1, size=([MC, 1]))

crit_vals = []
for r in r_beta:
    LR = 0.5 * (vidle_k + vidle_1 - r + np.sqrt((vidle_k + vidle_1 + r) ** 2 - 4 * r * vidle_k))

    # `method` replaces the deprecated `interpolation` keyword of np.percentile.
    LR_crit_val = np.percentile(LR, q=95, method='higher')
    crit_vals.append(LR_crit_val)

# Build the frame in one go: DataFrame.append no longer exists in pandas 2.x.
crit_val_95 = pd.DataFrame({'r_beta': r_beta, 'crit_val': crit_vals})
    


In [None]:
# Draw the critical value against r_beta and write the figure to CV_2.png.
ax = crit_val_95.plot(x='r_beta', y='crit_val')
ax.set_title('Critical value function')
ax.set_xlabel('Value of r_beta')
ax.set_ylabel('95% critical value')
plt.savefig('CV_2.png', dpi=1000)
plt.show()

## Question 3

In [None]:
%%time
# Question 3: Monte Carlo rejection frequencies of the AR, score (LM) and LR
# tests of H0: beta = beta_0 over the same (a, rho) grid as Question 1.
np.random.seed(1211)
rho_list = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
a_list = [1, 0.6, 0.3, 0.15, 0.07, 0.04, 0.02, 0]
N = 100
k = 10
MC = 5000
Z = np.random.normal(loc=0, scale=1, size=(N, k))
beta_0 = 0

# Exact 95% chi-square quantiles instead of the rounded 18.3 and 3.84, so the
# thresholds stay correct if k is changed.
AR_crit = stats.chi2.ppf(q=0.95, df=k)
SC_crit = stats.chi2.ppf(q=0.95, df=1)

results_ar = np.zeros(MC)
R_freq_AR = np.zeros([len(rho_list), len(a_list)])
results_sc = np.zeros(MC)
R_freq_SC = np.zeros([len(rho_list), len(a_list)])
results_lr = np.zeros(MC)
R_freq_LR = np.zeros([len(rho_list), len(a_list)])

for col, a in enumerate(a_list):
    # Only the first instrument enters the first stage, with coefficient a.
    pi = np.zeros(k)
    pi[0] = a

    for row, rho in enumerate(rho_list):
        sigma = np.array([[1, rho], [rho, 1]])

        # Chi-square draws used to simulate the conditional LR critical value.
        vidle_1 = (np.random.chisquare(df=1, size=(1000)))
        vidle_k = (np.random.chisquare(df=k - 1, size=(1000)))

        for l in range(MC):
            eps_v = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=(N))
            eps = eps_v[:, 0]
            v = eps_v[:, 1]

            X = Z @ pi + v
            Y = X * beta_0 + eps

            # One evaluation yields all three statistics; calling the helper
            # separately per statistic repeats the same N x N matrix algebra.
            arstat, r, scstat = k_AR_r_LM_beta(Y, X, beta_0, Z)

            results_ar[l] = arstat > AR_crit
            results_sc[l] = scstat > SC_crit

            # LR critical value depends on r, so it is simulated per replication.
            LR_MC = 0.5 * (vidle_k + vidle_1 - r + np.sqrt((vidle_k + vidle_1 + r) ** 2 - 4 * r * vidle_k))
            # Same expression as LR_beta, reusing the statistics computed above.
            lrstat = 0.5 * (arstat - r + np.sqrt((arstat + r) ** 2 - 4 * r * (arstat - scstat)))
            # `method` replaces the deprecated `interpolation` keyword.
            results_lr[l] = lrstat > np.percentile(LR_MC, q=95, method='higher')

        # Store each frequency once all MC replications for this cell are in.
        R_freq_AR[row, col] = round(np.mean(results_ar), 3)
        R_freq_SC[row, col] = round(np.mean(results_sc), 3)
        R_freq_LR[row, col] = round(np.mean(results_lr), 3)
    


In [None]:
def _plot_rejection_frequencies(freq, title, filename, ylim):
    """Print the table in freq and plot one line per value of a against rho.

    freq is indexed as [rho, a], matching rho_list and a_list. The figure is
    saved to filename and shown. Returns the printed DataFrame.
    """
    table = pd.DataFrame(freq, index=rho_list, columns=a_list)
    print(table)

    for j, a in enumerate(a_list):
        plt.plot(rho_list, freq[:, j], label=' a = ' + str(a))

    plt.legend(ncol=4, loc='lower center',
               columnspacing=1.0, labelspacing=0.0,
               handletextpad=0.0, handlelength=1.5,
               fancybox=True, shadow=True)
    plt.ylim(ylim)
    plt.title(title)
    plt.xlabel('value of rho')
    plt.ylabel('rejection frequency')
    plt.savefig(filename, dpi=1000)
    plt.show()
    return table


# The three reports share one routine; only data, title, file and y-range differ.
dfSC = _plot_rejection_frequencies(R_freq_SC, 'Score Test', 'SC_3.png', (0.02, 0.12))
# File name now carries the .png extension explicitly, like the other two.
dfAR = _plot_rejection_frequencies(R_freq_AR, 'Anderson Rubin Test', 'AR_3.png', (0.02, 0.12))
dfLR = _plot_rejection_frequencies(R_freq_LR, 'Likelihood Ratio Test', 'LR_3.png', (0, 0.125))

## Question 4

In [None]:
# Question 4: simulated 95% critical value of the LR statistic as a function
# of r_beta, for k instruments.
k = 4
MC = 5000

r_beta = np.arange(0, 200.25, 0.25)

# The same chi-square draws are used for every r (common random numbers).
# Drawing them once gives exactly the values the per-iteration re-seeding
# produced, without regenerating them for each r.
np.random.seed(1211)
vidle_1 = np.random.chisquare(df=1, size=([MC, 1]))
vidle_k = np.random.chisquare(df=k - 1, size=([MC, 1]))

crit_vals = []
for r in r_beta:
    LR = 0.5 * (vidle_k + vidle_1 - r + np.sqrt((vidle_k + vidle_1 + r) ** 2 - 4 * r * vidle_k))

    # `method` replaces the deprecated `interpolation` keyword of np.percentile.
    LR_crit_val = np.percentile(LR, q=95, method='higher')
    crit_vals.append(LR_crit_val)

# Build the frame in one go: DataFrame.append no longer exists in pandas 2.x.
crit_val_95 = pd.DataFrame({'r_beta': r_beta, 'crit_val': crit_vals})
    


In [None]:
# Draw the critical value against r_beta and write the figure to CV_4.png.
ax = crit_val_95.plot(x='r_beta', y='crit_val')
ax.set_title('Critical value function')
ax.set_xlabel('Value of r_beta')
ax.set_ylabel('95% critical value')
plt.savefig('CV_4.png', dpi=1000)
plt.show()

## Question 5

In [None]:
import pandas as pd
import numpy as np
import statsmodels
import matplotlib.pyplot as plt
from numpy.linalg import inv as inv
import scipy.stats as stats

In [None]:
# dest.csv is read without a header row; columns are named in file order.
column_names = [
    'age', 'age2', 'ed',
    'exper', 'exper2', 'nearc2',
    'nearc4', 'nearc4a', 'nearc4b',
    'race', 'smsa', 'south', 'wage',
]
data = pd.read_csv('dest.csv', header=None, names=column_names)

In [None]:
# Outcome (wage) and regressor (ed) as plain arrays.
ys = data['wage'].values
Xs = data['ed'].values

# Control variables, with a column of ones appended as the intercept.
control_names = ['exper', 'exper2', 'south', 'smsa', 'race']
W_noC = data[control_names]
constant = pd.DataFrame({'constant': np.ones(len(W_noC))})
W = pd.concat([W_noC, constant], axis=1).values

#### Subquestion a

In [None]:
# Single instrument nearc2, shaped as one column.
Zs = data['nearc2'].values.reshape(-1, 1)

# Partial the controls W out of outcome, regressor and instrument.
Mw = orthogonal_projection_matrix(W)
y, X, Z = Mw @ ys, Mw @ Xs, Mw @ Zs

In [None]:
%%time

# AR statistic and 2SLS t-statistic on a grid of hypothesised beta values.
N = Z.shape[0]
k = Z.shape[1]
beta = list(np.arange(-2, 1.01, 0.01))
beta = [round(b, 3) for b in beta]

AR_crit_val = np.repeat(stats.chi2.ppf(q=0.95, df=k) / k, len(beta))
AR_stat = np.zeros(len(beta))
t_stat = np.zeros(len(beta))

# Both N x N matrices depend only on Z, so they are built once rather than
# once (or more) per grid point.
Pz = projection_matrix(Z)
Mz = orthogonal_projection_matrix(Z)

XPzX = X.T @ Pz @ X
beta_2SLS = XPzX ** (-1) * (X.T @ Pz @ y)
beta_2SLS_var = (N - 1) ** (-1) * (y - X * beta_2SLS).T @ (y - X * beta_2SLS) * XPzX ** (-1)

# These are beta_2SLS -/+ 1.96 standard errors, i.e. on the scale of beta
# rather than of the t-statistic.
# NOTE(review): confirm this is what the t-test plot is meant to show.
t_crit_val_lower = np.repeat(beta_2SLS - 1.96 * np.sqrt(beta_2SLS_var), len(beta))
t_crit_val_upper = np.repeat(beta_2SLS + 1.96 * np.sqrt(beta_2SLS_var), len(beta))

# No random numbers are drawn in this loop, so no seed is set here.
for i, b in enumerate(beta):
    eps = y - X * b
    AR_stat[i] = ((eps.T @ Pz @ eps) / k) / ((eps.T @ Mz @ eps) / (N - k))
    t_stat[i] = (beta_2SLS - b) / np.sqrt(beta_2SLS_var)


In [None]:
# AR statistic against its critical value, with a zero reference line.
plt.plot(beta, AR_crit_val, 'tab:blue', label='AR_critical')
plt.plot(beta, AR_stat, 'tab:orange', label='AR_stat')
plt.plot(beta, np.repeat(0, AR_stat.shape[0]), 'xkcd:black')
plt.title('Critical set of AR statistic')
plt.xlabel('Value of beta')
plt.ylabel('Statistic value')
# The legend must be added before saving, otherwise 5_a.png is written
# without it.
plt.legend()
plt.savefig('5_a.png', dpi=1000)
plt.show()

In [None]:
# t-statistic over the beta grid together with the two bound lines.
plt.plot(beta, t_stat, 'tab:orange', label='t_stat')
for bound in (t_crit_val_lower, t_crit_val_upper):
    plt.plot(beta, bound, 'tab:blue', label='t_crit_val')

# Zoom in on beta in [0.1, 0.5] and half a unit beyond the bounds.
y_low = t_crit_val_lower[0] - 0.5
y_high = t_crit_val_upper[0] + 0.5
plt.xlim((0.1, 0.5))
plt.ylim((y_low, y_high))

plt.title('Test statistic of t-test')
plt.xlabel('Value of beta')
plt.ylabel('Statistic value')
plt.legend()
plt.savefig('5_a_t.png', dpi=1000)
plt.show()

#### Subquestion c

In [None]:
%%time

# AR statistic on a wide beta grid, plus the first-stage F-statistic.
# N and k are taken from the preceding cell.
beta = list(np.arange(0, 150, 1))
beta = [round(b, 1) for b in beta]

AR_crit_val = np.repeat(stats.chi2.ppf(q=0.95, df=k) / k, len(beta))
AR_stat = np.zeros(len(beta))

# Both N x N matrices depend only on Z, so they are built once rather than
# once per grid point.
Pz = projection_matrix(Z)
Mz = orthogonal_projection_matrix(Z)

pi_hat = inv(Z.T @ Z) @ Z.T @ X
sigma_vv_hat = (X.T @ Mz @ X) / (N - k)
F_stat = (pi_hat @ Z.T @ Z @ pi_hat) / (k * sigma_vv_hat)

for i, b in enumerate(beta):
    eps = y - X * b
    AR_stat[i] = ((eps.T @ Pz @ eps) / k) / ((eps.T @ Mz @ eps) / (N - k))

In [None]:
# Label both lines so that plt.legend() has entries to show; without labels
# the legend is empty and matplotlib emits a warning.
plt.plot(beta, AR_crit_val, 'tab:blue', label='AR_critical')
plt.plot(beta, AR_stat, 'tab:orange', label='AR_stat')
plt.legend()
plt.show()

In [None]:
# Display the F_stat value computed in the previous cell.
F_stat

#### Subquestion e

In [None]:
# Four instruments this time.
instrument_names = ['nearc4', 'nearc2', 'nearc4a', 'nearc4b']
Zs = data[instrument_names].values

# Partial the controls W out of outcome, regressor and instruments.
Mw = orthogonal_projection_matrix(W)
y, X, Z = Mw @ ys, Mw @ Xs, Mw @ Zs

In [None]:
%%time

# t, AR, LM and LR statistics (with a simulated LR critical value) on a grid
# of hypothesised beta values.
N = Z.shape[0]
k = Z.shape[1]

beta = list(np.arange(-1, 1.05, 0.05))
beta = [round(b, 2) for b in beta]
MC = 5000

AR_crit_val = np.repeat(stats.chi2.ppf(q=0.95, df=k) / k, len(beta))
LM_crit_val = np.repeat(stats.chi2.ppf(q=0.95, df=1), len(beta))
AR_stat = np.zeros(len(beta))
t_stat = np.zeros(len(beta))
LM_stat = np.zeros(len(beta))
LR_stat = np.zeros([len(beta), 1])
LR_crit_val = np.zeros([len(beta), 1])

# Both N x N matrices depend only on Z, so they are built once rather than
# once (or more) per grid point.
Pz = projection_matrix(Z)
Mz = orthogonal_projection_matrix(Z)

XPzX = X.T @ Pz @ X
beta_2SLS = XPzX ** (-1) * (X.T @ Pz @ y)
beta_2SLS_var = (N - 1) ** (-1) * (y - X * beta_2SLS).T @ (y - X * beta_2SLS) * XPzX ** (-1)

# These are beta_2SLS -/+ 1.96 standard errors, i.e. on the scale of beta
# rather than of the t-statistic.
# NOTE(review): confirm this is what the t-test plot is meant to show.
t_crit_val_lower = np.repeat(beta_2SLS - 1.96 * np.sqrt(beta_2SLS_var), len(beta))
t_crit_val_upper = np.repeat(beta_2SLS + 1.96 * np.sqrt(beta_2SLS_var), len(beta))

# The same chi-square draws are used for every beta (common random numbers).
# Drawing them once gives exactly the values the per-iteration re-seeding
# produced.
np.random.seed(1211)
vidle_1 = np.random.chisquare(df=1, size=([MC, 1]))
vidle_k = np.random.chisquare(df=k - 1, size=([MC, 1]))

for i, b in enumerate(beta):
    eps = y - X * b
    t_stat[i] = (beta_2SLS - b) / np.sqrt(beta_2SLS_var)
    AR_stat[i] = ((eps.T @ Pz @ eps) / k) / ((eps.T @ Mz @ eps) / (N - k))

    # One evaluation yields all three statistics needed below.
    k_AR_b, r_beta, LM_b = k_AR_r_LM_beta(y, X, b, Z)
    LM_stat[i] = LM_b

    LR = 0.5 * (vidle_k + vidle_1 - r_beta + np.sqrt((vidle_k + vidle_1 + r_beta) ** 2 - 4 * r_beta * vidle_k))
    # `method` replaces the deprecated `interpolation` keyword of np.percentile.
    LR_crit_val[i] = np.percentile(LR, q=95, method='higher')

    # Same expression as LR_beta, reusing the statistics computed above.
    LR_stat[i] = 0.5 * (k_AR_b - r_beta + np.sqrt((k_AR_b + r_beta) ** 2 - 4 * r_beta * (k_AR_b - LM_b)))


In [None]:
# LM statistic against its critical value, with a zero reference line.
for series, colour, legend_name in (
    (LM_crit_val, 'tab:blue', 'LM_crit_val'),
    (LM_stat, 'tab:orange', 'LM_stat'),
):
    plt.plot(beta, series, colour, label=legend_name)
zero_line = np.zeros(len(beta))
plt.plot(beta, zero_line, 'xkcd:black')

plt.title('Test statistic of LM-test')
plt.xlabel('Value of beta')
plt.ylabel('Statistic value')
plt.legend()
plt.savefig('5_e_LM.png', dpi=1000)
plt.show()

In [None]:
# t-statistic over the beta grid together with the two bound lines.
plt.plot(beta, t_stat, 'tab:orange', label='t_stat')
for bound in (t_crit_val_lower, t_crit_val_upper):
    plt.plot(beta, bound, 'tab:blue', label='t_crit_val')

# Zoom in on beta in [0.13, 0.17] and half a unit beyond the bounds.
y_low = t_crit_val_lower[0] - 0.5
y_high = t_crit_val_upper[0] + 0.5
plt.xlim((0.13, 0.17))
plt.ylim((y_low, y_high))

plt.title('Test statistic of t-test')
plt.xlabel('Value of beta')
plt.ylabel('Statistic value')
plt.legend()
plt.savefig('5_e_t.png', dpi=1000)
plt.show()

#### Subquestion g

In [None]:
# AR_stat was computed divided by k (its critical value above is chi2(k) / k),
# so it is multiplied by k here to put it on the chi-square scale of CV.
# The minimum is taken over the beta grid of the previous cell.
OIR = k * np.min(AR_stat)
# Degrees of freedom k - 1 (3 with the four instruments used here).
CV = stats.chi2.ppf(q=0.95, df=k - 1)

OIR < CV