In [1]:
import numpy as np
import pandas as pd
import numpy.linalg as lin
import scipy.stats as sts
import scipy.integrate as intgr
import scipy.optimize as opt
import matplotlib
import matplotlib.pyplot as plt

In [None]:
#Q1

In [None]:
#a

In [2]:
income = np.loadtxt('/Users/fujie/Desktop/incomes.txt')

In [3]:
%matplotlib notebook
num_bins = 30
count, bins, ignored = plt.hist(income, num_bins, normed=True,
                                edgecolor='k')
plt.title('Income: 2018-2020', fontsize=20)
plt.xlabel(r'Income')
plt.ylabel(r'Percent')
plt.xlim([0,150000])

<IPython.core.display.Javascript object>

(0, 150000)

In [None]:
#b

In [5]:
def trunc_lognorm_pdf(xvals, mu, sigma, cut_lb, cut_ub):

    if cut_ub == 'None' and cut_lb == 'None':
        prob_notcut = 1.0
    elif cut_ub == 'None' and cut_lb != 'None':
        prob_notcut = 1.0 - sts.lognorm.cdf(cut_lb, sigma, scale=np.exp(mu))
    elif cut_ub != 'None' and cut_lb == 'None':
        prob_notcut = sts.lognorm.cdf(cut_ub, sigma,scale=np.exp(mu))
    elif cut_ub != 'None' and cut_lb != 'None':
        prob_notcut = (sts.lognorm.cdf(cut_ub, sigma,scale=np.exp(mu)) -
                       sts.lognorm.cdf(cut_lb, sigma,scale=np.exp(mu)))
            
    pdf_vals    = ((1/(xvals*sigma * np.sqrt(2 * np.pi)) *
                    np.exp( - (np.log(xvals) - mu)**2 / (2 * sigma**2))) /
                    prob_notcut)
    
    return pdf_vals

In [6]:
def data_moments(xvals):

    mean_data = xvals.mean()
    std_data = xvals.std()
    
    return mean_data, std_data

In [7]:
def model_moments(mu, sigma, cut_lb, cut_ub):
   
    xfx = lambda x: x * trunc_lognorm_pdf(x, mu, sigma, cut_lb, cut_ub)
    (mean_model, m_m_err) = intgr.quad(xfx, cut_lb, cut_ub)
    x2fx = lambda x: ((x - mean_model) ** 2) * trunc_lognorm_pdf(x, mu, sigma, cut_lb, cut_ub) 
    (var_model, v_m_err) = intgr.quad(x2fx, cut_lb, cut_ub)
    
    return mean_model, np.sqrt(var_model)

In [8]:
def err_vec(xvals, mu, sigma, cut_lb, cut_ub, simple):

    mean_data, std_data = data_moments(xvals)
    moms_data = np.array([[mean_data], [std_data]])
    mean_model, std_model = model_moments(mu, sigma, cut_lb, cut_ub)
    moms_model = np.array([[mean_model], [std_model]])
    if simple:
        err_vec = moms_model - moms_data
    else:
        err_vec = (moms_model - moms_data) / moms_data
    
    return err_vec

In [9]:
def criterion(params, *args):
   
    mu, sigma = params
    xvals, cut_lb, cut_ub, W = args
    err = err_vec(xvals, mu, sigma, cut_lb, cut_ub, simple=False)
    crit_val = err.T @ W @ err
    
    return crit_val

In [10]:
mu_init = 11
sig_init = 0.5
params_init = np.array([mu_init, sig_init])
W_hat = np.eye(2)
gmm_args = (income, 0.0, 150000.0, W_hat)
results = opt.minimize(criterion, params_init, args=(gmm_args), tol=1e-14,
                       method='L-BFGS-B', bounds=((1e-10, None), (1e-10, None)))
mu_GMM1, sig_GMM1 = results.x
print('mu_GMM1=', mu_GMM1, ' sig_GMM1=', sig_GMM1)


mu_GMM1= 11.333599613401052  sig_GMM1= 0.2132552950928067


In [11]:
mean_data, std_data = data_moments(income)
mean_model, std_model = model_moments(mu_GMM1, sig_GMM1, 0.0, 150000.0)
err1 = err_vec(income, mu_GMM1, sig_GMM1, 0.0, 150000.0, False).reshape(2,)
print('Mean of points =', mean_data, ', Standard Deviation of points =', std_data)
print('Mean of model =', mean_model, ', Standard Deviation =', std_model)
print('Error vector=', err1)
print('GMM criterion function:',results.fun)

Mean of points = 85276.82360625811 , Standard Deviation of points = 17992.542128046523
Mean of model = 85276.82405854747 , Standard Deviation = 17992.541683009
Error vector= [ 5.3037782e-09 -2.4734555e-08]
GMM criterion function: [[6.39928276e-16]]


In [13]:
num_bins = 30
plt.hist(income, num_bins, normed=True, edgecolor='k')
plt.title('Annual Incomes of MACSS Graduates: 2018-2020', fontsize=15)
plt.xlabel('Annual Income')
plt.ylabel('Percent of Incomes')
plt.xlim([1, 150000])

dist_income = np.linspace(0, 150000, 1000)
plt.plot(dist_income, trunc_lognorm_pdf(dist_income, mu_GMM1, sig_GMM1, 0.0, 150000),
         linewidth=2, color='r', label='1: $\mu_{GMM1}$,$\sigma_{GMM1}$')
plt.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

  del sys.path[0]
  
  


In [None]:
#c

In [16]:
def get_Err_mat2(pts, mu, sigma, cut_lb, cut_ub, simple=False):
    
    R = 2
    N = len(pts)
    Err_mat = np.zeros((R, N))
    mean_model, std_model = model_moments(mu, sigma, cut_lb, cut_ub)
    if simple:
        Err_mat[0, :] = pts - mean_model
        Err_mat[1, :] = ((mean_data - pts) ** 2) - std_model
    else:
        Err_mat[0, :] = (pts - mean_model) / mean_model
        Err_mat[1, :] = (((mean_data - pts) ** 2) - std_model) / std_model
    
    return Err_mat

In [19]:
Err_mat =get_Err_mat2(income, mu_GMM1, sig_GMM1, 0.0, 150000.0, False)
VCV2 = (1 / income.shape[0]) * (Err_mat @ Err_mat.T)
print("VCV2:")
print(VCV2)
W_hat2 = lin.inv(VCV2)
print("W_hat2:")
print(W_hat2)

VCV2:
[[4.45167060e-02 1.68385288e+03]
 [1.68385288e+03 9.54184437e+08]]
W_hat2:
[[ 2.40701668e+01 -4.24767141e-05]
 [-4.24767141e-05  1.12297423e-09]]


In [28]:
gmm_args = (income, 0.0, 150000.0, W_hat2)
params_init = np.array([mu_GMM1, sig_GMM1])
results2 = opt.minimize(criterion, params_init, args=(gmm_args), tol=1e-14,
                       method='L-BFGS-B', bounds=((1e-10, None), (1e-10, None)))
mu_GMM2, sig_GMM2 = results2.x
print('mu_GMM2=', mu_GMM2, ' sig_GMM2=', sig_GMM2)

print("mu_GMM1= 11.333599613401052  sig_GMM1= 0.2132552950928067")

mean_model2, std_model2 = model_moments(mu_GMM2, sig_GMM2, 0.0, 150000.0)
err2 = err_vec(income, mu_GMM2, sig_GMM2, 0.0, 150000.0, False).reshape(2,)
print('Mean of points =', mean_data, ', Standard Deviation of points =', std_data)
print('Mean of model =', mean_model2, ', Standard Deviation of model =', std_model2)
print('Error vector=', err2)
print('GMM criterion function:',results2.fun)

mu_GMM2= 11.333599603618694  sig_GMM2= 0.21325529447620906
mu_GMM1= 11.333599613401052  sig_GMM1= 0.2132552950928067
Mean of points = 85276.82360625811 , Standard Deviation of points = 17992.542128046523
Mean of model = 85276.8232464362 , Standard Deviation of model = 17992.541506074136
Error vector= [-4.21945734e-09 -3.45683441e-08]
GMM criterion function: [[4.28528532e-16]]


In [27]:
num_bins = 30
plt.hist(income, num_bins, normed=True, edgecolor='k')
plt.title('Annual Incomes of MACSS Graduates: 2018-2020', fontsize=15)
plt.xlabel('Annual Income')
plt.ylabel('Percent of Incomes')
plt.xlim([1, 150000])

plt.plot(dist_income, trunc_lognorm_pdf(dist_income, mu_GMM1, sig_GMM1, 0.0, 150000),
         linewidth=2, color='r', label='1: $\mu_{GMM1}$,$\sigma_{GMM1}$')
plt.legend(loc='upper left')
plt.show()

plt.plot(dist_income, trunc_lognorm_pdf(dist_income, mu_GMM2, sig_GMM2, 0.0, 150000),
         linewidth=2, color='g', label='1: $\mu_{GMM2}$,$\sigma_{GMM2}$')
plt.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

  del sys.path[0]
  
  


In [None]:
#d

In [29]:
def data_moments3(xvals):
    
    bpct_1_dat = xvals[xvals <75000].shape[0] / xvals.shape[0]
    bpct_2_dat = (xvals[(xvals >=75000) & (xvals < 100000)].shape[0] /
                  xvals.shape[0])
    bpct_3_dat = xvals[xvals >= 100000].shape[0] / xvals.shape[0]

    return bpct_1_dat, bpct_2_dat, bpct_3_dat

In [30]:
def model_moments3(mu, sigma, cut_lb, cut_ub):
   
    xfx = lambda x: trunc_lognorm_pdf(x, mu, sigma, cut_lb, cut_ub)
    (bpct_1_mod, bp_1_err) = intgr.quad(xfx, 0.0, 75000)
    (bpct_2_mod, bp_2_err) = intgr.quad(xfx, 75000, 100000)
    (bpct_3_mod, bp_3_err) = intgr.quad(xfx, 100000, 150000)
    
    return bpct_1_mod, bpct_2_mod, bpct_3_mod

In [31]:
def err_vec3(xvals, mu, sigma, cut_lb, cut_ub, simple):

    bpct_1_dat, bpct_2_dat, bpct_3_dat = data_moments3(xvals)
    moms_data = np.array([[bpct_1_dat], [bpct_2_dat], [bpct_3_dat]])
    bpct_1_mod, bpct_2_mod, bpct_3_mod = model_moments3(mu, sigma, cut_lb, cut_ub)
    moms_model = np.array([[bpct_1_mod], [bpct_2_mod], [bpct_3_mod]])
    if simple:
        err_vec = moms_model - moms_data
    else:
        err_vec = (moms_model - moms_data) / moms_data
    
    return err_vec

In [32]:
def criterion3(params, *args):

    mu, sigma = params
    xvals, cut_lb, cut_ub, W = args
    err = err_vec3(xvals, mu, sigma, cut_lb, cut_ub, simple=False)
    crit_val = err.T @ W @ err 
    
    return crit_val

In [33]:
W_hat1_3 = np.eye(3)
gmm_args = (income, 0.0, 150000.0, W_hat1_3)

results_3 = opt.minimize(criterion3, params_init, args=(gmm_args),
                       method='L-BFGS-B', bounds=((1e-10, None), (1e-10, None)))
mu_GMM1_3, sig_GMM1_3 = results_3.x
print('mu_GMM1_3=', mu_GMM1_3, ' sig_GMM1_3=', sig_GMM1_3)
err3 = err_vec3(income, mu_GMM1_3, sig_GMM1_3, 0.0, 150000.0, False).reshape(3,)
bpct_1_dat, bpct_2_dat, bpct_3_dat=data_moments3(income)
pert1, pert2, pert3 = model_moments3(mu_GMM1_3, sig_GMM1_3, 0.0, 150000.0)

print('Error vector=', err3)
print('GMM criterion function:',results_3.fun)
data_model_moments=pd.DataFrame({'Data Moments':[bpct_1_dat, bpct_2_dat, bpct_3_dat],
                'Model Momets':[pert1, pert2, pert3]},
                index=['Percent less than 75000','Percent between 75000 and 100000','Percent more than 100000'])
data_model_moments

mu_GMM1_3= 11.336705179923449  sig_GMM1_3= 0.21151351610769975
Error vector= [ 1.13768317e-08  1.18261299e-08 -4.66305708e-08]
GMM criterion function: [[2.44369978e-15]]


Unnamed: 0,Data Moments,Model Momets
Percent less than 75000,0.3,0.3
Percent between 75000 and 100000,0.5,0.5
Percent more than 100000,0.2,0.2


In [35]:
num_bins = 30
plt.hist(income, num_bins, normed=True, edgecolor='k')
plt.title('Annual Incomes of MACSS Graduates: 2018-2020', fontsize=15)
plt.xlabel('Annual Income')
plt.ylabel('Income Percent')
plt.xlim([0, 150000])

plt.plot(dist_income, trunc_lognorm_pdf(dist_income, mu_GMM1_3, sig_GMM1_3, 0.0, 150000),
         linewidth=2, color='r', label='1: $\mu_{GMM3}$,$\sigma_{GMM3}$')
plt.legend(loc='upper left')

plt.show()

<IPython.core.display.Javascript object>

  del sys.path[0]
  
  


In [None]:
#e

In [37]:
def get_Err_mat3(pts, mu, sigma, cut_lb, cut_ub, simple=False):

    R = 3
    N = len(pts)
    Err_mat = np.zeros((R, N))
    pct_1_mod, pct_2_mod, pct_3_mod = \
        model_moments3(mu, sigma, cut_lb, cut_ub)
    if simple:
        pts_in_grp1 = pts < 75000
        Err_mat[0, :] = pts_in_grp1 - pct_1_mod
        pts_in_grp2 = (pts >= 75000) & (pts < 100000)
        Err_mat[1, :] = pts_in_grp2 - pct_2_mod
        pts_in_grp3 = pts >= 100000
        Err_mat[2, :] = pts_in_grp3 - pct_3_mod

    else:
        pts_in_grp1 = pts < 75000
        Err_mat[0, :] = (pts_in_grp1 - pct_1_mod) / pct_1_mod
        pts_in_grp2 = (pts >= 75000) & (pts < 100000)
        Err_mat[1, :] = (pts_in_grp2 - pct_2_mod) / pct_2_mod
        pts_in_grp3 = pts >= 100000
        Err_mat[2, :] = (pts_in_grp3 - pct_3_mod) / pct_3_mod
    
    return Err_mat

In [51]:
Err_mat32 = get_Err_mat3(income, mu_GMM1_3, sig_GMM1_3, 0.0, 150000.0, False)
VCV_32 = (1 / income.shape[0]) * (Err_mat32 @ Err_mat32.T)
print("VCV:")
print(VCV_32)
W_hat_32 = lin.pinv(VCV_32)
print("W_hat")
print(W_hat_32)


VCV:
[[ 2.33333328 -0.99999998 -1.00000004]
 [-0.99999998  0.99999998 -1.00000003]
 [-1.00000004 -1.00000003  4.00000037]]
W_hat
[[ 0.25761773 -0.14958449 -0.01246537]
 [-0.14958449  0.11911357 -0.07340719]
 [-0.01246537 -0.07340719  0.20221605]]


In [39]:
gmm_args = (income, 0.0, 150000.0, W_hat_32)
results_4 = opt.minimize(criterion3, params_init, args=(gmm_args),method='L-BFGS-B')
mu_GMM1_3w, sig_GMM1_3w = results_4.x
print('mu_GMM1_3w=', mu_GMM1_3w, ' sig_GMM1_3w=', sig_GMM1_3w)
err3w = err_vec3(income, mu_GMM1_3w, sig_GMM1_3w, 0.0, 150000.0, False).reshape(3,)

pert1, pert2, pert3 = model_moments3(mu_GMM1_3w, sig_GMM1_3w, 0.0, 150000.0)

print('Error vector:', err3w)
print('GMM criterion function value:',results_4.fun)
Data_Model_Moment2=pd.DataFrame({'Data Moments':[bpct_1_dat, bpct_2_dat, bpct_3_dat],
                'Model Momets':[pert1, pert2, pert3]},
                index=['Percent less than 75000','Percent between 75000 and 100000','Percent more than 100000'])
Data_Model_Moment2

  the requested tolerance from being achieved.  The error may be 
  underestimated.
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


mu_GMM1_3w= 11.333599613401052  sig_GMM1_3w= 0.2132552950928067
Error vector: [ 0.02208416 -0.00875996 -0.01122634]
GMM criterion function value: [[0.00020989]]


Unnamed: 0,Data Moments,Model Momets
Percent less than 75000,0.3,0.306625
Percent between 75000 and 100000,0.5,0.49562
Percent more than 100000,0.2,0.197755


In [42]:
num_bins = 30
plt.hist(income, num_bins, normed=True, edgecolor='k')
plt.title('Annual Incomes of MACSS Graduates: 2018-2020', fontsize=15)
plt.xlabel('Annual Income')
plt.ylabel('Income Percent')
plt.xlim([1, 150000])

plt.plot(dist_income, trunc_lognorm_pdf(dist_income, mu_GMM1_3w, sig_GMM1_3w, 0.0, 150000),
         linewidth=2, color='r', label='1: $\mu_{GMM3w}$,$\sigma_{GMM3w}$')
plt.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

  del sys.path[0]
  
  


In [None]:
#f

In [48]:
mu=[mu_GMM1,mu_GMM2,mu_GMM1_3,mu_GMM1_3w]
sig=[sig_GMM1,sig_GMM2,sig_GMM1_3,sig_GMM1_3w]
GMM_Value=[results.fun,results2.fun,results_3.fun,results_4.fun]

In [49]:
pd.DataFrame({"mu":mu,"sigma":sig,"GMM Value":GMM_Value}
            ,index=['2 moments Identity Matrix','2 moments Weighted Matrix',
                   '3 moments Identity Matrix','3 moments Weighted Matrix']).astype(float)

Unnamed: 0,GMM Value,mu,sigma
2 moments Identity Matrix,6.399283e-16,11.3336,0.213255
2 moments Weighted Matrix,4.285285e-16,11.3336,0.213255
3 moments Identity Matrix,2.4437e-15,11.336705,0.211514
3 moments Weighted Matrix,0.0002098877,11.3336,0.213255


In [None]:
#The second model fits the data best because it has the smallest GMM value

In [None]:
#Q2

In [68]:
import pandas as pd
sick = pd.read_csv('/Users/fujie/Desktop/sick.txt').astype('float64')

In [72]:
def err_vec_s(df,b0,b1,b2,b3):
    
    y_h=b0+b1*sick['age']+b2*sick['children']+b3*sick['avgtemp_winter']

    err_vec = y_h - sick['sick']
    
    err_vec = np.array(err_vec)
    
    return err_vec

In [73]:
def criterion_s(params, *args):

    b0,b1,b2,b3 = params
    sick, W = args
    err = err_vec_s(sick,b0,b1,b2,b3)
    crit_val = err.T @ W @ err
    
    return crit_val


In [77]:
params_init = np.array([1, 0, 0, 0])
W_hat = np.eye(sick.shape[0])
gmm_args = (sick, W_hat)
results_s = opt.minimize(criterion_s, params_init, args=(gmm_args), tol=1e-14,method='L-BFGS-B')
b0, b1, b2, b3 = results_s.x

In [78]:
print("GMM Estimate of b0",b0)
print("GMM Estimate of b1",b1)
print("GMM Estimate of b2",b2)
print("GMM Estimate of b3",b3)
print('GMM criterion function:',results_s.fun)

GMM Estimate of b0 0.2516448636180223
GMM Estimate of b1 0.012933469667256711
GMM Estimate of b2 0.40050098511048643
GMM Estimate of b3 -0.009991708483433188
GMM criterion function: 0.00182128980560192
