Added hessian matrix approximation tothe estimation process / Added s…

…es to estimation output file / changed replication framework so that it provides the confidence intervals of the mte
OpenSourceEconomics · Aug 10, 2018 · 259d54b · 259d54b
1 parent 28b0664
commit 259d54b
Show file tree

Hide file tree

Showing 6 changed files with 142 additions and 21 deletions.
diff --git a/development/tests/robustness/mte_original.json b/development/tests/robustness/mte_original.json
@@ -1 +1 @@
-[0.2880329, 0.2048867, 0.1886118, 0.1782859, 0.1705181, 0.1641996, 0.1588216, 0.1541061, 0.1498839, 0.146044, 0.1425094, 0.1392244, 0.1361472, 0.1332459, 0.1304953, 0.1278752, 0.1253691, 0.1229635, 0.1206471, 0.1184101, 0.1162444, 0.1141428, 0.1120993, 0.1101084, 0.1081655, 0.1062661, 0.1044067, 0.1025839, 0.1007945, 0.0990358, 0.0973054, 0.0956009, 0.0939201, 0.0922613, 0.0906224, 0.0890019, 0.0873982, 0.0858098, 0.0842353, 0.0826733, 0.0811227, 0.0795823, 0.0780508, 0.0765272, 0.0750105, 0.0734995, 0.0719934, 0.070491, 0.0689915, 0.0674939, 0.0659972, 0.0645005, 0.0630029, 0.0615034, 0.060001, 0.0584949, 0.0569839, 0.0554672, 0.0539436, 0.0524121, 0.0508717, 0.0493211, 0.0477591, 0.0461846, 0.0445962, 0.0429925, 0.041372, 0.0397332, 0.0380743, 0.0363936, 0.034689, 0.0329586, 0.0311999, 0.0294106, 0.0275877, 0.0257283, 0.023829, 0.021886, 0.0198951, 0.0178516, 0.0157501, 0.0135843, 0.0113473, 0.0090309, 0.0066253, 0.0041192, 0.0014991, -0.0012515, -0.0041528, -0.00723, -0.010515, -0.0140496, -0.0178895, -0.0221117, -0.0268271, -0.0322052, -0.0385236, -0.0462915, -0.0566174, -0.0728923, -0.156036]
+[[0.1333301, 0.1038857, 0.0977615, 0.0937644, 0.0906855, 0.0881264, 0.0859035, 0.0839161, 0.0821023, 0.0804216, 0.0788455, 0.0773535, 0.0759299, 0.0745627, 0.0732424, 0.0719611, 0.0707125, 0.0694913, 0.0682927, 0.067113, 0.0659487, 0.0647967, 0.0636544, 0.0625193, 0.0613892, 0.0602621, 0.0591361, 0.0580095, 0.0568806, 0.055748, 0.0546101, 0.0534656, 0.0523132, 0.0511516, 0.0499796, 0.0487959, 0.0475995, 0.0463892, 0.0451639, 0.0439227, 0.0426643, 0.0413879, 0.0400923, 0.0387767, 0.03744, 0.0360814, 0.0346997, 0.0332942, 0.0318638, 0.0304076, 0.0289248, 0.0274143, 0.0258752, 0.0243065, 0.0227074, 0.0210767, 0.0194135, 0.0177166, 0.0159851, 0.0142177, 0.0124132, 0.0105704, 0.0086878, 0.006764, 0.0047975, 0.0027865, 0.0007291, -0.0013765, -0.0035326, -0.0057417, -0.0080062, -0.0103292, -0.0127139, -0.0151638, -0.0176829, -0.0202757, -0.0229473, -0.0257031, -0.0285498, -0.0314945, -0.0345456, -0.0377127, -0.041007, -0.0444414, -0.0480313, -0.0517949, -0.0557538, -0.0599347, -0.0643701, -0.0691008, -0.0741789, -0.0796721, -0.0856711, -0.0923016, -0.0997451, -0.1082783, -0.1183562, -0.1308129, -0.1474677, -0.1738934, -0.3107372], [0.2880329, 0.2048867, 0.1886118, 0.1782859, 0.1705181, 0.1641996, 0.1588216, 0.1541061, 0.1498839, 0.146044, 0.1425094, 0.1392244, 0.1361472, 0.1332459, 0.1304953, 0.1278752, 0.1253691, 0.1229635, 0.1206471, 0.1184101, 0.1162444, 0.1141428, 0.1120993, 0.1101084, 0.1081655, 0.1062661, 0.1044067, 0.1025839, 0.1007945, 0.0990358, 0.0973054, 0.0956009, 0.0939201, 0.0922613, 0.0906224, 0.0890019, 0.0873982, 0.0858098, 0.0842353, 0.0826733, 0.0811227, 0.0795823, 0.0780508, 0.0765272, 0.0750105, 0.0734995, 0.0719934, 0.070491, 0.0689915, 0.0674939, 0.0659972, 0.0645005, 0.0630029, 0.0615034, 0.060001, 0.0584949, 0.0569839, 0.0554672, 0.0539436, 0.0524121, 0.0508717, 0.0493211, 0.0477591, 0.0461846, 0.0445962, 0.0429925, 0.041372, 0.0397332, 0.0380743, 0.0363936, 0.034689, 0.0329586, 0.0311999, 0.0294106, 0.0275877, 0.0257283, 0.023829, 0.021886, 0.0198951, 0.0178516, 0.0157501, 0.0135843, 0.0113473, 0.0090309, 0.0066253, 0.0041192, 0.0014991, -0.0012515, -0.0041528, -0.00723, -0.010515, -0.0140496, -0.0178895, -0.0221117, -0.0268271, -0.0322052, -0.0385236, -0.0462915, -0.0566174, -0.0728923, -0.156036], [0.4427358, 0.3058877, 0.279462, 0.2628073, 0.2503507, 0.2402727, 0.2317396, 0.224296, 0.2176655, 0.2116665, 0.2061733, 0.2010953, 0.1963645, 0.1919291, 0.1877482, 0.1837893, 0.1800257, 0.1764358, 0.1730014, 0.1697071, 0.16654, 0.1634889, 0.1605442, 0.1576976, 0.1549417, 0.1522702, 0.1496774, 0.1471582, 0.1447083, 0.1423237, 0.1400007, 0.1377361, 0.1355271, 0.1333709, 0.1312653, 0.129208, 0.1271969, 0.1252304, 0.1233066, 0.121424, 0.1195812, 0.1177767, 0.1160093, 0.1142778, 0.112581, 0.1109177, 0.109287, 0.1076879, 0.1061193, 0.1045802, 0.1030697, 0.1015868, 0.1001306, 0.0987002, 0.0972947, 0.0959131, 0.0945544, 0.0932177, 0.0919021, 0.0906066, 0.0893301, 0.0880718, 0.0868305, 0.0856052, 0.0843949, 0.0831985, 0.0820149, 0.0808428, 0.0796812, 0.0785288, 0.0773843, 0.0762464, 0.0751138, 0.0739849, 0.0728583, 0.0717323, 0.0706052, 0.0694751, 0.06834, 0.0671977, 0.0660457, 0.0648814, 0.0637017, 0.0625032, 0.0612819, 0.0600333, 0.058752, 0.0574317, 0.0560645, 0.0546409, 0.0531489, 0.0515728, 0.0498921, 0.0480783, 0.0460909, 0.043868, 0.041309, 0.0382299, 0.0342329, 0.0281087, -0.0013348]]
diff --git a/development/tests/robustness/replication.grmpy.ini b/development/tests/robustness/replication.grmpy.ini
@@ -10,7 +10,7 @@
         start                                        auto
         agents                                       1000
         optimizer                              SCIPY-BFGS
-        maxiter                                     40000
+        maxiter                                     80000
         dependent                                    wage
         indicator                                   state
 
@@ -118,7 +118,7 @@
    SCIPY-BFGS
 
         gtol                    0.00001
-        eps                     1.4901161193847656e-07
+        eps                     1.4901161193847656e-09
 
    SCIPY-POWELL
 

diff --git a/development/tests/robustness/run.py b/development/tests/robustness/run.py
@@ -2,12 +2,10 @@
 Additionally it returns a figure of the Marginal treatment effect based on the estimation results.
 """
 import matplotlib.pyplot as plt
-from os.path import join
-from shutil import move
+from scipy.stats import norm
 import pandas as pd
 import numpy as np
 import json
-import os
 
 from grmpy.estimate.estimate_auxiliary import calculate_mte
 from grmpy.estimate.estimate import estimate
@@ -19,39 +17,81 @@ def plot_est_mte(rslt, data_frame):
 
     # Define the Quantiles and read in the original results
     quantiles = [0.0001] + np.arange(0.01, 1., 0.01).tolist() + [0.9999]
-    mte_original = json.load(open('mte_original.json', 'r'))
+    mte_ = json.load(open('mte_original.json', 'r'))
+    mte_original = mte_[1]
+    mte_original_d = mte_[0]
+    mte_original_u = mte_[2]
 
-    # Calculate the MTE
+    # Calculate the MTE and confidence intervals
     mte = calculate_mte(rslt, data_frame, quantiles)
     mte = [i / 4 for i in mte]
-
+    mte_up, mte_d = calculate_cof_int(rslt, data_frame, mte, quantiles)
     # Plot both curves
     ax = plt.figure().add_subplot(111)
 
     ax.set_ylabel(r"$B^{MTE}$")
     ax.set_xlabel("$u_S$")
-    ax.plot(quantiles, mte, label='grmpy MTE')
-    ax.plot(quantiles, mte_original, label='original MTE')
+    ax.plot(quantiles, mte, label='grmpy MTE', color='blue')
+    ax.plot(quantiles, mte_up, color='blue', linestyle=':')
+    ax.plot(quantiles, mte_d, color='blue', linestyle=':')
+    ax.plot(quantiles, mte_original, label='original MTE', color='red')
+    ax.plot(quantiles, mte_original_d, color='red', linestyle=':')
+    ax.plot(quantiles, mte_original_u, color='red', linestyle=':')
+
     ax.set_ylim([-0.4, 0.5])
 
     plt.legend()
 
     plt.tight_layout()
     plt.savefig('fig-marginal-benefit-parametric-replication.png')
+    return mte
+
+
+def calculate_cof_int(rslt, data_frame, mte, quantiles):
+    """This function calculates the confidence interval of the marginal treatment effect."""
+
+    # Import parameters and inverse hessian matrix
+    hess_inv = rslt['AUX']['hess_inv'] / data_frame.shape[0]
+    params = rslt['AUX']['x_internal']
+
+    # Distribute parameters
+    dist_cov = hess_inv[-4:, -4:]
+    param_cov = hess_inv[:46, :46]
+    dist_gradients = np.array([params[-4], params[-6], params[-2], params[-3]])
+
+    # Process data
+    covariates = [rslt['varnames'][j - 1] for j in rslt['TREATED']['order']]
+    x = np.mean(data_frame[covariates]).tolist()
+    x_neg = [-i for i in x]
+    x += x_neg
+    x = np.array(x)
+
+    # Create auxiliary parameters
+    part1 = np.dot(x, np.dot(param_cov, x))
+    part2 = np.dot(dist_gradients, np.dot(dist_cov, dist_gradients))
+
+    # Prepare two lists for storing the values
+    mte_up = []
+    mte_d = []
+
+    # Combine all auxiliary parameters and calculate the confidence intervals
+    for counter, i in enumerate(quantiles):
+        value = part2 * (norm.ppf(i)) ** 2
+        aux = np.sqrt(part1 + value) / 4
+        mte_up += [mte[counter] + norm.ppf(0.95) * aux]
+        mte_d += [mte[counter] - norm.ppf(0.95) * aux]
+    print(mte_up, mte_d)
+
+    return mte_up, mte_d
 
 
 if __name__ == '__main__':
 
-    directory = os.path.dirname(os.path.realpath(__file__))
-    target = os.path.split(os.path.split(os.path.split(directory)[0])[0])[0] + '/docs/figures'
     filename = 'fig-marginal-benefit-parametric-replication.png'
 
     # Estimate the coefficients
     rslt = estimate('replication.grmpy.ini')
 
     # Calculate and plot the marginal treatment effect
     data = pd.read_pickle('aer-replication-mock.pkl')
-    plot_est_mte(rslt, data)
-
-    # Move the plot to the documentation directory
-    move(join(directory, filename), join(target, filename))
+    mte = plot_est_mte(rslt, data)
diff --git a/docs/figures/fig-marginal-benefit-parametric-replication.png b/docs/figures/fig-marginal-benefit-parametric-replication.png
diff --git a/grmpy/estimate/estimate.py b/grmpy/estimate/estimate.py
@@ -54,8 +54,9 @@ def estimate(init_file):
         opt_rslt = minimize(
             minimizing_interface, x0, args=(dict_, data, rslt_dict), method=method, options=opts)
         rslt = adjust_output(opt_rslt, dict_, opt_rslt['x'], rslt_dict)
+
     # Print Output files
-    print_logfile(dict_, rslt)
+    print_logfile(dict_, rslt, data)
     write_comparison(dict_, data, rslt)
 
     return rslt
diff --git a/grmpy/estimate/estimate_auxiliary.py b/grmpy/estimate/estimate_auxiliary.py
@@ -1,9 +1,11 @@
 """The module provides auxiliary functions for the estimation process"""
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
+from statsmodels.tools.numdiff import approx_hess3
 from scipy.stats import norm
 import statsmodels.api as sm
 import pandas as pd
 import numpy as np
+import math
 
 from grmpy.simulate.simulate_auxiliary import construct_covariance_matrix
 from grmpy.simulate.simulate_auxiliary import simulate_unobservables
@@ -184,10 +186,13 @@ def calculate_criteria(init_dict, data_frame, start_values):
     return criteria
 
 
-def print_logfile(init_dict, rslt):
+def print_logfile(init_dict, rslt, data):
     """The function writes the log file for the estimation process."""
     # Adjust output
     init_dict, rslt = adjust_print_output(init_dict, rslt)
+    rslt = calculate_se(rslt, init_dict, data)
+    auxiliary_list = process_se_log(rslt, init_dict)
+    print(auxiliary_list)
     with open('est.grmpy.info', 'w') as file_:
 
         for label in ['Optimization Information', 'Criterion Function', 'Economic Parameters']:
@@ -234,11 +239,11 @@ def print_logfile(init_dict, rslt):
 
             else:
                 file_.write(fmt.format(*['', 'Identifier', 'Start', 'Finish']) + '\n\n')
-                fmt = '  {:>10}' * 2 + ' {:>20.4f}' * 2
+                fmt = '  {:>10}' * 2 + ' {:>20.4f}' * 2 + '{:>10}'
                 for i in range(len(rslt['AUX']['x_internal'])):
                     file_.write('{0}\n'.format(
                         fmt.format('', str(i), init_dict['AUX']['starting_values'][i],
-                                   rslt['AUX']['x_internal'][i])))
+                                   rslt['AUX']['x_internal'][i], auxiliary_list[i])))
 
 
 def optimizer_options(init_dict_):
@@ -604,3 +609,78 @@ def calculate_mte(rslt, data_frame, quant=None):
         return value, args
     else:
         return value
+
+
+def calculate_se(rslt, init_dict, data_frame):
+    """This function calculates the standard errors for given parameterization via an approximation
+    of the hessian matrix."""
+
+    x0 = process_estimation_results(rslt['AUX']['x_internal'].tolist())
+    hess = approx_hess3(x0, loglikelihood_without_cholsky, args=(init_dict, data_frame),
+                        epsilon=0.000001)
+    hess_inv = np.linalg.inv(hess)
+    se = np.sqrt(np.diag(hess_inv) / data_frame.shape[0])
+    rslt['AUX']['standard_errors'] = se
+    rslt['AUX']['hess_inv'] = hess_inv
+    return rslt
+
+
+def process_estimation_results(x0):
+    """This """
+    pseudo_x0 = x0[:-6]
+    pseudo_dist = [x0[-1]] + [x0[-6]] + x0[-4:-1]
+    x0 = pseudo_x0 + pseudo_dist
+    return x0
+
+
+def loglikelihood_without_cholsky(x, init_dict, data_frame):
+    """The function provides the log-likelihood function for the calculation of the standard errors.
+    """
+    n_treated = init_dict['AUX']['num_covars_treated']
+    n_untreated = n_treated + int(init_dict['AUX']['num_covars_untreated'])
+
+    beta1 = x[:n_treated]
+    beta0 = x[n_treated:n_untreated]
+    gamma = x[n_untreated:-5]
+    sd1, sd0, sdv, rho1v, rho0v = x[-4], x[-2], x[-5], x[-3], x[-1]
+    likl = []
+    indicator = init_dict['ESTIMATION']['indicator']
+    dep = init_dict['ESTIMATION']['dependent']
+    for i in [0.0, 1.0]:
+        if i == 1.0:
+            beta, gamma, rho, sd, sdv = beta1, gamma, rho1v, sd1, sdv
+            key_ = 'TREATED'
+        else:
+            beta, gamma, rho, sd, sdv = beta0, gamma, rho0v, sd0, sdv
+            key_ = 'UNTREATED'
+        data = data_frame[data_frame[indicator] == i]
+        Z = data[[init_dict['varnames'][j - 1] for j in init_dict['CHOICE']['order']]]
+        X = data[[init_dict['varnames'][j - 1] for j in init_dict[key_]['order']]]
+
+        choice_ = pd.DataFrame.sum(gamma * Z, axis=1)
+        part1 = (data[dep] - pd.DataFrame.sum(beta * X, axis=1)) / sd
+        part2 = (choice_ - rho * sdv * part1) / (np.sqrt((1 - rho ** 2) * sdv ** 2))
+        dist_1, dist_2 = norm.pdf(part1), norm.cdf(part2)
+
+        if i == 1.0:
+            contrib = (1.0 / sd) * dist_1 * dist_2
+
+        else:
+            contrib = (1.0 / sd) * dist_1 * (1.0 - dist_2)
+
+        likl.append(contrib)
+    likl = np.append(likl[0], likl[1])
+    likl = - np.mean(np.log(np.clip(likl, 1e-20, np.inf)))
+
+    return likl
+
+
+def process_se_log(rslt, init_dict):
+    """This function processes the standard error values for the log file."""
+    se = ['------' if math.isnan(i) else str(i) for i in np.round(rslt['AUX']['standard_errors'], 4)
+          ]
+    aux = [se[-4], '------', se[-3], se[-2], se[-1], se[-5]]
+    list_ = se[:-5]
+    list_ += aux
+    list_ = ['({})'.format(i) if len(i) == 6 else '({}0)'.format(i) for i in list_]
+    return list_