In [None]:
%load_ext autoreload
%autoreload 2

from simulation_code import *

# Notebook-wide configuration.
# Allow pandas to show up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500
# Seed NumPy's legacy global RNG; the np.random.* draws in later cells consume this stream.
np.random.seed(seed=42)


# basic illustration of how simulated data works

In [None]:
# Simulate a single large dataset from hand-picked coefficients, fit the
# estimators on it, and compare them against the simulated ground truth.
params = dict(
    beta_XT=[0, 1, 0.4],
    beta_XD1=[-0.01, -2, -0.01],
    beta_D0=None,
    intercept_D0=-0.1,
    N=100000,
)

ground_truth = generate_simulated_data_without_using_Z(**params)
estimators = fit_estimators(
    X=ground_truth['X'],
    T=ground_truth['T'],
    D=ground_truth['D'],
)

# Per-estimator mean and standard deviation.
print(estimators.agg(['mean', 'std']))

# Attach the ground-truth column so it shows up in the correlation matrix
# (the cell's displayed output is the last expression below).
estimators['ground truth p(D=1|X)'] = ground_truth['ground truth p(D=1|X)']
estimators.corr(method='pearson')
        
    

# demonstrate that, across random draws of coefficients and large data, p(D=1|T=1, X) generally outperforms p(T=1|X)

In [None]:
# Run many trials, each with freshly drawn random parameters.
M = 3  # dimensionality of each coefficient vector (M-dimensional data)
N_RANDOM_TRIALS = 5000

trial_records = []
for trial_index in tqdm(range(N_RANDOM_TRIALS)):
    # The draws happen in a fixed order (beta_XT, beta_XD1, intercept_D0, beta_D0)
    # so the global NumPy random stream is consumed the same way on every run.
    params = {}
    params["beta_XT"] = np.random.randn(M,)
    params["beta_XD1"] = np.random.randn(M,)
    params["intercept_D0"] = np.random.random() * -0.1  # value in (-0.1, 0]
    params["beta_D0"] = np.random.randn(M,)
    params['N'] = 10000
    params['make_plot'] = False
    trial_outcome = fit_param_set_and_print_results(params, verbose=False)
    trial_records.append({'params': params, 'results': trial_outcome})

# Compiled dataframe of trials: each row is one trial.
all_results = pd.DataFrame(trial_records)
print("Across all trials, comparing estimator p_hat_D to p_hat_T")

# Derive one summary column per metric from each trial's results.
metric_columns = [
    ('difference_in_correlations', difference_in_correlations),
    ('D_better_correlated', D_better_correlated),
]
for column_name, metric_fxn in metric_columns:
    all_results[column_name] = all_results['results'].map(metric_fxn)
print(all_results[['difference_in_correlations', 'D_better_correlated']].mean())

In [None]:

# Overlaid histograms, across all simulated trials, of how strongly each
# estimator correlates (Pearson) with the ground-truth p(D=1|X).
# Each entry of all_results['results'] is indexed as
# result['pearson_corr']['p(D=1|X)'][estimator_key].
correlation_bins = np.linspace(-1, 1, 20)  # shared bins keep the two histograms comparable

# (estimator key in the results, legend label, bar opacity).
# Labels are raw strings: '\h' is not a valid Python escape sequence, so a
# non-raw '$\hat ...$' literal triggers an invalid-escape warning.
histogram_specs = [
    ('p_hat(T=1|X)', r'$\hat p_T$', 0.8),
    ('p_hat(D=1|T=1, X)', r'$\hat p_{Y_{T=1}}$', 0.5),
]

fig, ax = plt.subplots(figsize=(4, 4))
for estimator_key, legend_label, opacity in histogram_specs:
    # Bind estimator_key as a default argument so each lambda uses its own key.
    correlations = all_results['results'].map(
        lambda result, key=estimator_key: result['pearson_corr']['p(D=1|X)'][key])
    ax.hist(correlations, bins=correlation_bins, alpha=opacity, label=legend_label)

ax.legend(fontsize=14)
ax.set_xlim([-1, 1])
ax.set_xlabel("Correlation with $p_Y$", fontsize=14)
ax.set_ylabel("Number of simulations", fontsize=14)
fig.tight_layout()
fig.savefig('p_D_better_correlated_across_simulations.pdf')


# Exceptions to p(D=1|T=1, X) being better. (These are just single examples - we do the more systematic simulation further down.)

1. Very little variance in p(D=1|T=1, X) (relative to the variance in alpha * p(T=1|X)).
2. Poor estimation of p(D=1|T=1, X) (e.g., due to a small dataset).



### 1. very little variance in p(D=1|T=1, X) relative to p(T=1|X) * alpha. 

We verified that, for large datasets, the crossing point occurs where we expect it to. We make the dataset large so that p(D=1|T=1, X) is well-estimated.

In [None]:
# Single example: large dataset with small beta_XD1 coefficients.
params = dict(
    beta_XT=[0, 1, 2],
    beta_XD1=[0, 0.1, -0.1],  # small coefficients so little variance
    beta_D0=None,
    intercept_D0=-0.3,
    N=100000,
)

results = fit_param_set_and_print_results(params)

### 2. Very small dataset for estimating p(D=1|T=1, X). This produces inconsistent correlations for p(D=1|T=1, X) - sometimes they happen to be good, but often they're not. 


In [None]:
# Single example: a small dataset (N=500) with 11 coefficients per vector
# (one fixed leading value plus 10 random draws).
# beta_XT's noise is drawn before beta_XD1's, so the global random stream is
# consumed in a fixed order.
beta_XT_noise = 0.5 * np.random.randn(10,)
beta_XD1_noise = 0.1 * np.random.randn(10,)

params = {
    'beta_XT': [-2, *beta_XT_noise],
    'beta_XD1': [1, *beta_XD1_noise],
    'beta_D0': None,
    'intercept_D0': -0.5,
    'N': 500,
}

results = fit_param_set_and_print_results(params)

# run systematic simulation of both phenomena. 

# what about non-constant unobservables

### 1. Vary the variance in p(D=1|T=1, X) by varying beta_XD1 (proportional unobservables). 

In [None]:
# Systematic simulation: sweep beta_XD1 while holding every other parameter at
# its default, running N_TRIALS_PER_SETTING trials per value.
N_TRIALS_PER_SETTING = 50  # also read by the later simulation cells

default_param_set = {
    'beta_XT': [0, 1, 2],
    'beta_D0': [1, 0, 0],
    'intercept_D0': 0,
    'N': 10000,
    'make_plot': False,
}

# 20 log-spaced magnitudes from 10**-2 to 10**0; each setting is [0, b, -b].
beta_XD1_settings = [[0, beta_i, -beta_i] for beta_i in 10 ** np.linspace(-2, 0, 20)]

# LaTeX axis labels are raw strings: '\s' is not a valid Python escape
# sequence, so non-raw literals containing '\sigma' trigger an invalid-escape warning.
x_axis_labels = [
    r'$\sigma_{p_{Y_{T=1}}}/\sigma_{p_T}$',
    r'$\sigma_{p_{Y_{T=1}}}/\sigma_{p_{T0}*u(X)}$',
]

simulation_results = run_simulations_varying_parameter(
    default_param_set=default_param_set,
    param_to_vary='beta_XD1',
    param_vals=beta_XD1_settings,
    n_trials_per_setting=N_TRIALS_PER_SETTING,
    x_axis_names=x_axis_labels,
    x_axis_fxns=[sd_p_d_over_sd_p_t, sd_p_d_over_sd_p_t0_times_u],
    x_crossover_points=None,  # [1, 0.2]
    y_axis_names=[CORRELATION_DIFF_LATEX_NAME, 'Pr(D wins)'],
    y_axis_fxns=[difference_in_correlations, D_better_correlated],
    quantities_to_print_but_not_plot_names=[
        'Corr(D_estimate, D_ground_truth)',
        'Corr(T_estimate, T_ground_truth)',
        'p_D_T0_out_of_range_frac',
    ],
    quantities_to_print_but_not_plot_fxns=[
        corr_p_d_estimate_with_p_d_ground_truth,
        corr_p_t_estimate_with_p_t_ground_truth,
        p_D_T0_out_of_range_frac,
    ],
    plot_filename_string='simulation_vary_variance_in_p_d.pdf',
)
    

### 2. vary accuracy of estimating p(D=1|X) by altering sample size (proportional unobservables). 

In [None]:
# Systematic simulation: sweep the sample size N while holding the
# coefficients fixed, running N_TRIALS_PER_SETTING trials per value.
#
# One issue with this default_param_set is that, because we randomly generate
# 25 coefficients, results can vary a bit from run to run. We set a seed to
# deal with this. An alternative would be to set the coefficients by hand, but
# that feels somewhat hacky. The reason we use 25 coefficients is to make it
# harder to estimate p(D=1|X). The negative leading coefficient (-2) on
# beta_XT means that the mean of p(T) is low.
np.random.seed(43)

# beta_XT's noise is drawn before beta_XD1's so the seeded stream is consumed
# in a fixed order.
beta_XT_noise = 0.5 * np.random.randn(25,)
beta_XD1_noise = 0.1 * np.random.randn(25,)

default_param_set = {
    'beta_XT': [-2] + list(beta_XT_noise),
    'beta_XD1': [1] + list(beta_XD1_noise),
    'beta_D0': [1] + [0 for i in range(25)],
    'intercept_D0': 0,
    'make_plot': False,
}

simulation_results = run_simulations_varying_parameter(
    default_param_set=default_param_set,
    param_to_vary='N',
    param_vals=[500, 600, 700, 800, 900, 1000, 1500, 2000, 3000, 5000, 10000],
    n_trials_per_setting=N_TRIALS_PER_SETTING,
    # Raw string: '\h' is not a valid Python escape sequence, so a non-raw
    # literal containing '\hat' triggers an invalid-escape warning.
    x_axis_names=[r'$corr(\hat p_{Y_{T=1}}, p_{Y_{T=1}})$'],
    x_axis_fxns=[corr_p_d_estimate_with_p_d_ground_truth],
    x_crossover_points=None,
    y_axis_names=[CORRELATION_DIFF_LATEX_NAME, 'Pr(D wins)'],
    y_axis_fxns=[difference_in_correlations, D_better_correlated],
    quantities_to_print_but_not_plot_names=[
        'sd_p_D/sd_p_T',
        'sd_p_D/sd(p(T=0)*u)',
        'Corr(T_estimate, T_ground_truth)',
        'p_D_T0_out_of_range_frac',
    ],
    quantities_to_print_but_not_plot_fxns=[
        sd_p_d_over_sd_p_t,
        sd_p_d_over_sd_p_t0_times_u,
        corr_p_t_estimate_with_p_t_ground_truth,
        p_D_T0_out_of_range_frac,
    ],
    plot_filename_string='simulation_vary_estimation_quality_of_p_d.pdf',
)

# Repeat analyses above using constant unobservables. These do not make it into the final paper.

### 1. Vary the variance in p(D=1|T=1, X) by varying beta_XD1 (constant unobservables). 

In [None]:
# Systematic simulation: sweep beta_XD1 with beta_D0=None and a fixed
# intercept_D0, running N_TRIALS_PER_SETTING trials per value.
default_param_set = {
    'beta_XT': [0, 1, 2],
    'beta_D0': None,
    'intercept_D0': -0.2,  # constant unobservables.
    'N': 10000,
    'make_plot': False,
}

# 20 log-spaced magnitudes from 10**-1 to 10**0; each setting is [0, b, -b].
beta_XD1_settings = [[0, beta_i, -beta_i] for beta_i in 10 ** np.linspace(-1, 0, 20)]

# LaTeX axis labels are raw strings: '\s' is not a valid Python escape
# sequence, so non-raw literals containing '\sigma' trigger an invalid-escape warning.
x_axis_labels = [
    r'$\sigma_{p_{Y_{T=1}}}/\sigma_{p_T}$',
    r'$\sigma_{p_{Y_{T=1}}}/\sigma_{p_{T0}*u(X)}$',
]

simulation_results = run_simulations_varying_parameter(
    default_param_set=default_param_set,
    param_to_vary='beta_XD1',
    param_vals=beta_XD1_settings,
    n_trials_per_setting=N_TRIALS_PER_SETTING,
    x_axis_names=x_axis_labels,
    x_axis_fxns=[sd_p_d_over_sd_p_t, sd_p_d_over_sd_p_t0_times_u],
    x_crossover_points=None,  # [1, 0.2]
    y_axis_names=[CORRELATION_DIFF_LATEX_NAME, 'Pr(D wins)'],
    y_axis_fxns=[difference_in_correlations, D_better_correlated],
    quantities_to_print_but_not_plot_names=[
        'Corr(D_estimate, D_ground_truth)',
        'Corr(T_estimate, T_ground_truth)',
        'p_D_T0_out_of_range_frac',
    ],
    quantities_to_print_but_not_plot_fxns=[
        corr_p_d_estimate_with_p_d_ground_truth,
        corr_p_t_estimate_with_p_t_ground_truth,
        p_D_T0_out_of_range_frac,
    ],
    plot_filename_string='simulation_vary_variance_in_p_d_CONSTANT_UNOBSERVABLES.pdf',
)
    

### 2. vary accuracy of estimating p(D=1|X) by altering sample size (constant unobservables). 

In [None]:
# Systematic simulation: sweep the sample size N with beta_D0=None and a
# fixed intercept_D0, running N_TRIALS_PER_SETTING trials per value.
#
# One issue with this default_param_set is that, because we randomly generate
# 25 coefficients, results can vary a bit from run to run. We set a seed to
# deal with this. An alternative would be to set the coefficients by hand, but
# that feels somewhat hacky. The reason we use 25 coefficients is to make it
# harder to estimate p(D=1|X). The negative leading coefficient (-2) on
# beta_XT means that the mean of p(T) is low.
np.random.seed(43)

# beta_XT's noise is drawn before beta_XD1's so the seeded stream is consumed
# in a fixed order.
beta_XT_noise = 0.5 * np.random.randn(25,)
beta_XD1_noise = 0.1 * np.random.randn(25,)

default_param_set = {
    'beta_XT': [-2] + list(beta_XT_noise),
    'beta_XD1': [1] + list(beta_XD1_noise),
    'beta_D0': None,
    'intercept_D0': -0.2,
    'make_plot': False,
}

simulation_results = run_simulations_varying_parameter(
    default_param_set=default_param_set,
    param_to_vary='N',
    param_vals=[500, 600, 700, 800, 900, 1000, 1500, 2000, 3000, 5000, 10000],
    n_trials_per_setting=N_TRIALS_PER_SETTING,
    # Raw string: '\h' is not a valid Python escape sequence, so a non-raw
    # literal containing '\hat' triggers an invalid-escape warning.
    x_axis_names=[r'$corr(\hat p_{Y_{T=1}}, p_{Y_{T=1}})$'],
    x_axis_fxns=[corr_p_d_estimate_with_p_d_ground_truth],
    x_crossover_points=None,
    y_axis_names=[CORRELATION_DIFF_LATEX_NAME, 'Pr(D wins)'],
    y_axis_fxns=[difference_in_correlations, D_better_correlated],
    quantities_to_print_but_not_plot_names=[
        'sd_p_D/sd_p_T',
        'sd_p_D/sd(p(T=0)*u)',
        'Corr(T_estimate, T_ground_truth)',
        'p_D_T0_out_of_range_frac',
    ],
    quantities_to_print_but_not_plot_fxns=[
        sd_p_d_over_sd_p_t,
        sd_p_d_over_sd_p_t0_times_u,
        corr_p_t_estimate_with_p_t_ground_truth,
        p_D_T0_out_of_range_frac,
    ],
    plot_filename_string='simulation_vary_estimation_quality_of_p_d_CONSTANT_UNOBSERVABLES.pdf',
)