-
Notifications
You must be signed in to change notification settings - Fork 1
/
RunSynthetic.py
213 lines (166 loc) · 8.35 KB
/
RunSynthetic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
from glob import glob
import re
import os
from copy import deepcopy
from tqdm import tqdm
from causallib.datasets import load_acic16
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import matplotlib.pyplot as plt
import utils
def get_expr_id(s, r_pattern):
    """
    Extract the experiment id embedded in a string.

    Args:
        s: the string to scan (in this file: the name/path of an experiment file)
        r_pattern: regex pattern describing the id

    Returns:
        the left-most match of ``r_pattern`` in ``s``, exactly as ``re.findall``
        reports it (i.e. the group content when the pattern has a capture group)

    Raises:
        IndexError: when ``r_pattern`` does not occur in ``s``
    """
    all_matches = re.findall(r_pattern, s)
    # Only the first occurrence is used; indexing an empty list raises IndexError.
    first_match = all_matches[0]
    return first_match
def get_data_files(reg, data_dir=None, group_id='42'):
    """
    Retrieve the data files of the ACIC experiments that belong to one group.

    Every ``var*_*.csv`` file in the data directory is considered. The experiment
    id is extracted from the file's base name with ``reg``; a file is kept when
    the part of that id before the first underscore equals ``group_id``.

    Args:
        reg: regex pattern that extracts the experiment id (e.g. ``'42_3'``)
            from a file name
        data_dir: directory to search; when ``None`` the ``data`` folder under
            the current working directory is used
        group_id: id prefix (text before the first ``'_'``) a file must have
            to be selected

    Returns:
        sorted list with the paths of the selected files (empty when nothing
        matches)
    """
    if data_dir is None:
        data_dir = os.path.join(os.path.abspath(os.getcwd()), "data")
    candidates = glob(os.path.join(data_dir, 'var*_*.csv'))

    selected = []
    for path in candidates:
        # Match against the base name only: a parent directory whose name also
        # fits the pattern would otherwise supply the (wrong) id.
        found_ids = re.findall(reg, os.path.basename(path))
        # The glob may accept names the regex does not (e.g. "variance_x.csv");
        # such files are skipped instead of raising IndexError.
        if found_ids and found_ids[0].split('_')[0] == group_id:
            selected.append(path)
    return sorted(selected)
def run_multiple_experiments(experiment_files, model_experiments, reg, x_acic):
    """
    Run the synthetic-data experiments on every given ACIC file.

    For each file the models are run through ``utils.run_synthetic_experiments``
    and the absolute error of the estimated ATE against the true ATE (computed
    from the file's potential outcomes) is added as the ``ATE_error`` column.
    The concatenated results are also written to ``nested_sig_with_lr.csv``.

    Args:
        experiment_files: paths of the experiment files holding the ACIC data
        model_experiments: the statistical models to be used; deep-copied per file
        reg: the regex pattern that extracts the experiment id from a file name
        x_acic: the covariates of the ACIC data

    Returns:
        Dataframe with results, one row per model, per experiment
    """
    per_experiment_frames = []
    for csv_path in tqdm(experiment_files):
        # Each experiment works on its own copy of the model objects.
        fresh_models = deepcopy(model_experiments)
        experiment_id = get_expr_id(csv_path, reg)

        raw = pd.read_csv(csv_path, index_col=[0])
        raw = raw.reset_index(drop=True)

        # Potential outcomes, relabelled so the columns are 0 and 1.
        outcomes = raw[['y.0', 'y.1']].copy()
        outcomes.columns = [0, 1]
        e_column = raw['e'].copy()

        result = utils.run_synthetic_experiments(
            var=x_acic,
            t=raw['z'],
            e=e_column,
            potential_outcomes=outcomes,
            experiments=fresh_models,
            post_colab_func=utils.sigmoid_calib,
            save=True,
            save_name=f'models_test_new/acic{experiment_id}_new_sig.pkl',
            id_=experiment_id,
            nested_cv=False,
        )

        # True average treatment effect of this experiment.
        true_ate = (outcomes[1] - outcomes[0]).mean()
        deviation = result['ATE'] - true_ate
        # sqrt of the square == absolute deviation from the true ATE.
        result['ATE_error'] = np.sqrt(deviation ** 2)
        per_experiment_frames.append(result)

    combined = pd.concat(per_experiment_frames)
    combined.to_csv('nested_sig_with_lr.csv')
    return combined
def get_res_dict(path="models_test/acic42_2_new_sig.pkl"):
    """
    Load the pickled results of one experiment.

    Args:
        path: location of the pickle file; defaults to the file that used to be
            hard-coded here

    Returns:
        the unpickled object (expected to be the dict of the results)

    Raises:
        OSError: if ``path`` cannot be opened (e.g. the file does not exist)
    """
    import pickle

    # NOTE(review): run_multiple_experiments saves under 'models_test_new/',
    # while the default here reads from 'models_test/' - confirm which
    # directory is intended.
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted run of these experiments.
    with open(path, 'rb') as f:
        res_dict = pickle.load(f)
    return res_dict
def make_calibration_graph_synthetic(res_dict, cur_run_dir):
    """
    Draw the calibration curves of all models on one 2x3 figure and save it.

    Five panels are filled (one per model); the sixth grid cell is removed and
    its area is used for a single legend shared by the whole figure.

    Args:
        res_dict: the results dict, keyed by model (e.g. ``'rf_cv_model'``)
        cur_run_dir: the folder to save the figure in

    Returns:
        None
    """
    curve_order = ("Ground truth", "uncalibrated", "calibrated")
    # (key in res_dict, grid row, grid column, panel title) in row-major order
    panels = (
        ('rf_cv_model', 0, 0, "Random Forest"),
        ('GBT_cv_model', 0, 1, "Gradient Boosting Trees"),
        ('lr_model', 0, 2, "Logistic Regression"),
        ('lr_l1_model', 1, 0, "Lasso Logistic Regression"),
        ('lr_l2_model', 1, 1, "Ridge Logistic Regression"),
    )

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    for model_key, row, col, title in panels:
        utils.plot_calibration_curve(res_dict, model_key, ax1=axes[row][col], model_name=title,
                                     key_order=curve_order)

    fig.supxlabel('Predicted probability', fontweight="bold", fontsize=30)
    fig.supylabel('Actual probability', fontweight="bold", fontsize=30, x=0.01)
    fig.suptitle("Calibration curves of statistical estimators (ACIC)", fontweight="bold", fontsize=25)

    # One figure-level legend, built from the entries of a single panel.
    handles, labels = axes[0][1].get_legend_handles_labels()
    fig.legend(handles, labels, loc=(0.73, 0.2), prop={'weight': 'bold', 'size': 18})

    # Drop the per-panel legends, then the unused sixth cell.
    for _, row, col, _ in panels:
        axes[row][col].get_legend().remove()
    axes[1][2].remove()

    plt.tight_layout()
    utils.save_figure_in_format(figure=fig, save_dir=cur_run_dir, filename='calibration_acic')
def make_graphs_for_synthetic(df, force_names, cur_run_dir):
    """
    Create and save the three comparison plots for the synthetic data.

    Rows whose ``scale`` equals ``'identity_model'`` are excluded from every
    plot. Each plot is drawn with ``utils.plot_comp_plot`` and the current
    figure is then saved as a JPEG (400 dpi) in ``cur_run_dir``.

    Args:
        df: dataframe of models per experiment
        force_names: names of the models to be shown
        cur_run_dir: the dir to save graphs

    Returns:
        None
    """
    without_identity = "scale != 'identity_model'"
    shared_kwargs = {'plot_legend': True, 'force_names': force_names, 'color_edges': True}

    # (output file name, plot-specific keyword arguments)
    # NOTE(review): the third name starts with 'acis', unlike the other two
    # ('acic') - probably a typo, kept as-is so existing outputs keep their name.
    plots = (
        ('acic_models_calibration_ate.jpg',
         {'x_label': r"Calibration error"}),
        ('acic_models_calibration_balancing.jpg',
         {'y_metric': 'Balancing', 'y_label': 'Balancing', 'x_label': r"Calibration error"}),
        ('acis_models_balancing_ate.jpg',
         {'metric': 'Balancing', 'x_label': 'Balancing error'}),
    )

    for file_name, plot_kwargs in plots:
        utils.plot_comp_plot(df.query(without_identity), **plot_kwargs, **shared_kwargs)
        plt.tight_layout()
        plt.savefig(os.path.join(cur_run_dir, file_name), dpi=400)
if __name__ == '__main__':
    # Experiment ids have the form "<group>_<index>" and directly follow "var"
    # in the data file names.
    regex_term = r'(?<=var)\d{1,2}_\d{1,2}'
    rel_target_list = get_data_files(reg=regex_term)
    # Covariates of the ACIC data, shared by all experiments.
    X = load_acic16(instance=1, raw=False)['X']

    # Hyper-parameter grids for the tree-based models.
    rf_tuned_parameters = [{'max_depth': [5, 10, 20, 30],
                            'n_estimators': [50, 100, 200, 400, 1000]}]
    gb_tuned_parameters = [{'max_depth': [1, 2, 3, 6],
                            'learning_rate': [0.01, 0.05, 0.1],
                            'n_estimators': [30, 50, 100, 250, 500, 1000]}]
    scores = 'neg_brier_score'
    cv_inner = KFold(n_splits=10, shuffle=True, random_state=42)

    # NOTE(review): penalty='none' is deprecated since scikit-learn 1.2 and
    # removed in 1.4 (replacement: penalty=None) - left unchanged because the
    # targeted scikit-learn version is not visible here.
    # max_iter is given as an int: scikit-learn documents it as an int and
    # releases with parameter validation reject the float 1e4.
    model_experiments = {
        'lr': LogisticRegression(random_state=42, n_jobs=-1, penalty='none'),
        'lr_l1': LogisticRegressionCV(random_state=42, n_jobs=-1, cv=10, solver='saga', penalty='l1',
                                      max_iter=10000),
        'lr_l2': LogisticRegressionCV(random_state=42, n_jobs=-1, cv=10, solver='saga', penalty='l2',
                                      max_iter=10000),
        'GBT_cv': GridSearchCV(GradientBoostingClassifier(random_state=42), gb_tuned_parameters, scoring=scores,
                               n_jobs=-1, cv=cv_inner),
        'rf_cv': GridSearchCV(RandomForestClassifier(random_state=42, oob_score=True), rf_tuned_parameters,
                              scoring=scores, n_jobs=-1, cv=cv_inner),
        'identity': None
    }

    df_42_nested_sig_new = run_multiple_experiments(rel_target_list, model_experiments, reg=regex_term, x_acic=X)

    # Slopes between the metrics, with the identity model rows excluded.
    print(utils.get_slopes(df_42_nested_sig_new.query("scale != 'identity_model'")))
    print(utils.get_slopes(df_42_nested_sig_new.query("scale != 'identity_model'"),
                           x_metric='mean', y_metric='Balancing'))
    print(utils.get_slopes(df_42_nested_sig_new.query("scale != 'identity_model'"), x_metric='Balancing'))

    # Figures are written into a dedicated run directory.
    run_dir = utils.make_run_dir("sig_nested_42_lr")
    res_dict = get_res_dict()
    make_calibration_graph_synthetic(res_dict, cur_run_dir=run_dir)
    model_names = ['Logistic Regression', 'Lasso Logistic Regression', 'Ridge Logistic Regression',
                   "Gradient Boosting Trees", "Random Forest"]
    make_graphs_for_synthetic(df=df_42_nested_sig_new, force_names=model_names, cur_run_dir=run_dir)