-
Notifications
You must be signed in to change notification settings - Fork 27
/
ensemble_boot.py
291 lines (232 loc) · 8.9 KB
/
ensemble_boot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""Provides analysis element for ensemble bootstrapping analysis.
"""
import numpy as np
import pandas as pd
from easyvvuq import OutputType
from .base import BaseAnalysisElement
__copyright__ = """
Copyright 2018 Robin A. Richardson, David W. Wright
This file is part of EasyVVUQ
EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.
You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__license__ = "LGPL"
def confidence_interval(dist, value, alpha, pivotal=False):
"""
Get the bootstrap confidence interval for a given distribution.
Parameters
----------
dist:
Array containing distribution of bootstrap results.
value:
Value of statistic for which we are calculating error bars.
alpha:
The alpha value for the confidence intervals.
pivotal:
Use the pivotal method? Default to percentile method.
Returns
-------
float:
Value of the bootstrap statistic
float:
Highest value of the confidence interval
float:
Lowest value of the confidence interval
"""
if len(dist) < 1:
raise ValueError("Dist array should be non-empty")
if pivotal:
low = 2 * value - np.percentile(dist, 100 * (1 - alpha / 2.), axis=0)
stat = value
high = 2 * value - np.percentile(dist, 100 * (alpha / 2.), axis=0)
else:
low = np.percentile(dist, 100 * (alpha / 2.), axis=0)
stat = np.percentile(dist, 50)
high = np.percentile(dist, 100 * (1 - alpha / 2.), axis=0)
# if low > high:
# (low, high) = (high, low)
return stat, low, high
def bootstrap(data, stat_func, alpha=0.05,
sample_size=None, n_samples=1000,
pivotal=False):
"""
Parameters
----------
data : :obj:`pandas.DataFrame`
Input data to be analysed.
stat_func : function
Statistical function to be applied to data for bootstrapping.
alpha : float
Produce estimate of 100.0*(1-`alpha`) confidence interval.
sample_size : int
Size of the sample to be drawn from the input data.
n_samples : int
Number of times samples are to be drawn from the input data.
pivotal : bool
Use the pivotal method? Default to percentile method.
Returns
-------
float:
Value of the bootstrap statistic
float:
Highest value of the confidence interval
float:
Lowest value of the confidence interval
"""
if data.empty:
raise RuntimeError("DataFrame passed to bootstrap has to be non-empty")
stat = data.apply(stat_func)
if sample_size is None:
sample_size = len(data)
dist = []
for l in range(n_samples):
sample = data.sample(sample_size, replace=True)
dist.append(stat_func(sample))
return confidence_interval(dist, stat, alpha, pivotal=pivotal)
def ensemble_bootstrap(data, groupby=[], qoi_cols=[],
stat_func=np.mean, alpha=0.05,
sample_size=None, n_samples=1000,
pivotal=False, stat_name='boot'):
"""
Perform bootstrapping analysis on input data.
Parameters
----------
data : :obj:`pandas.DataFrame`
Date to be analysed.
groupby : list or None
Columns to use to group the data in `analyse` method before
calculating stats.
qoi_cols : list or None
Columns of quantities of interest (for which stats will be
calculated).
stat_func : function
Statistical function to be applied to data for bootstrapping.
alpha : float, default=0.05
Produce estimate of 100.0*(1-`alpha`) confidence interval.
sample_size : int
Size of the sample to be drawn from the input data.
n_samples : int, default=1000
Number of times samples are to be drawn from the input data.
pivotal : bool, default=False
Use the pivotal method? Default to percentile method.
stat_name : str, default='boot'
Name to use to describe columns containing output statistic (for example
'mean').
Returns
-------
:obj:`pandas.DataFrame`
Description of input data using bootstrap statistic and high/low
confidence intervals.
"""
agg_funcs = {}
if not qoi_cols:
qoi_cols = [
x for x in data.columns if x not in groupby + ['run_id', 'status']]
for col in qoi_cols:
if col not in data:
raise RuntimeError(f"No such attribute: {col}\nAttributes found in data: {data}")
agg_funcs[col] = lambda x: bootstrap(
x,
stat_func=stat_func,
alpha=alpha,
sample_size=sample_size,
n_samples=n_samples,
pivotal=pivotal)
if not groupby:
grouped_data = data.groupby(lambda x: True, sort=False)
else:
grouped_data = data.groupby(groupby, sort=False)
# Apply bootstrapping to all value columns selected
# Note results come a tuple per cell
results = grouped_data.agg(agg_funcs)
outputs = [stat_name, 'low', 'high']
# Split out tuples in each cell and provide sensible naming
results = pd.concat({col: results[col].apply(
lambda cell: pd.Series(cell, index=outputs)
)
for col in qoi_cols}, axis=1)
return results
class EnsembleBoot(BaseAnalysisElement):
def __init__(self, groupby=[], qoi_cols=[],
stat_func=np.mean, alpha=0.05,
sample_size=None, n_boot_samples=1000,
pivotal=False, stat_name='boot'):
"""
Element to perform bootstrapping on collated simulation output.
Parameters
----------
groupby : list or None
Columns to use to group the data in `analyse` method before
calculating stats.
qoi_cols : list or None
Columns of quantities of interest (for which stats will be
calculated).
stat_func : function
Statistical function to be applied to data for bootstrapping.
alpha : float, default=0.05
Produce estimate of 100.0*(1-`alpha`) confidence interval.
sample_size : int
Size of the sample to be drawn from the input data.
n_boot_samples : int, default=1000
Number of times samples are to be drawn from the input data.
pivotal : bool, default=False
Use the pivotal method? Default to percentile method.
stat_name : str, default='boot'
Name to use to describe columns containing output statistic (for example
'mean').
"""
self.groupby = groupby
self.qoi_cols = qoi_cols
self.stat_func = stat_func
self.alpha = alpha
self.sample_size = sample_size
self.n_boot_samples = n_boot_samples
self.pivotal = pivotal
self.stat_name = stat_name
self.output_type = OutputType.SUMMARY
if self.stat_func is None:
raise ValueError('stat_func cannot be None.')
def element_name(self):
"""Name for this element for logging purposes"""
return "ensemble_boot"
def element_version(self):
"""Version of this element for logging purposes"""
return "0.1"
def analyse(self, data_frame=None):
"""Perform bootstrapping analysis on the input `data_frame`.
The data_frame is grouped according to `self.groupby` if specified and
analysis is performed on the columns selected in `self.qoi_cols` if set.
Parameters
----------
data_frame : :obj:`pandas.DataFrame`
Summary data produced through collation of simulation output.
Returns
-------
:obj:`pandas.DataFrame`
Basic statistic for selected columns and groupings of data.
"""
if data_frame is None:
raise RuntimeError(
"This VVUQ element needs a data frame to analyse")
elif data_frame.empty:
raise RuntimeError(
"No data in data frame passed to analyse element")
results = ensemble_bootstrap(
data_frame,
groupby=self.groupby,
qoi_cols=self.qoi_cols,
stat_func=self.stat_func,
alpha=self.alpha,
sample_size=self.sample_size,
n_samples=self.n_boot_samples,
pivotal=self.pivotal,
stat_name=self.stat_name)
return results