In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the local package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Combining Results in ORD

This notebook provides a proof of concept example for combining catastrophe
loss model results in the Open Results Data (ORD) format. We follow the
methodology outlined in *Combining_results_in_ORD_v1.1.pdf*.

This notebook is split into the workflow sequence as follows:

1. Load and Group
2. Period Sampling
3. Loss Sampling
4. Output Generation

In [2]:
# imports
from datetime import datetime
from pathlib import Path
import json
from dataclasses import asdict
import pandas as pd

In [3]:
# Expose the parent of the working directory on the import path so that
# packages located there resolve (presumably where `ord_combining` lives).
import os
import sys

module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

The input files are multiple runs of PiWind.

In [4]:
# Root folder of the PiWind ORD runs: a sibling of the working directory.
parent_path = Path().absolute().parent / 'piwind-ord'
# Alternative when the data sits inside the working directory:
# parent_path = Path().absolute() / 'piwind-ord'

# One ORD output directory per analysis to be combined.
run_subdirs = (
    "split/1/runs/losses-20251201164501/output/",
    "split/2/runs/losses-20251201164618/output/",
)
ord_output_dirs = [parent_path / subdir for subdir in run_subdirs]

In [5]:
# Timestamped output folder (ddmmyyHHMMSS suffix), created relative to the
# working directory; every artefact of this notebook is written into it.
run_stamp = datetime.now().strftime("%d%m%y%H%M%S")
output_dir = Path("./combined_ord-" + run_stamp)
output_dir.mkdir(exist_ok=True)
print(f'Output Path: {output_dir}')

Output Path: combined_ord-091225153902


## 1. Load and Group
### Creating Analysis and OutputSet
In this section we create the objects required prior to grouping, namely:
- Analysis table which contains the metadata from the analyses
- OutputSet table which contains references to the ORD results.


The `analysis_settings.json` files for each ORD analysis are parsed to read the Analysis and OutputSet tables.

In [6]:
from ord_combining.outputset import load_analysis_and_outputsets
from ord_combining.common import dataclass_list_to_dataframe

# Parse the analysis settings of every ORD output directory.
analysis_records, outputsets = load_analysis_and_outputsets(ord_output_dirs)

# The remainder of the notebook works with `analysis` as a dict keyed by
# analysis id and with the output sets as a DataFrame.
analysis = {record.id: record for record in analysis_records}
outputsets_df = dataclass_list_to_dataframe(outputsets)

# Use the DataFrame index as the output set id.
outputsets_df['id'] = outputsets_df.index

In [7]:
# Display the OutputSet table (one row per output set of each analysis).
outputsets_df

Unnamed: 0,id,perspective_code,analysis_id,exposure_summary_level_fields,exposure_summary_level_id
0,0,gul,1,[],1
1,1,gul,1,[LocNumber],2
2,2,gul,2,[],1
3,3,gul,2,[LocNumber],2


In [8]:
# List the OutputSet column names.
outputsets_df.columns

Index(['id', 'perspective_code', 'analysis_id',
       'exposure_summary_level_fields', 'exposure_summary_level_id'],
      dtype='object')

### Creating GroupEventSet
The GroupEventSet is used to define common events, giving a consistent
list of unique events that can be used to create GroupPeriods.

The config option `group_event_set_fields` specifies which fields are used to identify a unique event.

The EventOccurrenceSet table contains the meta information for each event set based on the `group_event_set_fields`.

In [9]:
from ord_combining.groupeventset import generate_group_set, generate_group_event_set
# config: fields whose combination identifies a unique event set
group_event_set_fields = ['event_set_id', 'event_occurrence_id', 'model_supplier_id']

# Assign every output set to a group set.
group_set, group_output_set = generate_group_set(outputsets_df)
# Build the event occurrence set table and its link to each analysis.
event_occurrence_set_df, event_occurrence_set_analysis = generate_group_event_set(analysis, group_event_set_fields)

Full event occurrence set:
   event_set_id event_occurrence_id model_supplier_id  analysis_id
0            p                  lt          OasisLMF            1
1            p                  lt          OasisLMF            2
Event occurrence set:
    event_occurrence_set_id event_set_id event_occurrence_id model_supplier_id
0                        1            p                  lt          OasisLMF


In [10]:
# Display the mapping (appears to be output_set_id -> group_set_id).
group_output_set

{0: 0, 1: 1, 2: 0, 3: 1}

In [11]:
# Display the group set table, indexed by group_set_id.
group_set

Unnamed: 0_level_0,group_id,perspective_code,exposure_summary_level_fields_string
group_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,gul,
1,1,gul,LocNumber


In [12]:
# Display the unique event occurrence sets.
event_occurrence_set_df

Unnamed: 0,event_occurrence_set_id,event_set_id,event_occurrence_id,model_supplier_id
0,1,p,lt,OasisLMF


In [13]:
# Display which event occurrence set each analysis belongs to.
event_occurrence_set_analysis

Unnamed: 0,analysis_id,event_occurrence_set_id
0,1,1
1,2,1


Once the groups have been assigned, the SummaryId is aligned within each group_set.
To do so, we find each unique combination of summary level fields in each group set and aggregate the TIV by summing.
We then produce an `outputset_summary_id_map`: a dict keyed by `output_set_id`
whose values are dicts mapping the `summary_id` of the ORD files to the group
`SummaryId`.
Note that a mapping is only added where summary_id != SummaryId.

To demonstrate this, LocNumber was swapped for summary_id 1 and 2 in SummaryLevel 2 of /home/vinulw/code/ODS_Tools/ord_combining/losses-20251021131718.

In [14]:
from ord_combining.summaryinfo import load_summary_info, assign_summary_ids, generate_summary_id_map
# Load the summary info of every output set.
os_summary_info = load_summary_info(analysis, outputsets_df)
# Align the SummaryIds within each group set.
group_set_summary_info = assign_summary_ids(group_output_set, os_summary_info)

In [15]:

# Map each output set's original summary_id to the aligned group SummaryId.
outputset_summary_id_map = generate_summary_id_map(os_summary_info, group_set_summary_info, group_output_set)

# Display the map.
outputset_summary_id_map

{1: {2: 3, 3: 6, 4: 9, 5: 10}, 3: {1: 2, 2: 4, 3: 5, 4: 7, 5: 8}}

In [16]:
# Persist the grouping artefacts into the output directory.
with open(output_dir / 'analysis.json', 'w') as analysis_file:
    # Analysis entries are dataclasses; convert them to plain dicts for JSON.
    analysis_payload = {analysis_id: asdict(record) for analysis_id, record in analysis.items()}
    json.dump(analysis_payload, analysis_file, indent=4)

with open(output_dir / 'group_output_set.json', 'w') as group_output_set_file:
    json.dump(group_output_set, group_output_set_file, indent=4)

# The group set table keeps its index (group_set_id) in the CSV.
group_set.to_csv(output_dir / 'group_set.csv')

# The remaining tables are written without their index.
for table, table_fname in (
    (event_occurrence_set_analysis, 'group_event_set_analysis.csv'),
    (event_occurrence_set_df, 'event_occurrence_set.csv'),
    (outputsets_df, 'output_set.csv'),
):
    table.to_csv(output_dir / table_fname, index=False)

# Serialise summary-info: one CSV per group set, named
# "<perspective_code>_GS<group_set_id>_summary-info.csv".
for gs, g_summary_info_df in group_set_summary_info.items():
    gs_info = group_set.loc[gs]
    # Outer double quotes: reusing the same quote character inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    summary_info_fname = f"{gs_info['perspective_code']}_GS{gs}_summary-info.csv"
    g_summary_info_df.to_csv(output_dir / summary_info_fname, index=False)

## 2. Period Sampling
Now that each analysis has been grouped, we need to generate the GroupPeriods
to which the events are assigned for the combined output.

We extract the Periods for a given GroupEventSet that have a loss-causing event
and the total number of Periods. These Periods are then assigned to the
GroupPeriods randomly, and if the total number of GroupPeriods is larger than
the total number of Periods then the GroupEventSet periods are cycled.

The period information can be extracted from the header info of the `occurrence.bin` file.

In [17]:
from ord_combining.groupperiod import generate_group_periods

# Number of GroupPeriods in the combined output.
total_group_periods = 10000  # config: set by user

In [18]:
# generate_group_periods is called with a `group_event_set_id` column, so
# rename the event occurrence set id accordingly.
group_event_set_analysis = event_occurrence_set_analysis.rename(columns={'event_occurrence_set_id': 'group_event_set_id'})

# NOTE(review): the period assignment is described as random and no seed is
# set in this notebook - confirm how results are made reproducible.
group_period = generate_group_periods(group_event_set_analysis, analysis, total_group_periods)

print('No. of group periods: ', len(group_period))

# Preview as the last expression of the cell so that it is actually displayed
# (a bare expression in the middle of a cell is silently discarded).
group_period.head()

No. of group periods:  7620


  return d[key]


In [19]:
# Save the group period table to group_period.csv in the output directory.
group_period.to_csv(output_dir / 'group_period.csv', index=False)

## 3. Loss Sampling
The final step involves sampling losses for each event in the GroupPeriod.
There are two types of loss sampling:
- Mean only (only for MELT files)
- Full uncertainty sampling

The additional config options are demonstrated below. An example of a full config is:

```python
loss_sampling_config = {
    "group_mean": False, # mean only
    "group_mean_type": 1,  # SampleType filter
    "group_secondary_uncertainty": False,
    "group_parametric_distribution": 'gamma',  # either gamma or beta
    "group_format_priority": ["m", "q", "s"]
}
```

So far only `q` and `s` loss sampling are implemented. We output both mean only and full secondary uncertainty sampling below.

In [20]:
# config: format priority handed to the full uncertainty loss sampling below
# (only 's' is selected for this example).
group_format_priority = ['s']

The first stage in loss sampling is generating the GroupPeriodQuantile table.

In [21]:
from ord_combining.losssampling import construct_gpqt

# Build the GroupPeriodQuantile table (GPQT) used as input to loss sampling.
gpqt = construct_gpqt(group_period, group_event_set_analysis, outputsets_df, analysis)

Currently processing group_event_set_id: 1,  outputset: 0
Currently processing group_event_set_id: 1,  outputset: 1
Currently processing group_event_set_id: 1,  outputset: 2
Currently processing group_event_set_id: 1,  outputset: 3


In [22]:
# Save the GroupPeriodQuantile table to gpqt.csv in the output directory.
gpqt.to_csv(output_dir / "gpqt.csv", index=False)

Finally the loss sampling can be done to produce the group period loss table (GPLT).

In [23]:
from ord_combining.losssampling import do_loss_sampling_full_uncertainty, do_loss_sampling_mean_only

In [24]:
# Full (secondary) uncertainty sampling -> group period loss table.
full_sampling_options = dict(
    priority=group_format_priority,
    outputset_summary_id_map=outputset_summary_id_map,
    output_dir=output_dir,
)
gplt_full = do_loss_sampling_full_uncertainty(
    gpqt, outputsets_df, group_output_set, analysis, **full_sampling_options
)

# Preview the sampled losses.
gplt_full.head()

Running output_set_id: 0 - 1/4
Could not perform loss sampling for 10700 events.
Saved missing gpqt files to: combined_ord-091225153902/missing_gpqt_0.csv
Running output_set_id: 1 - 2/4
Could not perform loss sampling for 10700 events.
Saved missing gpqt files to: combined_ord-091225153902/missing_gpqt_1.csv
Running output_set_id: 2 - 3/4
Could not perform loss sampling for 10700 events.
Saved missing gpqt files to: combined_ord-091225153902/missing_gpqt_2.csv
Running output_set_id: 3 - 4/4
Could not perform loss sampling for 10700 events.
Saved missing gpqt files to: combined_ord-091225153902/missing_gpqt_3.csv


Unnamed: 0,group_set_id,output_set_id,SummaryId,GroupPeriod,Period,group_event_set_id,EventId,Loss,LossType
0,0,0,1,2,756,1,1094,116044.532845,2
1,0,0,1,2,756,1,1095,134083.190758,2
2,0,0,1,10,642,1,919,11385.682784,2
3,0,0,1,12,319,1,447,95285.146687,2
4,0,0,1,15,456,1,655,29433.598923,2


In [25]:
# Mean-only sampling -> group period loss table.
gplt_mean = do_loss_sampling_mean_only(
    gpqt,
    outputsets_df,
    group_output_set,
    analysis,
    outputset_summary_id_map=outputset_summary_id_map,
)

# Preview the mean losses.
gplt_mean.head()

Output set 1 has 10700 missing SummaryIds.
Output set 3 has 10700 missing SummaryIds.


Unnamed: 0,group_set_id,output_set_id,SummaryId,GroupPeriod,Period,group_event_set_id,EventId,Loss,LossType
0,0,0,,1,677,1,975,,
1,0,0,1.0,2,756,1,1094,198404.0,1.0
2,0,0,1.0,2,756,1,1094,167263.875,3.0
3,0,0,1.0,2,756,1,1095,99202.0,1.0
4,0,0,1.0,2,756,1,1095,62802.050781,3.0



## 4. Output Generation
The output options are:
- Group Period Loss Table (GPLT)
  - full (all group_set_id) <-- current implementation
  - file based (each group_set_id in new file) <-- probably better
- Group Average Loss Table (GALT)
- Group Exceedance Probability Table (GEPT)

### GPLT output

In [26]:
# Write both GPLT variants in full, sorted by these columns.
sort_cols = ['group_set_id', 'output_set_id', 'SummaryId', 'GroupPeriod']
for gplt_df, gplt_fname in ((gplt_full, "gplt_full.csv"), (gplt_mean, "gplt_mean.csv")):
    gplt_sorted = gplt_df.sort_values(by=sort_cols)
    gplt_sorted.to_csv(output_dir / gplt_fname, index=False)

In [27]:
from ord_combining.grouped_output import generate_al, generate_ep

def save_output(full_df, output_dir, output_name, factor_col='group_set_id', float_format='%.6f'):
    """Write one CSV per distinct value of ``factor_col``.

    Args:
        full_df: DataFrame holding the rows of every group.
        output_dir: ``pathlib.Path`` of the directory to write into.
        output_name: File name suffix; each file is named
            ``{value}_{output_name}``.
        factor_col: Column whose distinct values split ``full_df`` into files.
        float_format: Format string forwarded to ``DataFrame.to_csv``.
    """
    for i in full_df[factor_col].unique():
        save_path = output_dir / f'{i}_{output_name}'
        # Select with a boolean mask rather than DataFrame.query: building a
        # query string from raw values fails for string group values and for
        # column names that are not valid identifiers.
        group_rows = full_df[full_df[factor_col] == i]
        group_rows.to_csv(save_path, index=False, float_format=float_format)
        print('Saved: ', save_path)

### GALT Output


In [28]:
# Column dtypes enforced on the average loss tables before writing.
dtypes_aal = dict(
    group_set_id='int',
    SummaryId='int',
    LossType='int',
    Mean='float',
    Std='float',
)

# Average loss tables for the full uncertainty and the mean-only samples.
aal_full = generate_al(gplt_full, total_group_periods).astype(dtypes_aal)
aal_mean = generate_al(gplt_mean, total_group_periods).astype(dtypes_aal)

# One file per group set for each variant.
for aal_df, aal_fname in ((aal_full, 'aal_full.csv'), (aal_mean, 'aal_mean.csv')):
    save_output(aal_df, output_dir, aal_fname)

Saved:  combined_ord-091225153902/0_aal_full.csv
Saved:  combined_ord-091225153902/1_aal_full.csv
Saved:  combined_ord-091225153902/0_aal_mean.csv
Saved:  combined_ord-091225153902/1_aal_mean.csv


### GEPT Output

In [29]:
# Column dtypes enforced on the exceedance probability tables before writing.
dtypes_ep = dict(
    group_set_id='int',
    SummaryId='int',
    EPCalc='int',
    EPType='int',
    RP='float',
    Loss='float',
)

# Exceedance probability tables (OEP and AEP both enabled) for each variant.
ep_full_df = generate_ep(gplt_full, total_group_periods, oep=True, aep=True).astype(dtypes_ep)
ep_mean_df = generate_ep(gplt_mean, total_group_periods, oep=True, aep=True).astype(dtypes_ep)

# One file per group set for each variant.
for ep_df, ep_fname in ((ep_full_df, 'ep_full.csv'), (ep_mean_df, 'ep_mean.csv')):
    save_output(ep_df, output_dir, ep_fname)

Saved:  combined_ord-091225153902/0_ep_full.csv
Saved:  combined_ord-091225153902/1_ep_full.csv
Saved:  combined_ord-091225153902/0_ep_mean.csv
Saved:  combined_ord-091225153902/1_ep_mean.csv
