In [None]:
import json 
from gufe.tokenization import JSON_HANDLER
import os 
import shutil
from pathlib import Path

For this dataset, we know that 3 replicates were run in serial for each leg. We want to restructure the data so that it is equivalent to the output we would get if we re-ran this dataset with the replicates of each leg run in parallel, giving the following directory structure:

```
results/
  transformations_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashA]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
  transformations_1/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashB]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
  transformations_2/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashC]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
```

In [None]:
def load_json(fpath):
    """Read the JSON file at ``fpath`` and return the decoded object.

    Decoding is delegated to ``JSON_HANDLER.decoder`` (from
    ``gufe.tokenization``), passed to ``json.load`` as ``cls``.

    Parameters
    ----------
    fpath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    The object produced by ``json.load`` with the custom decoder.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(fpath, 'r') as f:
        return json.load(f, cls=JSON_HANDLER.decoder)

def dump_json(data, fpath):
    """Serialise ``data`` as JSON into the file at ``fpath``.

    The file is opened in write mode, so existing content is replaced.
    Encoding is delegated to ``JSON_HANDLER.encoder``, passed to
    ``json.dump`` as ``cls``.

    Parameters
    ----------
    data
        Object to serialise.
    fpath : str or path-like
        Destination file.
    """
    with open(fpath, mode="w") as out_file:
        json.dump(data, out_file, cls=JSON_HANDLER.encoder)

In [None]:
# Input: directory that holds one `<leg>.json` result file plus one `<leg>/`
# working directory per leg (this is what the cells below read from).
# NOTE(review): both paths are absolute and machine-specific; anyone else
# running this notebook has to edit them first.
orig_dir = Path("/Users/atravitz/software/openfe/openfecli/tests/data/results/")
# Output: root of the re-organised copy, one sub-directory per replicate.
new_dir = Path("/Users/atravitz/work/sandbox/gather_test_data/results_parallel")

In [None]:
# ! ls $orig_dir

In [None]:
# Load a single leg's result file so its structure can be explored in the
# following cells.
tmp_data = load_json(orig_dir/"easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json")

In [None]:
# Top-level keys of the loaded result file.
tmp_data.keys()

In [None]:
# Keys of the 'data' mapping nested under 'protocol_result'.
tmp_data['protocol_result']['data'].keys()

In [None]:
# Each leg has a working directory inside orig_dir (next to its `<leg>.json`
# result file), so the leg names are the names of the sub-directories.
# Only directories are kept: filtering on "not *.json" alone would also pick
# up stray files (e.g. `.DS_Store`), which have no matching `<name>.json` and
# would break the per-leg loading later on. Sorting makes the order
# reproducible, since directory listing order is not guaranteed.
leg_names = sorted(entry.name for entry in orig_dir.iterdir() if entry.is_dir())
leg_names

In [None]:
# Remove any previous output so the write step below starts from an empty
# tree. `ignore_errors=True` keeps this silent when new_dir does not exist
# yet, like `rm -rf`, without depending on a shell or on the path being
# free of characters that need quoting.
shutil.rmtree(new_dir, ignore_errors=True)

In [None]:
# For every leg: split the combined result file into one result file per
# replicate (replicates are identified by their 'source_key'), then write each
# replicate's JSON and working directory under new_dir/replicate_<n>/.
for leg in leg_names:
    json_data = load_json(orig_dir/f"{leg}.json")
    protocol_data = json_data['protocol_result']['data']

    srckey_to_protocol = {}
    srckey_to_unit_results = {}
    srckey_to_estimate = {}

    ## collect results on a per-replicate basis
    for k, rep_entries in protocol_data.items():
        print(k)
        # the first entry under each data key supplies the replicate's source key
        rep_source_key = rep_entries[0]['source_key']

        # keep only the data for this replicate; the shallow copy leaves the
        # original 'protocol_result' dict (and its full 'data') untouched
        rep_result = json_data['protocol_result'].copy()
        rep_result['data'] = {k: rep_entries}
        srckey_to_protocol[rep_source_key] = rep_result

        # pull just the estimate value so we can put it at the top of the output
        srckey_to_estimate[rep_source_key] = rep_entries[0]['outputs']['unit_estimate']

    for k, unit_result in json_data['unit_results'].items():
        # group by source key; merging (rather than assigning) keeps every
        # unit result when several of them share the same source key
        srckey_to_unit_results.setdefault(unit_result['source_key'], {})[k] = unit_result

    # every replicate must be present in both sections of the file
    assert srckey_to_protocol.keys() == srckey_to_unit_results.keys(), (
        f"{leg}: source keys only present in one of protocol_result/unit_results: "
        f"{srckey_to_protocol.keys() ^ srckey_to_unit_results.keys()}"
    )

    ## write to the new directory
    # sorting the source keys makes the replicate numbering deterministic
    for n, sk in enumerate(sorted(srckey_to_protocol)):
        rep_dir = new_dir/f"replicate_{n}"
        # raises FileExistsError if the output tree is left over from an
        # earlier run, so stale and fresh data are never mixed
        os.makedirs(rep_dir/leg)

        # build up the data for this replicate
        replicate_data = {'estimate': srckey_to_estimate[sk],
                          # NOTE(review): hard-coded; presumably because a single
                          # replicate has no spread across repeats - confirm
                          'uncertainty': 0.0,
                          'protocol_result': srckey_to_protocol[sk],
                          'unit_results': srckey_to_unit_results[sk]}

        # write!
        dump_json(replicate_data, rep_dir/f"{leg}.json")
        working_dir_name = f"shared_{sk}_attempt_0"
        ## TODO: make this work for arbitrary number of attempts
        # os.symlink(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)
        shutil.copytree(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)


In [None]:
# Rebinds tmp_data to a result file from a different location (not under
# orig_dir) for comparison with the files handled above.
# NOTE(review): absolute, machine-specific path.
tmp_data = load_json("/Users/atravitz/work/sandbox/gather_test_data/complex_11_13_all_reps.json/results_transformations_0/complex_11_13.json")

In [None]:
# json_data is not loaded in this cell: it is whatever the last iteration of
# the `for leg in leg_names` loop left behind, so this cell only works after
# that loop has run.
json_data.keys()


In [None]:
# Fields of one unit result.
# NOTE(review): the key is hard-coded, so this raises KeyError unless
# json_data currently holds the file that contains this unit result.
json_data['unit_results']['ProtocolUnitResult-1a84266f25684314abb1c5ab94bd2932'].keys()

In [None]:
# Fields of the 'unit_estimate' output of the first entry stored under one
# (hard-coded) data key of tmp_data.
tmp_data['protocol_result']['data']['153739774597735519755877734364465126162'][0]['outputs']['unit_estimate'].keys()

In [None]:
# 'unit_estimate' output of the first entry stored under one (hard-coded)
# data key of json_data, i.e. of the last leg processed by the loop.
json_data['protocol_result']['data']['206613422407723609770924271218313554331'][0]['outputs']['unit_estimate']

In [None]:
# Data keys available in json_data's 'protocol_result'.
json_data['protocol_result']['data'].keys()

In [None]:
# Loads the complex_11_13.json result file under results_transformations_0
# into single_rep.
# NOTE(review): absolute, machine-specific path.
single_rep = load_json("/Users/atravitz/work/sandbox/gather_test_data/complex_11_13_all_reps.json/results_transformations_0/complex_11_13.json")

In [None]:
# 'unit_estimate' output of the first entry under one (hard-coded) data key
# of single_rep.
single_rep['protocol_result']['data']['153739774597735519755877734364465126162'][0]['outputs']['unit_estimate']

In [None]:
# Fields of the first entry stored under one (hard-coded) data key of
# json_data.
json_data['protocol_result']['data']['206613422407723609770924271218313554331'][0].keys()

In [None]:
# Data keys available in json_data's 'protocol_result' (same expression as an
# earlier cell).
json_data['protocol_result']['data'].keys()