In [2]:
import json 
from gufe.tokenization import JSON_HANDLER
import os 
import shutil
from pathlib import Path

for this dataset, we know we have 3 replicates run in serial for each leg. We want to manipulate the data so that it is equivalent to the output if we re-ran this dataset with each leg run in parallel, with the following directory structure:

```
results/
  transformations_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashA]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
  transformations_1/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashB]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
  transformations_2/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex/
          shared_[hashC]_attempt_0/
      rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json
```

In [3]:
def load_json(fpath):
    return json.load(open(fpath, 'r'), cls=JSON_HANDLER.decoder)

def dump_json(data, fpath):
    with open(fpath, "w") as f:
        json.dump(data, f, cls=JSON_HANDLER.encoder)

In [4]:
orig_dir = Path("/Users/atravitz/software/openfe/openfecli/tests/data/results/")
new_dir = Path("/Users/atravitz/work/sandbox/gather_test_data/results_parallel")

In [5]:
# ! ls $orig_dir

In [6]:
tmp_data = load_json(orig_dir/"easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json")

In [7]:
tmp_data.keys()

dict_keys(['estimate', 'uncertainty', 'protocol_result', 'unit_results'])

In [8]:
tmp_data['protocol_result']['data'].keys()

dict_keys(['330714296593777210386804356517366745081', '204826840445410897858355357525897940103', '17154838837715859465403535134173639142'])

In [10]:
leg_names = []
for name in os.listdir(orig_dir):
    if name.endswith(".json"):
        continue
    leg_names.append(name)
leg_names

['easy_rbfe_lig_ejm_31_solvent_lig_ejm_47_solvent',
 'easy_rbfe_lig_ejm_31_complex_lig_ejm_47_complex',
 'easy_rbfe_lig_ejm_31_solvent_lig_ejm_46_solvent',
 'easy_rbfe_lig_ejm_31_complex_lig_ejm_46_complex',
 'easy_rbfe_lig_ejm_46_complex_lig_jmc_27_complex',
 'easy_rbfe_lig_ejm_46_solvent_lig_jmc_27_solvent',
 'easy_rbfe_lig_ejm_31_complex_lig_ejm_50_complex',
 'easy_rbfe_lig_ejm_31_solvent_lig_ejm_50_solvent',
 'easy_rbfe_lig_ejm_46_solvent_lig_jmc_23_solvent',
 'easy_rbfe_lig_ejm_31_complex_lig_ejm_48_complex',
 'easy_rbfe_lig_ejm_46_complex_lig_jmc_23_complex',
 'easy_rbfe_lig_ejm_31_solvent_lig_ejm_48_solvent',
 'easy_rbfe_lig_ejm_42_solvent_lig_ejm_43_solvent',
 'easy_rbfe_lig_ejm_42_complex_lig_ejm_43_complex',
 'easy_rbfe_lig_ejm_46_solvent_lig_jmc_28_solvent',
 'easy_rbfe_lig_ejm_46_complex_lig_jmc_28_complex',
 'easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex',
 'easy_rbfe_lig_ejm_31_solvent_lig_ejm_42_solvent']

In [15]:
! rm -rf $new_dir

In [16]:
for leg in leg_names:
# leg = leg_names[0]
    json_data = load_json(orig_dir/f"{leg}.json")
    srckey_to_protocol = {}
    srckey_to_units = {}
    
    ## collect results on a per-replicate basis
    for k in json_data['protocol_result']['data']:    
        rep_source_key = json_data['protocol_result']['data'][k][0]['source_key']
        srckey_to_protocol[rep_source_key] = json_data['protocol_result']
        
    for k in json_data['unit_results']:
        rep_source_key = json_data['unit_results'][k]['source_key']
        srckey_to_units[rep_source_key] = json_data['unit_results'][k]
    
    assert srckey_to_protocol.keys() == srckey_to_units.keys()
    
    ## write to the new directory
    for n, sk in enumerate(sorted(srckey_to_protocol.keys())):
        rep_dir = new_dir/f"replicate_{n}"
        os.makedirs(rep_dir/leg)
    
        # build up the data for this replicate
        replicate_data = {'estimate':'NA',
                          'uncertainty':'NA',
                          'protocol_result':srckey_to_protocol[sk],
                          'unit_results':srckey_to_units[sk]}
    
        # write!
        dump_json(replicate_data, rep_dir/f"{leg}.json")
        working_dir_name = f"shared_{sk}_attempt_0"
        ## TODO: make this work for arbitrary number of attempts 
        # os.symlink(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)
        shutil.copytree(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)


0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
