In [37]:
import syft as sy
from syft import ActionObject
from syft.service.action.action_data_empty import ActionFileData
from collections import defaultdict

Start the local stack (with the blob-storage profile enabled) before running this notebook:

`docker compose --profile blob-storage --file docker-compose.multinode.yml --file docker-compose.dev.yml up`

In [None]:
# Log in to the node served at localhost:80; `client` is used by every later cell.
# NOTE(review): the e-mail/password are hard-coded here — presumably the development
# defaults of the docker-compose stack; confirm, and never commit real credentials.
client = sy.login(url="http://localhost:80", email="info@openmined.org", password="changethis")

In [3]:
client.worker.start_workers(n=3)

In [4]:
client.worker.list()

In [13]:
client.users

In [14]:
# Upload the same scenario file twice and keep the uploaded data handle of each copy.
scenario_file_handles = [
    sy.ActionObject.from_path(path="scenario_data.jsonl").send(client).syft_action_data
    for _ in range(2)
]
# Wrap the list of handles in a single ActionObject and send it to get a pointer.
scenario_obj = ActionObject.from_obj(scenario_file_handles)
scenario_files_ptr = scenario_obj.send(client)

In [15]:
# TODO: fix the way we send a list of files
# Upload the same input file twice and keep the uploaded data handle of each copy.
input_file_handles = [
    sy.ActionObject.from_path("short_input.jsonl").send(client).syft_action_data
    for _ in range(2)
]
# Wrap the list of handles in a single ActionObject and send it to get a pointer.
input_obj = ActionObject.from_obj(input_file_handles)
input_files_ptr = input_obj.send(client)

# Syft functions

In [16]:
@sy.syft_function()
def compute_document_data_overlap(scenario_file, input_files, n):
    """Find which scenario instances share an n-gram with any input document.

    Args:
        scenario_file: Object exposing ``iter_lines()``. Each line is a JSON
            object with a ``scenario_key`` (holding ``scenario_spec.args.subject``
            and ``split``) and a list of ``instances``; every instance has an
            ``id``, an ``input`` string and a list of ``references`` strings.
        input_files: Iterable of objects exposing ``iter_lines()``. Each line is
            a JSON object with a ``text`` field.
        n: N-gram length, in tokens, used for matching.

    Returns:
        A 3-tuple ``(stats_key_to_input_ids, stats_key_to_reference_ids,
        stats_key_counts)`` where a stats key is ``"<subject>_<split>_<n>"``:
        the first two map a stats key to the set of instance ids whose input /
        references share at least one n-gram with some document, and the third
        maps a stats key to the number of instances of that scenario.
    """
    print("starting overlap computation")

    # Imports live inside the body — presumably because the function source is
    # shipped to and executed on a remote worker; confirm before hoisting them.
    from nltk import ngrams
    from collections import defaultdict
    from string import punctuation
    import re, json

    # Tokens are separated by any run of whitespace and/or ASCII punctuation.
    # Compiled once and shared by indexing and document scanning.
    token_separator = re.compile(r"[\s{}]+".format(re.escape(punctuation)))

    def create_ngram_index(light_scenarios, n_values, stats_key_counts):
        """Map every n-gram of every instance to the entries it occurs in.

        Entries are ``(stats_key, instance_id, part)`` tuples, with ``part``
        being ``'input'`` or ``'references'``. Tuples are used instead of a
        ``'+'``-joined string so ids/keys containing ``'+'`` cannot break
        decoding. ``stats_key_counts`` is filled in place.
        """
        ngram_index = {size: {} for size in n_values}
        for scenario in light_scenarios:
            instances = scenario['instances']
            for size in n_values:
                stats_key = scenario['scenario_key'] + '_' + str(size)
                stats_key_counts[stats_key] = len(instances)
                index_for_size = ngram_index[size]
                for instance in instances:
                    instance_id = instance['id']
                    input_tokens = token_separator.split(instance['input'].lower())
                    for input_ngram in ngrams(input_tokens, size):
                        index_for_size.setdefault(input_ngram, set()).add(
                            (stats_key, instance_id, 'input')
                        )

                    # compute reference ngrams
                    for reference in instance['references']:
                        reference_tokens = token_separator.split(reference.lower())
                        for reference_ngram in ngrams(reference_tokens, size):
                            index_for_size.setdefault(reference_ngram, set()).add(
                                (stats_key, instance_id, 'references')
                            )
        return ngram_index

    # # SETUP
    print("preparing scenarios and creating indexes")
    light_scenarios = []
    for light_scenario_json in scenario_file.iter_lines():
        light_scenario_dict: dict = json.loads(light_scenario_json)

        light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
        subject_spec = light_scenario_key_dict["scenario_spec"]['args']['subject']
        light_scenario_key = subject_spec + '_' + light_scenario_key_dict["split"]
        # Keep only the fields needed for the overlap computation.
        light_instances = [
            {
                'input': instance_dict['input'],
                'references': instance_dict['references'],
                'id': instance_dict["id"]
            }
            for instance_dict in light_scenario_dict["instances"]
        ]
        light_scenarios.append({'scenario_key': light_scenario_key, 'instances': light_instances})

    stats_key_counts = defaultdict(int)

    ngram_index = create_ngram_index(
        light_scenarios=light_scenarios, n_values=[n], stats_key_counts=stats_key_counts
    )

    stats_key_to_input_ids = defaultdict(set)
    stats_key_to_reference_ids = defaultdict(set)
    print("computing overlap")

    for input_file in input_files:
        for line in input_file.iter_lines():
            document = json.loads(line)["text"]
            document_tokens = token_separator.split(document.lower())
            for size, index_for_size in ngram_index.items():
                for document_ngram in ngrams(document_tokens, size):
                    # Unknown n-grams contribute nothing.
                    for stats_key, instance_id, part in index_for_size.get(document_ngram, ()):
                        if part == "input":
                            stats_key_to_input_ids[stats_key].add(instance_id)
                        elif part == "references":
                            stats_key_to_reference_ids[stats_key].add(instance_id)
    print("done")

    return stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts

In [17]:
client.code.submit(compute_document_data_overlap)

In [18]:
@sy.syft_function_single_use(input_files=input_files_ptr, scenario_files=scenario_files_ptr)
def main_function(domain, input_files, scenario_files):
    """Launch one `compute_document_data_overlap` sub-job per scenario file.

    Every sub-job receives the complete `input_files` collection plus a single
    scenario file and one n-gram size. The launched jobs are collected locally
    but not returned; this function always returns None.
    """
    ngram_sizes = [5, 9, 13]
    # Only the first configured size is actually run.
    selected_sizes = ngram_sizes[:1]

    launched_jobs = []
    for ngram_size in selected_sizes:
        for scenario_file in scenario_files:
            launched_jobs.append(
                domain.launch_job(
                    compute_document_data_overlap,
                    scenario_file=scenario_file,
                    input_files=input_files,
                    n=ngram_size,
                )
            )

    return None


In [19]:
client.code.request_code_execution(main_function)

In [20]:
client.requests[-1].approve()

Would you like to proceed? [y/n]: y
Request approved for domain high-side


In [21]:
job = client.code.main_function(input_files=input_files_ptr, scenario_files=scenario_files_ptr, blocking=False)

# Get results

In [25]:
job

```python
class Job:
    id: UID = d8a2c1f48d814f81b623e85a70eacb17
    status: completed
    has_parent: False
    result: ActionDataEmpty UID: dd13e556f95543b19c121b6e45b1d92f <None>
    logs:

0 
JOB COMPLETED
    
```

In [27]:
job.subjobs

In [28]:
job.subjobs[1]

```python
class Job:
    id: UID = d2caa897c3a54720a659477b748cb49e
    status: completed
    has_parent: True
    result: ActionDataEmpty UID: e70442648358424faf23d0660626f71d <None>
    logs:

0 starting overlap computation
1 preparing scenarios and creating indexes
2 computing overlap
3 done
JOB COMPLETED
    
```

In [29]:
job.wait().get()

In [30]:
job.subjobs

In [31]:
job.subjobs[0].logs()

starting overlap computation
preparing scenarios and creating indexes
computing overlap
done



In [32]:
# Block on every sub-job in turn and collect its returned value.
results = [subjob.wait().get() for subjob in job.subjobs]

In [33]:
#stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts
results

In [34]:
results[0]

(defaultdict(set,
             {'philosophy_test_5': {'id328'}, 'philosophy_valid_5': {'id12'}}),
 defaultdict(set, {}),
 defaultdict(int,
             {'philosophy_train_5': 5,
              'philosophy_valid_5': 34,
              'philosophy_test_5': 311,
              'anatomy_train_5': 5,
              'anatomy_valid_5': 14,
              'anatomy_test_5': 135}))

# Aggregate

In [35]:
# Each entry of `results` is a 3-tuple
# (stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts);
# transpose the list of tuples into three parallel sequences.
if results:
    stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts = zip(*results)
else:
    # zip(*[]) yields nothing to unpack, so fall back to empty sequences.
    stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts = (), (), ()

total_input_ids = defaultdict(set)
total_reference_ids = defaultdict(set)
total_stats_key_counts = defaultdict(int)

# Instance counts are summed across sub-jobs.
for counts_by_key in stats_key_counts:
    for key, val in counts_by_key.items():
        total_stats_key_counts[key] += val

# Overlapping ids are unioned across sub-jobs; the defaultdict supplies the
# empty set the first time a key is seen.
for ids_by_key in stats_key_to_input_ids:
    for key, ids in ids_by_key.items():
        total_input_ids[key].update(ids)

for ids_by_key in stats_key_to_reference_ids:
    for key, ids in ids_by_key.items():
        total_reference_ids[key].update(ids)

all_data_overlap_stats = []
for stats_key, count in total_stats_key_counts.items():
    # Stats keys have the form "<subject>_<split>_<n>". The subject itself may
    # contain underscores, so split from the right and at most twice.
    subject, split, n_str = stats_key.rsplit('_', 2)
    all_data_overlap_stats.append({
        'data_overlap_stats_key': {
            'light_scenario_key': {'subject': subject, 'split': split},
            'overlap_protocol_spec': {'n': int(n_str)}
        },
        'num_instances': count,
        'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),
        'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),
    })


In [36]:
from pprint import pprint
pprint(all_data_overlap_stats)

[{'data_overlap_stats_key': {'light_scenario_key': {'split': 'train',
                                                    'subject': 'philosophy'},
                             'overlap_protocol_spec': {'n': 5}},
  'instance_ids_with_overlapping_input': [],
  'instance_ids_with_overlapping_reference': [],
  'num_instances': 10},
 {'data_overlap_stats_key': {'light_scenario_key': {'split': 'valid',
                                                    'subject': 'philosophy'},
                             'overlap_protocol_spec': {'n': 5}},
  'instance_ids_with_overlapping_input': ['id12'],
  'instance_ids_with_overlapping_reference': [],
  'num_instances': 68},
 {'data_overlap_stats_key': {'light_scenario_key': {'split': 'test',
                                                    'subject': 'philosophy'},
                             'overlap_protocol_spec': {'n': 5}},
  'instance_ids_with_overlapping_input': ['id328'],
  'instance_ids_with_overlapping_reference': [],
  'num_instances': 