In [1]:
import syft as sy
from syft.store.blob_storage import BlobStorageConfig, BlobStorageClientConfig
from syft.store.blob_storage.seaweedfs import SeaweedFSClient, SeaweedFSClientConfig
from syft import ActionObject
from syft.service.action.action_data_empty import ActionFileData



In [2]:
# Launch a dev-mode node named "test-domain-helm2" with six consumers
# (reset=True presumably wipes any previous state — confirm), then log in.
# NOTE(review): the login email/password are hardcoded here; presumably these
# are the default dev-mode admin credentials — confirm before sharing.
node = sy.orchestra.launch(name="test-domain-helm2", dev_mode=True, reset=True, n_consumers=6)
client = node.login(email="info@openmined.org", password="changethis")

CREATING A PRODUCER ON tcp://127.0.0.1:34467
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
CREATING A CONSUMER ON tcp://127.0.0.1:36427
spawning thread
Logged into <test-domain-helm2: High side Domain> as <info@openmined.org>


```bash
docker run --entrypoint /bin/sh -p 8333:8333 -p 8888:8888 chrislusf/seaweedfs -c "echo 's3.configure -access_key admin -secret_key admin -user iam -actions Read,Write,List,Tagging,Admin -apply' | weed shell > /dev/null 2>&1 & weed server -s3 -s3.port=8333 -master.volumeSizeLimitMB=2048"
```

In [3]:
# Blob storage configuration backed by SeaweedFS. The port and the
# access/secret keys match the `-s3.port=8333` and
# `s3.configure -access_key admin -secret_key admin` settings of the docker
# command shown before this cell, so that container must be running first.
# NOTE(review): keys are hardcoded; acceptable only for this local test setup.
blob_config = BlobStorageConfig(client_type=SeaweedFSClient,
                                client_config=SeaweedFSClientConfig(host="http://0.0.0.0",
                                                                    port="8333",
                                                                    access_key="admin",
                                                                    secret_key="admin",
                                                                    bucket_name="test_bucket",
                                                                    region="us-east-1")
)

In [4]:
# Initialise blob storage on the launched node with the SeaweedFS config.
node.python_node.init_blob_storage(blob_config)

In [5]:
# Build an ActionObject from the scenario JSONL file and send it to the node;
# `scenario_ptr` is what later cells pass to the remote function.
# NOTE(review): this is an absolute, user-specific path — the notebook will not
# run on another machine until it is replaced with a configurable location.
scenario_obj = sy.ActionObject.from_path(path="/home/teo/helm/scripts/data_overlap/scenario_data.jsonl")
scenario_ptr = scenario_obj.send(client)

In [6]:
# Build an ActionObject from the documents file (path is relative to the
# notebook's working directory) and send it to the node.
input_obj = sy.ActionObject.from_path("short_input.jsonl")
input_ptr = input_obj.send(client)

In [7]:
# Print every line of the uploaded input file, read back through the pointer.
for line in input_ptr.syft_action_data.iter_lines():
    print(line)



In [8]:
@sy.syft_function()
def compute_document_data_overlap(document, ngram_index):
    """Find which indexed scenario instances share n-grams with one document.

    Args:
        document: Text of a single document. It is lower-cased and split on
            runs of whitespace/punctuation before n-grams are taken.
        ngram_index: Mapping ``n -> {ngram -> iterable of entry keys}``, where
            every entry key has the form ``"<stats_key>+<id>+<part>"``.

    Returns:
        A pair ``(input_ids, reference_ids)`` of ``defaultdict(set)`` objects.
        Each maps a stats key to the ids of instances whose ``"input"``
        (respectively ``"references"``) n-grams occur in the document.
    """
    import re
    from collections import defaultdict
    from string import punctuation

    from nltk import ngrams

    tokenizer = re.compile(r"[\s{}]+".format(re.escape(punctuation)))
    tokens = tokenizer.split(document.lower())

    input_hits = defaultdict(set)
    reference_hits = defaultdict(set)

    for n in ngram_index.keys():
        index_for_n = ngram_index[n]
        for gram in ngrams(tokens, n):
            if gram not in index_for_n:
                continue
            for entry_key in index_for_n[gram]:
                stats_key, instance_id, part = entry_key.split("+")
                if part == "input":
                    input_hits[stats_key].add(instance_id)
                elif part == "references":
                    reference_hits[stats_key].add(instance_id)

    return input_hits, reference_hits

In [9]:
# Submit the per-document helper to the node; main_function later passes it
# to domain.launch_job by name.
client.code.submit(compute_document_data_overlap)

In [10]:
@sy.syft_function_single_use(input_file=input_ptr, scenario_file=scenario_ptr)
def main_function(domain, input_file, scenario_file):
    """Compute data-overlap statistics between documents and scenarios.

    Builds an n-gram index over every scenario instance, launches one
    ``compute_document_data_overlap`` job per input document, and merges the
    per-document results into one record per (scenario, n) pair.

    Args:
        domain: Object exposing ``launch_job``; used to start one job per
            document.
        input_file: Object exposing ``iter_lines()``; every line is a JSON
            object with a ``"text"`` field.
        scenario_file: Object exposing ``iter_lines()``; every line is a JSON
            object with a ``"scenario_key"`` (holding
            ``scenario_spec.args.subject`` and ``split``) and a list of
            ``"instances"`` (each with ``input``, ``references`` and ``id``).

    Returns:
        A list of dicts with the keys ``data_overlap_stats_key``,
        ``num_instances``, ``instance_ids_with_overlapping_input`` and
        ``instance_ids_with_overlapping_reference``.
    """
    # Imports are kept inside the body as in the original; presumably required
    # because the function is executed on the node — confirm.
    import re
    from string import punctuation
    import json
    from nltk import ngrams
    from collections import defaultdict

    N = [5, 9, 13]
    r = re.compile(r"[\s{}]+".format(re.escape(punctuation)))

    def create_ngram_index(light_scenarios, n_values, stats_key_counts):
        """Index every input/reference n-gram and record instance counts.

        Fills ``stats_key_counts`` in place with ``stats_key -> number of
        instances`` and returns ``{n: {ngram: set of "<stats_key>+<id>+<part>"}}``.
        """
        ngram_index = {n: {} for n in n_values}
        for scenario in light_scenarios:
            for n in n_values:
                stats_key = scenario['scenario_key'] + '_' + str(n)
                stats_key_counts[stats_key] = len(scenario['instances'])
                index_for_n = ngram_index[n]
                for instance in scenario['instances']:
                    instance_id = instance['id']

                    # compute input ngrams
                    input_tokens = r.split(instance['input'].lower())
                    for input_ngram in ngrams(input_tokens, n):
                        index_for_n.setdefault(input_ngram, set()).add(
                            stats_key + '+' + instance_id + '+' + 'input'
                        )

                    # compute reference ngrams
                    for reference in instance['references']:
                        reference_unigrams = r.split(reference.lower())
                        for reference_ngram in ngrams(reference_unigrams, n):
                            index_for_n.setdefault(reference_ngram, set()).add(
                                stats_key + '+' + instance_id + '+' + 'references'
                            )
        return ngram_index

    def merge_ids(total, per_document):
        """Union one document's ``stats_key -> ids`` mapping into ``total``."""
        for key in per_document:
            total[key].update(per_document[key])

    # SETUP
    light_scenarios = []
    for light_scenario_json in scenario_file.iter_lines():
        light_scenario_dict: dict = json.loads(light_scenario_json)

        light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
        subject_spec = light_scenario_key_dict["scenario_spec"]['args']['subject']
        light_scenario_key = subject_spec + '_' + light_scenario_key_dict["split"]
        light_instances = [
            {
                'input': instance_dict['input'],
                'references': instance_dict['references'],
                'id': instance_dict["id"]
            }
            for instance_dict in light_scenario_dict["instances"]
        ]
        light_scenarios.append({'scenario_key': light_scenario_key, 'instances': light_instances})

    stats_key_counts = defaultdict(int)
    ngram_index = create_ngram_index(
        light_scenarios=light_scenarios, n_values=N, stats_key_counts=stats_key_counts
    )

    # BATCH PROCESSING: one job per document
    jobs = []
    for line in input_file.iter_lines():
        document = json.loads(line)["text"]
        batch_job = domain.launch_job(compute_document_data_overlap,
            document=document,
            ngram_index=ngram_index,
        )
        jobs.append(batch_job)

    # AGGREGATION: wait for every job first, then fetch and merge the results
    finished_jobs = [job.wait() for job in jobs]
    results = [finished.get() for finished in finished_jobs]

    total_input_ids = defaultdict(set)
    total_reference_ids = defaultdict(set)
    for input_ids_by_key, reference_ids_by_key in results:
        merge_ids(total_input_ids, input_ids_by_key)
        merge_ids(total_reference_ids, reference_ids_by_key)

    all_data_overlap_stats = []
    for stats_key, count in stats_key_counts.items():
        # stats_key is "<subject>_<split>_<n>". Split from the right so that a
        # subject containing underscores stays intact; a plain split('_')
        # would raise ValueError for such subjects.
        subject, split, n_str = stats_key.rsplit('_', 2)
        all_data_overlap_stats.append({
            'data_overlap_stats_key': {
                'light_scenario_key': {'subject': subject, 'split': split},
                'overlap_protocol_spec': {'n': int(n_str)}
            },
            'num_instances': count,
            'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),
            'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),
        })

    return all_data_overlap_stats

In [11]:
# Request execution of main_function, then approve the last request in
# client.requests.
# NOTE(review): [-1] assumes the request created on the previous line is the
# most recent one — confirm if other requests may be pending.
client.code.request_code_execution(main_function)
client.requests[-1].approve()

Request approved for domain test-domain-helm2


In [12]:
# Start main_function with blocking=False; the returned `job` handle is what
# the following cells call logs() and wait() on.
job = client.code.main_function(input_file=input_ptr, scenario_file=scenario_ptr, blocking=False)


In [13]:
# Show the logs of the running job.
job.logs()




In [14]:
# Wait for the job, then fetch its result (the value returned by main_function).
job.wait().get()

PTR OK: True
PTR OK: True
What: syft.service.request.request.Request
Args: {'document': <UID: c7afe443d56f48809331e22d18522865>, 'ngram_index': <UID: b036f5e8c1fb42249743341ab23513c5>}
PTR OK: True
PTR OK: True
What: syft.service.request.request.Request
Args: {'document': <UID: 3b03580298314af3b45e1e470e2ed32e>, 'ngram_index': <UID: 0235098138ac4b738ae89b9cacb3dc51>}
PTR OK: True
PTR OK: True
What: syft.service.request.request.Request
Args: {'document': <UID: 46745453ed2849e99a876a9dc092e39d>, 'ngram_index': <UID: 1228b4dd25b846c98627d5eac10a4ea7>}
