In [1]:
import syft as sy
import os
from syft import ActionObject
from collections import defaultdict



Start the domain with the following command before running the cells below:

```
hagrid launch domain to docker:8080 --dev --verbose
```

In [2]:
# Log in to the locally running domain (presumably with the default development credentials).
# NOTE(review): the launch command shown above targets port 8080 but this URL uses port 80 — confirm.
client = sy.login(url="http://localhost:80", email="info@openmined.org", password="changethis")

Logged into <high-side: High side Domain> as <info@openmined.org>


# Mount the Azure storage container that holds the HELM data

In [3]:
# Mount options for the Azure container; the account key is read from the
# environment so it never appears in the notebook itself.
azure_mount_options = dict(
    account_name="helmprojectstorage",
    container_name="helm",
    account_key=os.environ["HELM_STORAGE_ACCOUNT_KEY"],
    bucket_name="helmazurebucket",
)
client.api.services.blob_storage.mount_azure(**azure_mount_options)

In [4]:
# Fetch the file entries of the bucket named 'helmazurebucket' (the bucket_name used when mounting).
blob_files = client.api.services.blob_storage.get_files_from_bucket(bucket_name='helmazurebucket')

In [5]:
# Display the returned file listing.
blob_files

# Start workers

In [6]:
# Request three workers on the domain (presumably these run the jobs launched below — confirm).
client.worker.start_workers(n=3)

In [7]:
# Display the worker listing reported by the domain.
client.worker.list()

# Create Dataset

In [None]:
# train_file = sy.ActionObject.from_path("short_input.jsonl").send(client).syft_action_data
# scenario_file = scenario_obj = sy.ActionObject.from_path(path="scenario_data.jsonl").send(client).syft_action_data

In [8]:
# Pick the first blob whose name matches each pattern. Using next() with a
# default avoids a bare IndexError when the bucket does not contain the file.
train_file = next((f for f in blob_files if "train-00" in f.file_name), None)
scenario_file = next((f for f in blob_files if "scenario_data" in f.file_name), None)
if train_file is None or scenario_file is None:
    raise FileNotFoundError(
        "Expected blob files whose names contain 'train-00' and 'scenario_data' "
        "in the mounted bucket, but at least one of them was not found."
    )

In [9]:
# One asset per blob file: the real data is a single-element list holding the
# file entry, and the mock is an empty action object.
helm_dataset = sy.Dataset(
    name="Helm Dataset",
    asset_list=[
        sy.Asset(
            name=asset_name,
            data=sy.ActionObject.from_obj([blob_file]),
            mock=sy.ActionObject.empty(),
        )
        for asset_name, blob_file in (
            ("helm train data", train_file),
            ("helm test data", scenario_file),
        )
    ],
)

In [10]:
# Upload the dataset to the domain; this prompts for confirmation before proceeding.
client.upload_dataset(helm_dataset)

Would you like to proceed? [y/n]: y



  0%|                                                             | 0/2 [00:00<?, ?it/s]

Uploading: helm train data



 50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.94it/s]

Uploading: helm test data


100%|█████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.50it/s]


In [11]:
# Look the uploaded dataset up by name and grab its two assets; these are the
# objects passed as inputs to main_function further down.
helm_ds = client.datasets["Helm Dataset"]
helm_train_files = helm_ds.assets["helm train data"]
helm_test_files = helm_ds.assets["helm test data"]

# Syft functions

In [12]:
@sy.syft_function()
def compute_document_data_overlap(domain, scenario_file, input_files, n):
    """Find scenario instances whose n-grams also occur in the input documents.

    Args:
        domain: Object exposing ``init_progress`` and ``set_progress``, used
            to report how many bytes of the input have been processed.
        scenario_file: File object exposing ``file_size`` and
            ``iter_lines(progress=True)``, which yields ``(bytes_read, line)``
            pairs. Each line is a JSON object with a ``scenario_key`` (holding
            ``scenario_spec`` and ``split``) and a list of ``instances``, each
            with ``input``, ``references`` and ``id``.
        input_files: Sequence of file objects with the same interface as
            ``scenario_file``. Each line is a JSON object with a ``text`` field.
        n: The n-gram length to index and match.

    Returns:
        A tuple ``(stats_key_to_input_ids, stats_key_to_reference_ids,
        stats_key_counts)``. The first two map a stats key
        (``<scenario_spec>_<split>_<n>``) to the set of instance ids whose
        input / references share at least one n-gram with a document; the
        third maps each stats key to its number of instances.
    """
    print("starting overlap computation")

    from nltk import ngrams
    from collections import defaultdict
    from string import punctuation
    import re, json
    import time

    # Tokens are separated by any run of whitespace and/or ASCII punctuation.
    # Compiled once and shared by the index builder and the document scan.
    r = re.compile(r"[\s{}]+".format(re.escape(punctuation)))

    # Only the first MAX_LINES_PER_FILE lines of each input file are scanned.
    # NOTE(review): this looks like a debugging cap — confirm before a full run.
    MAX_LINES_PER_FILE = 10000

    def create_ngram_index(light_scenarios, n_values, stats_key_counts):
        """Map every n-gram of every instance to the places it occurs.

        Each index entry is a ``(stats_key, instance_id, part)`` tuple, where
        ``part`` is ``'input'`` or ``'references'``. Tuples are used instead of
        a delimiter-joined string so that keys or ids containing the delimiter
        cannot break decoding. ``stats_key_counts`` is filled in place.
        """
        ngram_index = {n_value: defaultdict(set) for n_value in n_values}
        for i, scenario in enumerate(light_scenarios):
            if i%20 == 0:
                print(f"n_gram indexing progress: {(i/len(light_scenarios))*100:.2f}%")
            for n_value in n_values:
                stats_key = scenario['scenario_key'] + '_' + str(n_value)
                stats_key_counts[stats_key] = len(scenario['instances'])
                index_for_n = ngram_index[n_value]
                for instance in scenario['instances']:
                    instance_id = instance['id']
                    input_tokens = r.split(instance['input'].lower())
                    for input_ngram in ngrams(input_tokens, n_value):
                        index_for_n[input_ngram].add((stats_key, instance_id, 'input'))

                    # compute reference ngrams
                    for reference in instance['references']:
                        reference_tokens = r.split(reference.lower())
                        for reference_ngram in ngrams(reference_tokens, n_value):
                            index_for_n[reference_ngram].add((stats_key, instance_id, 'references'))
        return ngram_index

    # SETUP
    print("preparing scenarios and creating indexes")
    start = time.time()
    light_scenarios = []
    for i, (bytes_read, light_scenario_json) in enumerate(scenario_file.iter_lines(progress=True)):
        if i % 20 == 0:
            print(f"scenario creation progress: {(bytes_read/scenario_file.file_size)*100:.2f}%")

        light_scenario_dict: dict = json.loads(light_scenario_json)

        light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
        scenario_spec = str(light_scenario_key_dict["scenario_spec"])

        light_scenario_key = scenario_spec + '_' + light_scenario_key_dict["split"]
        # Keep only the fields needed for indexing.
        light_instances = [
            {
                'input': instance_dict['input'],
                'references': instance_dict['references'],
                'id': instance_dict["id"]
            }
            for instance_dict in light_scenario_dict["instances"]
        ]
        light_scenarios.append({'scenario_key': light_scenario_key, 'instances': light_instances})
    print(f"Finished creating scenarios ({time.time()-start}s)")

    print("Creating indexes")

    start = time.time()
    stats_key_counts = defaultdict(int)
    ngram_index = create_ngram_index(
        light_scenarios=light_scenarios, n_values=[n], stats_key_counts=stats_key_counts
    )
    print(f"Finished creating indexes ({time.time()-start}s)")

    stats_key_to_input_ids = defaultdict(set)
    stats_key_to_reference_ids = defaultdict(set)
    print("computing overlap")
    start = time.time()

    # Progress is sized from the first input file only.
    domain.init_progress(input_files[0].file_size)

    for input_file in input_files:
        for i, (bytes_read, line) in enumerate(input_file.iter_lines(progress=True)):
            if i%1000 == 0:
                print(f"computing overlap progress: {(bytes_read / input_file.file_size) * 100:.2f}%")
                domain.set_progress(bytes_read)
            if i == MAX_LINES_PER_FILE:
                break
            document = json.loads(line)["text"]
            document_tokens = r.split(document.lower())
            for n_value, index_for_n in ngram_index.items():
                for document_ngram in ngrams(document_tokens, n_value):
                    # .get() avoids inserting empty sets into the defaultdict on a miss.
                    for stats_key, instance_id, part in index_for_n.get(document_ngram, ()):
                        if part == "input":
                            stats_key_to_input_ids[stats_key].add(instance_id)
                        elif part == "references":
                            stats_key_to_reference_ids[stats_key].add(instance_id)
    print(f"Finished computing overlap ({time.time()-start}s)")
    print("done")

    return stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts

In [13]:
# Submit the overlap function to the domain; main_function below refers to it
# by name when launching jobs.
client.code.submit(compute_document_data_overlap)

In [14]:
@sy.syft_function_single_use(input_files=helm_train_files, scenario_files=helm_test_files)
def main_function(domain, input_files, scenario_files):
    """Launch one overlap job per scenario file for the first n-gram size.

    Only the first entry of the candidate n-gram sizes is used. The launched
    jobs are collected locally but not returned; the function returns None.
    """
    ngram_sizes = [5, 9, 13]
    launched_jobs = []
    for ngram_size in ngram_sizes[:1]:
        for scenario_file in scenario_files:
            # Kept as a standalone assignment, mirroring the original call shape.
            subjob = domain.launch_job(
                compute_document_data_overlap,
                scenario_file=scenario_file,
                input_files=input_files,
                n=ngram_size
            )
            launched_jobs.append(subjob)

    return None


In [15]:
# File a request to execute main_function on the domain.
client.code.request_code_execution(main_function)

In [16]:
# Approve the last request in the list; this prompts for confirmation.
# NOTE(review): assumes the request filed for main_function is the last entry — confirm.
client.requests[-1].approve()

Would you like to proceed? [y/n]: y
Request approved for domain high-side


In [17]:
# Start main_function without blocking; the returned job object is inspected in the cells below.
job = client.code.main_function(input_files=helm_train_files,
                                scenario_files=helm_test_files,
                                blocking=False)

# Inspect Jobs and get results

In [18]:
# Display the parent job (status, result and logs).
job

```python
class Job:
    id: UID = 4686a91d787a4a49ab811e79915d7382
    status: completed
    has_parent: False
    result: ActionDataEmpty UID: 3b9dc59f63bf493cbe3d39360d0f5627 <None>
    logs:

0 
JOB COMPLETED
    
```

In [19]:
# job.subjobs

In [48]:
# Display the first subjob launched by main_function.
job.subjobs[0]

```python
class Job:
    id: UID = 7e0f18c3c2be474ba3a6997201610388
    status: processing
    has_parent: True
    result: None
    logs:

0 starting overlap computation
1 preparing scenarios and creating indexes
2 scenario creation progress: 0.00%
3 scenario creation progress: 1.51%
4 scenario creation progress: 8.62%
5 scenario creation progress: 16.41%
6 scenario creation progress: 20.85%
7 scenario creation progress: 32.16%
8 scenario creation progress: 48.00%
9 scenario creation progress: 61.55%
10 scenario creation progress: 71.17%
11 scenario creation progress: 87.95%
12 scenario creation progress: 95.40%
13 scenario creation progress: 97.74%
14 scenario creation progress: 100.00%
15 Finished creating scenarios (1838.3025119304657s)
16 Creating indexes
17 n_gram indexing progress: 0.00%
18 n_gram indexing progress: 8.30%
19 n_gram indexing progress: 16.60%
20 n_gram indexing progress: 24.90%
21 n_gram indexing progress: 33.20%
22 n_gram indexing progress: 41.49%
23 n_gram indexing progress: 49.79%
24 n_gram indexing progress: 58.09%
25 n_gram indexing progress: 66.39%
26 n_gram indexing progress: 74.69%
27 n_gram indexing progress: 82.99%
28 n_gram indexing progress: 91.29%
29 n_gram indexing progress: 99.59%
30 Finished creating indexes (658.1834075450897s)
31 computing overlap
    
```

In [36]:
# job.wait().get()

In [45]:
# Print the log lines emitted so far by the first subjob.
job.subjobs[0].logs()

starting overlap computation
preparing scenarios and creating indexes
scenario creation progress: 0.00%
scenario creation progress: 1.51%
scenario creation progress: 8.62%
scenario creation progress: 16.41%
scenario creation progress: 20.85%
scenario creation progress: 32.16%
scenario creation progress: 48.00%
scenario creation progress: 61.55%
scenario creation progress: 71.17%
scenario creation progress: 87.95%
scenario creation progress: 95.40%
scenario creation progress: 97.74%
scenario creation progress: 100.00%
Finished creating scenarios (1838.3025119304657s)
Creating indexes
n_gram indexing progress: 0.00%
n_gram indexing progress: 8.30%



In [42]:
# Wait on each subjob and collect its result (presumably blocks until every subjob has finished — confirm).
results = [j.wait().get() for j in job.subjobs]

In [43]:
# Each entry is the tuple returned by compute_document_data_overlap:
# (stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts)
results

In [44]:
# results[0]

# Aggregate

In [55]:
# Each subjob result is a (stats_key_to_input_ids, stats_key_to_reference_ids,
# stats_key_counts) tuple; transpose the list of results into three sequences.
# zip(*[]) yields nothing to unpack, so an empty result list is handled explicitly.
if results:
    stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts = zip(*results)
else:
    stats_key_to_input_ids = stats_key_to_reference_ids = stats_key_counts = ()

total_input_ids = defaultdict(set)
total_reference_ids = defaultdict(set)
total_stats_key_counts = defaultdict(int)

# Sum instance counts per stats key across all subjobs.
for counts_by_key in stats_key_counts:
    for key, val in counts_by_key.items():
        total_stats_key_counts[key] += val

# Union the overlapping instance ids per stats key; input and reference ids
# are merged the same way.
for ids_by_key in stats_key_to_input_ids:
    for key, ids in ids_by_key.items():
        total_input_ids[key].update(ids)

for ids_by_key in stats_key_to_reference_ids:
    for key, ids in ids_by_key.items():
        total_reference_ids[key].update(ids)

all_data_overlap_stats = []
for stats_key, count in total_stats_key_counts.items():
    # A stats key is '<scenario_spec>_<split>_<n>'. The scenario spec may itself
    # contain underscores, so split from the right.
    subject, split, n_str = stats_key.rsplit('_', 2)
    all_data_overlap_stats.append({
        'data_overlap_stats_key': {
            'light_scenario_key': {'scenario_spec': subject, 'split': split},
            'overlap_protocol_spec': {'n': int(n_str)}
        },
        'num_instances': count,
        'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),
        'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),
    })


In [56]:
# Pretty-print the aggregated overlap statistics.
from pprint import pprint
pprint(all_data_overlap_stats)

[{'data_overlap_stats_key': {'light_scenario_key': {'scenario_spec': "{'class_name': "
                                                                     "'helm.benchmark.scenarios.math_scenario.MATHScenario', "
                                                                     "'args': "
                                                                     "{'subject': "
                                                                     "'number_theory', "
                                                                     "'level': "
                                                                     '1, '
                                                                     "'use_official_examples': "
                                                                     'True, '
                                                                     "'use_chain_of_thought': "
                                                                     'False}}',
                          