In [1]:
# stdlib

# stdlib
import os
from pprint import pprint

# syft absolute
import syft as sy
from syft import ActionObject

Before running this notebook, start a local domain with:

```
hagrid launch domain to docker:8080 --dev --verbose
```

In [2]:
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks are
# the values this cell previously hardcoded, so behavior is unchanged when the
# variables are not set.
client = sy.login(
    url=os.environ.get("SYFT_DOMAIN_URL", "http://localhost:8080"),
    email=os.environ.get("SYFT_ADMIN_EMAIL", "info@openmined.org"),
    password=os.environ.get("SYFT_ADMIN_PASSWORD", "changethis"),
)

Logged into <test: High side Domain> as <info@openmined.org>


# Mount the Helm Azure storage container OR upload files

In [3]:
# Selects the data-loading path: when True, the next cell mounts the Azure
# container and builds the train/test file pointers from its contents.
mount = True

In [4]:
if mount:
    # The same bucket alias is used to mount the container and to list its files.
    helm_bucket_name = "helmazurebucket"

    # The storage account key is read from the environment, never hardcoded.
    client.api.services.blob_storage.mount_azure(
        account_name="helmprojectstorage",
        container_name="helm",
        account_key=os.environ["HELM_STORAGE_ACCOUNT_KEY"],
        bucket_name=helm_bucket_name,
    )
    blob_files = client.api.services.blob_storage.get_files_from_bucket(
        bucket_name=helm_bucket_name
    )
    display(blob_files)

    # The scenario (test) file is selected by its exact file name.
    test_files = [
        blob
        for blob in blob_files
        if blob.file_name == "filtered_scenario_data_new.jsonl"
    ]

    # SELECTING ONLY THE FIRST HERE FOR TESTING
    matching_train_files = [blob for blob in blob_files if "train-" in blob.file_name]
    train_files = matching_train_files[:1]

    # Wrap both file lists so they can be used as dataset asset data below.
    test_files_ptr = ActionObject.from_obj(test_files)
    train_files_ptr = ActionObject.from_obj(train_files)

If you don't want to mount the container, you need to upload the files instead. To upload them, you first need to download them. You can download the files from the link below.

You need a test set (filtered_scenario_data_new.jsonl) and at least one train file (train-00.jsonl)

https://portal.azure.com/#view/Microsoft_Azure_Storage/ContainerMenuBlade/~/overview/storageAccountId/%2Fsubscriptions%2Ff1ade722-e418-4647-8094-3f73664dd6a5%2FresourceGroups%2Fopenmined%2Fproviders%2FMicrosoft.Storage%2FstorageAccounts%2Fhelmprojectstorage/path/helm/etag/%220x8DBCF2D8F549160%22/defaultEncryptionScope/%24account-encryption-key/denyEncryptionScopeOverride~/false/defaultId//publicAccessVal/None

In [4]:
# Switches to the upload path: the next cell only uploads when mount is False.
# Note that running the notebook top to bottom executes the mount cell first
# (mount was True there) and then this path, which reassigns train_files_ptr
# and test_files_ptr.
mount = False

In [5]:
# Upload path: send local files to the domain instead of mounting the container.
# NOTE(review): these file names differ from the ones named in the text above
# (filtered_scenario_data_new.jsonl / train-00.jsonl) - confirm which local
# files are expected to exist next to this notebook.
if not mount:
    train_files_ptr = client.upload_files("short_input.jsonl")
    test_files_ptr = client.upload_files(
        "scenario_data.jsonl"
    )  # ONLY SELECTING A SINGLE FILE FOR TESTING

Uploading 1 file:
Uploading short_input.jsonl


Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.51it/s]
Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.17it/s]


Uploading 1 file:
Uploading scenario_data.jsonl


Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.61it/s]
Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 87.00it/s]


If you want to test whether you uploaded correctly, you can use the following:

In [6]:
# Disabled by default: change the condition to True to print the first line of
# the first train file and confirm that the uploaded data is readable.
if False:
    for x in train_files_ptr[0].syft_action_data.iter_lines():
        print(x)
        break  # one line is enough for a sanity check

# Create Dataset

In [7]:
# Dataset with one asset for the train files and one for the test (scenario)
# files. No mock data is available, so both assets use an empty mock.
#
# test_files_ptr is created exactly like train_files_ptr (ActionObject.from_obj
# in the mount path, client.upload_files in the upload path), so both pointers
# are passed through as-is; the test pointer is no longer wrapped a second time.
helm_dataset = sy.Dataset(
    name="Helm Dataset",
    asset_list=[
        sy.Asset(
            name="helm train data",
            data=train_files_ptr,
            mock=sy.ActionObject.empty(),
        ),
        sy.Asset(
            name="helm test data",
            data=test_files_ptr,
            mock=sy.ActionObject.empty(),
        ),
    ],
)

In [8]:
# Upload the dataset to the domain. The call asks for interactive confirmation
# (answer "y" at the prompt) before uploading the two assets.
client.upload_dataset(helm_dataset)

Would you like to proceed? [y/n]: 
Invalid response. Please enter Y or N.
Would you like to proceed? [y/n]: y



  0%|                                                                                                                 | 0/2 [00:00<?, ?it/s]

Uploading: helm train data



Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 32.22it/s][A
 50%|████████████████████████████████████████████████████▌                                                    | 1/2 [00:00<00:00,  1.44it/s]

Uploading: helm test data



Uploading progress: 100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.17it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.94it/s]


In [9]:
# Look the uploaded dataset up by name and keep handles to its two assets.
# These handles are what the code request and the job launch below receive as
# input_files / scenario_files.
helm_ds = client.datasets["Helm Dataset"]
helm_train_files = helm_ds.assets["helm train data"]
helm_test_files = helm_ds.assets["helm test data"]

# Start workers

In [10]:
# Update this to the latest tag
# Refer to: https://hub.docker.com/r/openmined/grid-backend/tags
om_backend_tag = "0.8.4-beta.15"

# Dockerfile that extends the grid backend image with nltk, which
# compute_document_data_overlap imports for n-gram extraction.
custom_dockerfile_str = f"""
FROM openmined/grid-backend:{om_backend_tag}
RUN pip install nltk
"""

# Worker image configuration built from the Dockerfile string above; it is
# passed to the worker pool request in the next cell.
image_config = sy.DockerWorkerConfig(
    dockerfile=custom_dockerfile_str,
    description="This image install nltk for tokenization and working with text based dataset.",
)

In [11]:
# Name of the worker pool; the syft function decorators below reference it so
# that the functions run on workers built from the nltk image.
worker_pool_name = "helm-workerpool"

# Ask the domain for the custom image and a pool of 3 workers in one request.
# The request still has to be approved (next cell).
request = client.worker_pools.create_image_and_pool_request(
    pool_name=worker_pool_name,
    num_workers=3,
    tag="helm/nltk-om-image:latest",
    config=image_config,
    reason="A worker pool with nltk installed for the Helm dataset",
)

In [12]:
# Approve the worker pool request created above; this prompts for confirmation.
request.approve()

Would you like to proceed? [y/n]: y
Approving request for domain test


In [20]:
# Display the domain's workers to check their state before submitting code.
client.worker

In [21]:
# client.worker.start_workers(n=3)

**WE NEED TO WAIT HERE UNTIL ALL WORKERS ARE "IDLE"**

In [22]:
# Take one snapshot of the workers so the emptiness check and the state check
# look at the same data.
workers = list(client.worker)

# all() over an empty collection is True, so an empty worker list has to be
# rejected explicitly; otherwise this gate would pass before any worker exists.
assert workers and all(
    w.consumer_state.value == "Idle" for w in workers
), "Not ready, wait untill this does not error"

# Syft functions

In [23]:
@sy.syft_function(worker_pool_name=worker_pool_name)
def compute_document_data_overlap(domain, scenario_file, input_files, n):
    """Find scenario instances whose n-grams also occur in the input documents.

    Args:
        domain: Job context. Only ``init_progress`` and ``set_progress`` are
            called on it.
        scenario_file: Object exposing ``file_size`` and
            ``iter_lines(progress=True)``, which yields ``(bytes_read, line)``
            pairs. Every line is a JSON object with a ``scenario_key``
            (containing ``scenario_spec`` and ``split``) and ``instances``,
            each having ``input``, ``references`` and ``id``.
        input_files: Sequence of objects with the same interface; every line
            is a JSON object with a ``text`` field.
        n: Length of the n-grams that are indexed and matched.

    Returns:
        tuple: ``(stats_key_to_input_ids, stats_key_to_reference_ids,
        stats_key_counts)``. The first two map a stats key
        (``"<scenario_spec>_<split>_<n>"``) to the set of instance ids whose
        input / references share at least one n-gram with a document. The
        third maps each stats key to the number of instances of the scenario.
    """
    print("starting overlap computation")

    # stdlib
    from collections import defaultdict
    import json
    import re
    from string import punctuation
    import time

    # third party
    from nltk import ngrams

    # Testing cap: scanning of each input file stops once this line index is
    # reached, so only the first 10000 lines of a file are processed.
    max_lines_per_file = 10000

    # Tokens are separated by any run of whitespace and/or ASCII punctuation.
    token_splitter = re.compile(rf"[\s{re.escape(punctuation)}]+")

    def create_ngram_index(light_scenarios, n_values, stats_key_counts):
        """Map every scenario n-gram to the instances it appears in.

        Returns ``{n: {ngram: {(stats_key, instance_id, part), ...}}}`` where
        ``part`` is ``"input"`` or ``"references"``. Entries are tuples rather
        than "+"-joined strings so that keys or ids containing "+" cannot be
        split incorrectly when they are read back. ``stats_key_counts`` is
        filled in place with the number of instances per stats key.
        """
        ngram_index = {n_value: {} for n_value in n_values}
        for i, scenario in enumerate(light_scenarios):
            if i % 20 == 0:
                print(f"n_gram indexing progress: {(i/len(light_scenarios))*100:.2f}%")
            for n_value in n_values:
                stats_key = scenario["scenario_key"] + "_" + str(n_value)
                stats_key_counts[stats_key] = len(scenario["instances"])
                index_for_n = ngram_index[n_value]
                for instance in scenario["instances"]:
                    instance_id = instance["id"]
                    input_tokens = token_splitter.split(instance["input"].lower())
                    for input_ngram in ngrams(input_tokens, n_value):
                        index_for_n.setdefault(input_ngram, set()).add(
                            (stats_key, instance_id, "input")
                        )

                    # compute reference ngrams
                    for reference in instance["references"]:
                        reference_tokens = token_splitter.split(reference.lower())
                        for reference_ngram in ngrams(reference_tokens, n_value):
                            index_for_n.setdefault(reference_ngram, set()).add(
                                (stats_key, instance_id, "references")
                            )
        return ngram_index

    # SETUP
    print("preparing scenarios and creating indexes")
    start = time.time()
    light_scenarios = []
    for i, (bytes_read, light_scenario_json) in enumerate(
        scenario_file.iter_lines(progress=True)
    ):
        if i % 20 == 0:
            print(
                f"scenario creation progress: {(bytes_read/scenario_file.file_size)*100:.2f}%"
            )

        light_scenario_dict: dict = json.loads(light_scenario_json)

        light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
        scenario_spec = str(light_scenario_key_dict["scenario_spec"])

        light_scenario_key = scenario_spec + "_" + light_scenario_key_dict["split"]
        # Keep only the fields the index needs.
        light_instances = [
            {
                "input": instance_dict["input"],
                "references": instance_dict["references"],
                "id": instance_dict["id"],
            }
            for instance_dict in light_scenario_dict["instances"]
        ]
        light_scenarios.append(
            {"scenario_key": light_scenario_key, "instances": light_instances}
        )
    print(f"Finished creating scenarios ({time.time()-start}s)")

    print("Creating indexes")

    start = time.time()
    stats_key_counts = defaultdict(int)
    ngram_index = create_ngram_index(
        light_scenarios=light_scenarios, n_values=[n], stats_key_counts=stats_key_counts
    )
    print(f"Finished creating indexes ({time.time()-start}s)")

    stats_key_to_input_ids = defaultdict(set)
    stats_key_to_reference_ids = defaultdict(set)
    print("computing overlap")
    start = time.time()

    # Progress is tracked in bytes across ALL input files: the total is the sum
    # of the file sizes and each update adds the bytes of the files already
    # finished. With a single input file this matches the previous behavior.
    # NOTE(review): assumes init_progress takes the total and set_progress the
    # absolute position - confirm against the job context API.
    domain.init_progress(sum(input_file.file_size for input_file in input_files))
    bytes_done = 0

    for input_file in input_files:
        for i, (bytes_read, line) in enumerate(input_file.iter_lines(progress=True)):
            if i % 1000 == 0:
                print(
                    f"computing overlap progress: {(bytes_read / input_file.file_size) * 100:.2f}%"
                )
                domain.set_progress(bytes_done + bytes_read)
            if i == max_lines_per_file:
                break
            document = json.loads(line)["text"]
            document_tokens = token_splitter.split(document.lower())
            for n_value, index_for_n in ngram_index.items():
                for document_ngram in ngrams(document_tokens, n_value):
                    # Unknown n-grams yield no entries at all.
                    for stats_key, instance_id, part in index_for_n.get(
                        document_ngram, ()
                    ):
                        if part == "input":
                            stats_key_to_input_ids[stats_key].add(instance_id)
                        elif part == "references":
                            stats_key_to_reference_ids[stats_key].add(instance_id)
        bytes_done += input_file.file_size
    print(f"Finished computing overlap ({time.time()-start}s)")
    print("done")

    return stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts

In [24]:
# Submit the overlap function to the domain; main_function below launches it
# as a nested job through domain.launch_job.
client.code.submit(compute_document_data_overlap)

In [25]:
@sy.syft_function(worker_pool_name=worker_pool_name)
def aggregate(batch_results):
    """Merge per-batch overlap results into one list of overlap statistics.

    Args:
        batch_results: Iterable of 3-tuples ``(stats_key_to_input_ids,
            stats_key_to_reference_ids, stats_key_counts)``. The first two map
            a stats key to a set of instance ids, the third maps a stats key
            to an instance count. May be empty.

    Returns:
        list[dict]: One entry per stats key, in first-seen order, with the keys
        ``data_overlap_stats_key``, ``num_instances``,
        ``instance_ids_with_overlapping_input`` and
        ``instance_ids_with_overlapping_reference`` (both id lists sorted).
        An empty input yields an empty list.
    """
    # stdlib
    from collections import defaultdict

    print("Starting aggregation")

    total_input_ids = defaultdict(set)
    total_reference_ids = defaultdict(set)
    total_stats_key_counts = defaultdict(int)

    # Walk the batches directly instead of transposing them with
    # zip(*batch_results): the transposed form cannot be unpacked into three
    # values when there are no batches. Ids are merged into sets owned by this
    # function, so the input sets are never modified.
    for input_ids, reference_ids, key_counts in batch_results:
        for key, val in key_counts.items():
            total_stats_key_counts[key] += val
        for key, ids in input_ids.items():
            total_input_ids[key].update(ids)
        for key, ids in reference_ids.items():
            total_reference_ids[key].update(ids)

    all_data_overlap_stats = []
    for stats_key, count in total_stats_key_counts.items():
        # A stats key has the form "<scenario_spec>_<split>_<n>"; splitting from
        # the right keeps underscores inside the scenario spec intact.
        subject, split, n_str = stats_key.rsplit("_", 2)
        all_data_overlap_stats.append(
            {
                "data_overlap_stats_key": {
                    "light_scenario_key": {"scenario_spec": subject, "split": split},
                    "overlap_protocol_spec": {"n": int(n_str)},
                },
                "num_instances": count,
                "instance_ids_with_overlapping_input": sorted(
                    total_input_ids[stats_key]
                ),
                "instance_ids_with_overlapping_reference": sorted(
                    total_reference_ids[stats_key]
                ),
            }
        )
    print("Finished aggregation")
    return all_data_overlap_stats

In [26]:
# Submit the aggregation function to the domain; main_function below launches
# it as its final nested job.
client.code.submit(aggregate)

In [27]:
@sy.syft_function_single_use(
    input_files=helm_train_files,
    scenario_files=helm_test_files,
    worker_pool_name=worker_pool_name,
)
def main_function(domain, input_files, scenario_files):
    """Launch one overlap job per (n, scenario file) pair and aggregate them.

    Args:
        domain: Job context used to launch the nested jobs.
        input_files: Train files handed unchanged to every overlap job.
        scenario_files: Iterable of scenario files; each one is combined with
            every n-gram size.

    Returns:
        The ``result`` attribute of the aggregation job.
    """
    ngram_sizes = [5, 9, 13]

    # n-gram size is the outer loop and scenario file the inner one; the
    # result handle of each job is collected right after the job is launched.
    batch_results = [
        domain.launch_job(
            compute_document_data_overlap,
            scenario_file=scenario_file,
            input_files=input_files,
            n=ngram_size,
        ).result
        for ngram_size in ngram_sizes
        for scenario_file in scenario_files
    ]

    aggregate_job = domain.launch_job(aggregate, batch_results=batch_results)
    print("Finished main function")
    return aggregate_job.result

In [28]:
# Create an execution request for main_function; it is inspected and approved
# in the following cells before the function is run.
client.code.request_code_execution(main_function)

In [37]:
# Inspect the pending request.
# NOTE(review): index 0 assumes the code request is listed first; a worker pool
# request was created earlier in this notebook - confirm the ordering.
client.requests[0]

In [96]:
# Approve the request (prompts for confirmation).
# NOTE(review): approve_nested=True presumably also approves the functions that
# main_function launches as nested jobs - confirm. Index 0 assumes the code
# request is listed first even though a worker pool request was created earlier.
client.requests[0].approve(approve_nested=True)

Would you like to proceed? [y/n]: y
Approving request for domain test


In [59]:
# Start main_function on the domain with the dataset assets as inputs.
# blocking=False returns a job handle immediately instead of waiting for the
# result; the job is inspected in the cells below.
job = client.code.main_function(
    input_files=helm_train_files, scenario_files=helm_test_files, blocking=False
)

# Inspect Jobs and get results

In [45]:
# Display the job summary (id, status, result type and logs).
job

```python
class Job:
    id: UID = bcfb518eaae24d00b7aecbe940e1e5a9
    status: completed
    has_parent: False
    result: syft.service.action.action_data_empty.ActionDataLink
    logs:

0 Finished main function
JOB COMPLETED
    
```

In [61]:
# Display the job's sub-jobs - presumably the jobs main_function started via
# domain.launch_job (overlap computations and the aggregation).
job.subjobs

In [63]:
# Print what the job wrote to its log (the print output of main_function).
job.logs()

Finished main function




# Print result

In [66]:
# Wait for the job's result to become available, then fetch its value.
res = job.result.wait().get()

In [67]:
# Pretty-print the aggregated overlap statistics returned by the job.
pprint(res)

[{'data_overlap_stats_key': {'light_scenario_key': {'scenario_spec': "{'class_name': "
                                                                     "'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', "
                                                                     "'args': "
                                                                     "{'subject': "
                                                                     "'philosophy'}}",
                                                    'split': 'train'},
                             'overlap_protocol_spec': {'n': 5}},
  'instance_ids_with_overlapping_input': [],
  'instance_ids_with_overlapping_reference': [],
  'num_instances': 5},
 {'data_overlap_stats_key': {'light_scenario_key': {'scenario_spec': "{'class_name': "
                                                                     "'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', "
                                                                    