In [1]:
import syft as sy
import os
from syft import ActionObject
from collections import defaultdict



Before running this notebook, start the domain with the following:

`docker compose --profile blob-storage --file docker-compose.multinode.yml --file docker-compose.dev.yml up`

`hagrid <standard stuff here> --dev`

In [2]:
# Log in to the domain served at localhost:8080.
# NOTE(review): the credentials are hardcoded; presumably these are the default
# development account — confirm they are never used against a shared deployment.
client = sy.login(
    url="http://localhost:8080",
    email="info@openmined.org",
    password="changethis",
)

Logged into <test: High side Domain> as <info@openmined.org>


# Mount the Helm Azure storage container

In [3]:
# Mount the Azure container under the bucket name used by the cells below.
# The account key is read from the environment so it is not stored in the notebook;
# a missing HELM_STORAGE_ACCOUNT_KEY raises KeyError here.
client.api.services.blob_storage.mount_azure(
    account_name="helmprojectstorage",
    container_name="helm",
    account_key=os.environ["HELM_STORAGE_ACCOUNT_KEY"],
    bucket_name="helmazurebucket",
)

In [4]:
# Fetch the file listing of the bucket mounted above.
blob_files = client.api.services.blob_storage.get_files_from_bucket(
    bucket_name="helmazurebucket",
)

In [5]:
# Display the file listing returned by get_files_from_bucket.
blob_files

# Start workers

In [11]:
# Start 6 workers on the domain (presumably these execute the jobs launched later — confirm).
# NOTE(review): the "46 in total" figure cannot be derived from this notebook.
client.worker.start_workers(n=6) # 46 in total

In [13]:
# Display the domain's workers.
client.worker.list()

# Create Dataset

In [12]:
# Split the bucket listing by file name: every file whose name contains "train-"
# is training input, and the first file whose name contains "scenario_data" is
# the scenario (test) file.
train_files = [f for f in blob_files if "train-" in f.file_name]

scenario_file = next((f for f in blob_files if "scenario_data" in f.file_name), None)
if scenario_file is None:
    # Same exception type as the former `[...][0]` lookup, but with a message
    # that says what is actually missing.
    raise IndexError("no file with 'scenario_data' in its name was found in blob_files")

In [14]:
# Each asset wraps the blob file handle(s) as its private data; the mocks are
# empty ActionObjects.
train_asset = sy.Asset(
    name="helm train data",
    data=ActionObject.from_obj(train_files),
    mock=ActionObject.empty(),
)
test_asset = sy.Asset(
    name="helm test data",
    data=ActionObject.from_obj(scenario_file),
    mock=ActionObject.empty(),
)

helm_dataset = sy.Dataset(
    name="Helm Dataset",
    asset_list=[train_asset, test_asset],
)

In [15]:
# Upload the dataset to the domain; the call prompts for confirmation (see output below).
client.upload_dataset(helm_dataset)

Would you like to proceed? [y/n]: y


  0%|                                                                                                                         | 0/2 [00:00<?, ?it/s]

Uploading: helm train data


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.37s/it]


Uploading: helm test data


In [16]:
# Look the uploaded dataset up by name and grab its two assets by name.
# These asset handles are what main_function is bound to and later called with.
helm_ds = client.datasets["Helm Dataset"]
helm_train_files = helm_ds.assets["helm train data"]
helm_test_file = helm_ds.assets["helm test data"]

In [17]:
# Display the training-data asset fetched from the domain.
helm_train_files

# Syft functions

In [18]:
@sy.syft_function()
def compute_document_data_overlap(domain, input_file):
    """Stream ``input_file`` line by line and report read progress to the domain.

    The lines themselves are read but not analysed; the function only logs
    progress and timing, so it currently acts as a placeholder for the real
    overlap computation.

    Args:
        domain: Execution context exposing ``init_progress`` and ``set_progress``.
        input_file: File handle exposing ``file_name``, ``file_size`` and
            ``iter_lines(progress=True, chunk_size=...)``, which yields
            ``(bytes_read, line)`` pairs.

    Returns:
        None.

    Note:
        The printed lines are parsed positionally by the log-summary cells of
        this notebook (file name on the second line), so their order and
        wording must stay stable.
    """
    import time

    chunk_size = 1024 * 1000 * 50  # bytes requested per read from iter_lines
    report_every = 1000  # log and publish progress once per this many lines

    print("starting overlap computation")
    print(input_file.file_name)
    print("computing overlap")
    start = time.time()

    total_bytes = input_file.file_size
    domain.init_progress(total_bytes)

    line_iter = input_file.iter_lines(progress=True, chunk_size=chunk_size)
    for i, (bytes_read, _line) in enumerate(line_iter):
        if i % report_every == 0:
            print(f"computing overlap progress: {(bytes_read / total_bytes) * 100:.2f}%")
            domain.set_progress(bytes_read)

    # Progress is only published every `report_every` lines, so without this
    # final update a finished job could be left showing less than 100%.
    domain.set_progress(total_bytes)

    print(f"Finished computing overlap ({time.time()-start}s)")
    print("done")

    return None

In [19]:
# Submit the per-file function to the domain; main_function launches it as a job.
client.code.submit(compute_document_data_overlap)

In [20]:
@sy.syft_function_single_use(input_files=helm_train_files, scenario_file=helm_test_file)
def main_function(domain, input_files, scenario_file):
    """Launch one ``compute_document_data_overlap`` job per (n, input file) pair.

    Jobs are launched in two waves: the first ``first_wave_size`` files, then,
    once the very first job has finished, the remaining files.

    Args:
        domain: Execution context exposing ``launch_job``.
        input_files: Sliceable sequence of input file handles.
        scenario_file: Accepted for interface compatibility; not used yet.

    Returns:
        None.
    """
    # Each file is launched once per entry of N. The value of n is not passed
    # to the job yet, so N currently only controls how many jobs run per file.
    N = [5, 9, 13]
    first_wave_size = 15
    jobs = []

    wave_slices = (slice(None, first_wave_size), slice(first_wave_size, None))
    for wave_index, wave_slice in enumerate(wave_slices):
        for n in N:
            for input_file in input_files[wave_slice]:
                # Keep the launch as its own assignment statement, exactly as
                # in the original launch sites.
                batch_job = domain.launch_job(
                    compute_document_data_overlap,
                    input_file=input_file,
                )
                jobs.append(batch_job)

        # Between the waves, wait for the first job. Guard against an empty
        # input list, which previously raised IndexError on jobs[0].
        if wave_index == 0 and jobs:
            jobs[0].wait()

    return None


In [21]:
# Request permission to execute main_function on the domain and display the request.
req = client.code.request_code_execution(main_function)
req

In [22]:
# Re-fetch the request from the domain's request list and display it.
# NOTE(review): index 0 assumes the request created above is the first entry —
# confirm on a domain that already holds older requests.
req = client.requests[0]
req

In [23]:
# Approve the request; the call prompts for confirmation (see output below).
# approve_nested=True presumably also approves the code launched from within
# main_function — confirm against the syft documentation.
req.approve(approve_nested=True)

Would you like to proceed? [y/n]: y
Request approved for domain test


In [24]:
# Display the code objects available through this client.
client.code

In [25]:
# Run main_function on the domain without blocking the notebook; the returned
# job handle is inspected in the cells below.
job = client.code.main_function(
    input_files=helm_train_files,
    scenario_file=helm_test_file,
    blocking=False,
)

In [5]:
# Rebind `job` to an existing job by its position in the client's job list.
# NOTE(review): this overwrites the handle returned by client.code.main_function(...)
# and the index 79 is specific to one deployment — presumably meant for
# re-attaching after a kernel restart; skip or adjust it before a Run All.
job = client.jobs[79]

# Inspect Jobs and get results

In [26]:
# Display the job (id, status, result and logs, as shown in the output below).
job

```python
class Job:
    id: UID = e7e35e1f3aa54e93a64a220e708453b7
    status: JobStatus.PROCESSING
    has_parent: False
    result: ActionDataEmpty <None>
    logs:

0 
    
```

In [22]:
# Display the subjobs of the main job.
job.subjobs

In [21]:
# Summarise subjob logs: one output line per input file, listing the latest
# status fragment of every subjob that processed that file.
#
# The logs are parsed positionally: the second line is the file name printed by
# compute_document_data_overlap, and the text after the last ':' of the
# third-from-last line is the status fragment (it reads "done" for finished
# jobs in the recorded output).
lines_to_log = {}
for subjob in job.subjobs:
    if subjob.status.value not in ('processing', 'completed'):
        continue
    log_lines = subjob.logs(_print=False).split('\n')
    if len(log_lines) < 3:
        # Too little output to parse yet (e.g. the job has only just started);
        # indexing below would raise IndexError.
        continue
    file_name = log_lines[1]
    status_fragment = log_lines[-3].split(':')[-1]
    lines_to_log.setdefault(file_name, []).append(status_fragment)

list_lines = [
    file_name + ':' + ' '.join(fragments)
    for file_name, fragments in lines_to_log.items()
]
# Order by the first 9 characters, i.e. the "train-NN." prefix of the file names.
list_lines.sort(key=lambda x: x[:9])
print("\n".join(list_lines))

# Keep a reference to this summary under a second name.
old_lines_to_log = lines_to_log

train-00.jsonl:done done done
train-01.jsonl:done done done
train-02.jsonl:done done done
train-03.jsonl:done done done
train-04.jsonl:done done done
train-05.jsonl:done done done
train-06.jsonl:done done done
train-07.jsonl:done done done
train-08.jsonl:done done done
train-09.jsonl:done done done
train-10.jsonl:done done done
train-11.jsonl:done done done
train-12.jsonl:done done done
train-13.jsonl:done done done
train-14.jsonl:done done done
train-15.jsonl:done done done
train-16.jsonl:done done done
train-17.jsonl:done done done
train-18.jsonl:done done done
train-19.jsonl:done done done
train-20.jsonl:done done done
train-21.jsonl:done done done
train-22.jsonl:done done done
train-23.jsonl:done done done
train-24.jsonl:done done done
train-25.jsonl:done done done
train-26.jsonl:done done done
train-27.jsonl:done done done
train-28.jsonl:done done done
train-29.jsonl:done done done


In [None]:
# Summarise the logs of every subjob (no status filter): one output line per
# input file, with the status fragments of its subjobs joined by commas. Each
# fragment is followed by a comma, so every line ends with a trailing ",".
#
# The logs are parsed positionally: the second line is the file name and the
# text after the last ':' of the third-from-last line is the status fragment.
lines_to_log = {}
for subjob in job.subjobs:
    log_lines = subjob.logs(_print=False).split('\n')
    if len(log_lines) < 3:
        # Not enough output to parse; indexing below would raise IndexError.
        continue
    file_name = log_lines[1]
    status_fragment = log_lines[-3].split(':')[-1]
    lines_to_log[file_name] = lines_to_log.get(file_name, '') + status_fragment + ","

list_lines = [file_name + ':' + fragments for file_name, fragments in lines_to_log.items()]
# Order by the first 9 characters, i.e. the "train-NN." prefix of the file names.
list_lines.sort(key=lambda x: x[:9])
print("\n".join(list_lines))
    