# Data Availability (Replication) Microbenchmark

In [1]:
# Move one directory up, so we are in the repository root
import os

if not 'CHDIR_EXECUTED' in globals():  # Keep chdir idempotent, don't keep going up
    os.chdir('..')
    CHDIR_EXECUTED = True
    
!basename "$(pwd)"

ndn-compute


In [2]:
import time
from functools import wraps
from typing import Callable, TypeVar, Any

T = TypeVar('T')

def time_function(func: Callable[..., T]) -> Callable[..., tuple[T, float]]:
    """Decorator that times every call to ``func``.

    The wrapped callable prints a one-line timing report and returns a
    ``(result, elapsed_seconds)`` tuple instead of the bare result.

    Args:
        func: The callable to measure.

    Returns:
        A wrapper with the same name/docstring as ``func`` (via ``wraps``)
        that returns ``(func(*args, **kwargs), elapsed_seconds)``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs) -> tuple[T, float]:
        # perf_counter() is monotonic and high-resolution; time.time() is a
        # wall clock that can jump (NTP sync, manual change) mid-measurement.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time

        print(f"{func.__name__} executed in {elapsed:.6f} seconds")
        return result, elapsed

    return wrapper

import contextlib
import sys

class DummyFile(object):
    """Minimal write-only sink that silently discards everything it is given.

    Intended as a stand-in for ``sys.stdout``. Only the methods that callers
    of a text stream commonly invoke are provided.
    """

    def write(self, x):
        """Discard ``x``; returns None."""
        pass

    def flush(self):
        """No-op, so ``print(..., flush=True)`` and ``sys.stdout.flush()`` do not raise."""
        pass

# Credit: https://stackoverflow.com/a/2829036
@contextlib.contextmanager
def no_stdout():
    """Context manager that swaps ``sys.stdout`` for a ``DummyFile`` inside the ``with`` body.

    The previous ``sys.stdout`` object is put back on exit, including when
    the body raises.
    """
    save_stdout = sys.stdout
    sys.stdout = DummyFile()
    try:
        yield
    finally:
        # Without the finally, an exception in the body would leave
        # sys.stdout pointing at the dummy sink for the rest of the session.
        sys.stdout = save_stdout

In [3]:
from ndn_compute_fs_creator import create_fs_from_directory
from ndn_compute_cluster_manager import run_ndn_compute_cluster, stop_ndn_compute_cluster
from ndn_compute_client import NdnComputeClient

In [4]:
@time_function
def do_compute(client):
    """Run the benchmark query once against ``appA/events.log.jsonl``.

    Creates a dataset through ``client``, filters it down to rows whose
    ``event_type`` is ``'purchase'``, ``device`` is ``'tablet'`` and
    ``browser`` is ``'safari'``, and calls ``collect()`` on the result.

    The collected value is discarded, so the undecorated function returns
    None; because of ``@time_function`` a call yields
    ``(None, elapsed_seconds)`` and prints a timing line.

    Args:
        client: Object exposing ``create_dataset(path)``; the notebook passes
            an ``NdnComputeClient``.
    """
    dataset = client.create_dataset("appA/events.log.jsonl")
    # Row predicate; rows are indexed by string keys.
    # NOTE(review): presumably the client ships this lambda to the workers — confirm
    # before restructuring it, since serialization may depend on its exact form.
    pred = lambda row: row['event_type'] == 'purchase' and row['device'] == 'tablet' and row['browser'] == 'safari'
    dataset.filter(pred).collect()

In [None]:
# Elapsed seconds reported for do_compute, one entry per replication factor
# (index 0 corresponds to num_copies=1).
compute_times = list()

# Sweep the replication factor from 1 to 8 copies; the distributed data and
# the cluster are rebuilt from scratch on every iteration.
for num_copies in range(1, 8+1):
    # Start from an empty output directory so nothing from the previous iteration remains.
    !rm -rf generated_data/distributed
    !mkdir -p generated_data/distributed
    # Only num_copies varies across iterations; partitions and chunk size stay fixed.
    create_fs_from_directory(in_dir="generated_data/flat",
                             out_dir="generated_data/distributed",
                             num_partitions=16,
                             num_copies=num_copies,
                             chunk_size=32
                             )
    
    # Workaround: Docker Desktop misbehaves when mounting (root cause unknown),
    # so it is restarted on every iteration before the containers are created.
    print("Restarting Docker...")
    !docker desktop restart > /dev/null
    
    print("Creating containers...")
    # Swap out sys.stdout while the cluster starts so its prints stay out of the cell output.
    # NOTE(review): stop_ndn_compute_cluster is imported but never called in this loop —
    # confirm the Docker restart is what tears down the previous iteration's cluster.
    with no_stdout():
        run_ndn_compute_cluster(num_workers=16, rebuild=False)
    
    client = NdnComputeClient('http://localhost:5214')
    # do_compute is wrapped by time_function, so it returns (result, elapsed_seconds);
    # only the elapsed time is kept. Each configuration is measured once.
    result = do_compute(client)
    compute_times.append(result[1])

generated_data/flat/appA/events.log.jsonl
Restarting Docker...
Creating containers...
do_compute executed in 4.465856 seconds
generated_data/flat/appA/events.log.jsonl
Restarting Docker...
Creating containers...
do_compute executed in 5.723357 seconds
generated_data/flat/appA/events.log.jsonl
Restarting Docker...
Creating containers...
do_compute executed in 3.514460 seconds
generated_data/flat/appA/events.log.jsonl
Restarting Docker...
Creating containers...
do_compute executed in 3.392266 seconds
generated_data/flat/appA/events.log.jsonl
Restarting Docker...
Creating containers...
