# Chunk Size Microbenchmark

In [1]:
# Move one directory up, so we are in the repository root
import os

if not 'CHDIR_EXECUTED' in globals():  # Keep chdir idempotent, don't keep going up
    os.chdir('..')
    CHDIR_EXECUTED = True
    
!basename "$(pwd)"

ndn-compute


In [2]:
import time
from functools import wraps
from typing import Callable, TypeVar, Any

T = TypeVar('T')  # Return type of the function being timed


def time_function(func: Callable[..., T]) -> Callable[..., tuple[T, float]]:
    """Decorator that times every call to ``func``.

    The wrapped function prints ``"<name> executed in <seconds> seconds"`` and
    returns a ``(result, elapsed_seconds)`` tuple instead of the bare result.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name/docstring as ``func`` that returns
        ``(func(*args, **kwargs), elapsed_seconds)``.

    Note:
        If ``func`` raises, the exception propagates and nothing is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs) -> tuple[T, float]:
        # perf_counter() is monotonic and high-resolution; time.time() is a
        # wall clock and can jump (NTP, manual changes), skewing benchmarks.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time

        print(f"{func.__name__} executed in {elapsed:.6f} seconds")
        return result, elapsed

    return wrapper

import contextlib
import sys

class DummyFile(object):
    """Minimal write-only file-like object that discards everything written to it."""

    def write(self, x):
        """Accept ``x`` and drop it."""
        pass

    def flush(self):
        """No-op, so callers such as ``print(..., flush=True)`` do not fail."""
        pass


# Credit: https://stackoverflow.com/a/2829036
@contextlib.contextmanager
def no_stdout():
    """Context manager that silences ``sys.stdout`` for the duration of the block.

    ``sys.stdout`` is swapped for a ``DummyFile`` on entry and the previous
    stream is put back on exit, including when the block raises.
    """
    save_stdout = sys.stdout
    sys.stdout = DummyFile()
    try:
        yield
    finally:
        # Without the finally, an exception inside the ``with`` body would
        # leave stdout pointing at the dummy and hide all later output.
        sys.stdout = save_stdout

In [3]:
from ndn_compute_fs_creator import create_fs_from_directory
from ndn_compute_cluster_manager import run_ndn_compute_cluster, stop_ndn_compute_cluster, restart_ndn_compute_cluster
from ndn_compute_client import NdnComputeClient

In [4]:
@time_function
def do_compute(client):
    """Run the benchmark query through ``client``.

    Creates a dataset from ``appA/events.log.jsonl``, filters it down to rows
    whose event_type is 'purchase', device is 'tablet' and browser is 'safari',
    and collects the result. The collected rows are discarded, so the function
    body itself returns None; the ``time_function`` decorator then hands the
    caller ``(None, elapsed_seconds)``.
    """
    is_safari_tablet_purchase = lambda row: row['event_type'] == 'purchase' and row['device'] == 'tablet' and row['browser'] == 'safari'
    events = client.create_dataset("appA/events.log.jsonl")
    matching = events.filter(is_safari_tablet_purchase)
    matching.collect()

In [5]:
# initial 
!rm -rf generated_data/distributed
!mkdir -p generated_data/distributed
create_fs_from_directory(in_dir="generated_data/flat",
                         out_dir="generated_data/distributed",
                         num_partitions=16,
                         num_copies=2,
                         chunk_size=64
                         )

with no_stdout():
        run_ndn_compute_cluster(num_workers=16, rebuild=False)

generated_data/flat/appA/events.log.jsonl


In [6]:
compute_times = list()

for chunk_size in range(10, 110, 10):
    !rm -rf generated_data/distributed
    !mkdir -p generated_data/distributed
    create_fs_from_directory(in_dir="generated_data/flat",
                             out_dir="generated_data/distributed",
                             num_partitions=16,
                             num_copies=2,
                             chunk_size=chunk_size
                             )
    
    restart_ndn_compute_cluster()
    
    client = NdnComputeClient('http://localhost:5214')
    result = do_compute(client)
    compute_times.append(result[1])

generated_data/flat/appA/events.log.jsonl
Restarting NDN compute cluster...
Restarting NFD container...
Restarting driver container...
Error restarting driver container: 500 Server Error for http+docker://localhost/v1.48/containers/755245ceea6adee5d46b72a0f4a41754a55a4977c373c35bef8c1baf309e2171/restart?t=1: Internal Server Error ("Cannot restart container 755245ceea6adee5d46b72a0f4a41754a55a4977c373c35bef8c1baf309e2171: error while creating mount source path '/host_mnt/Users/jacob/Documents/2025 Winter/COM SCI 214/ndn-compute/generated_data/distributed/manifest': mkdir /host_mnt/Users/jacob/Documents/2025 Winter/COM SCI 214/ndn-compute/generated_data/distributed/manifest: no such file or directory")
Restarting worker1 container...
Restarting worker2 container...
Restarting worker3 container...
Restarting worker4 container...
Restarting worker5 container...
Restarting worker6 container...
Restarting worker7 container...
Restarting worker8 container...
Restarting worker9 container...
Re

In [7]:
# Elapsed seconds of each do_compute run, in loop order (chunk_size = 10, 20, ..., 100).
compute_times

[3.3878188133239746,
 2.7990610599517822,
 3.530658006668091,
 4.5014448165893555,
 5.433221101760864,
 4.67586088180542,
 4.7270917892456055,
 4.843189001083374,
 4.905077934265137,
 9.491292953491211]