# Getting Started with the NDN Distributed Processing Engine

Note: We recommend running this notebook inside a venv or Conda environment, because the cells below install Python packages.

### Get Dependencies

In [1]:
# Move one directory up, so we are in the repository root
import os

# Guard the chdir with a kernel-global flag so that re-running this cell
# stays idempotent and does not keep climbing up the directory tree.
if 'CHDIR_EXECUTED' not in globals():
    os.chdir('..')
    CHDIR_EXECUTED = True

# Print the name of the current directory; it should be the repository root.
print(os.path.basename(os.getcwd()))

ndn-compute


In [2]:
# Get submodules
# `--init --recursive` also initializes and updates any nested submodules.
!git submodule update --init --recursive

Note: Installing the local packages should automatically install their pip dependencies, such as `python-ndn`.

In [3]:
# Install packages
# Loops over every directory under ./pkg and pip-installs it; --find-links=./pkg
# lets pip also look inside ./pkg when resolving requirements.
# NOTE(review): this runs whichever `pip` is first on the shell's PATH — confirm
# it belongs to the same environment as the notebook kernel.
!bash -c 'for dir in ./pkg/*/; do [ -d "$dir" ] && pip install --find-links=./pkg "$dir"; done'

Looking in links: ./pkg
Processing ./pkg/ndn_compute
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: ndn-compute
  Building wheel for ndn-compute (setup.py) ... [?25ldone
[?25h  Created wheel for ndn-compute: filename=ndn_compute-0.1-py3-none-any.whl size=25463 sha256=13f54c93bca3f5b29f5ffdc55808e4a5893b6620f5fc3d0130af9897f26ed90a
  Stored in directory: /private/var/folders/6j/882zygc542d8hw9xtj9w62c80000gn/T/pip-ephem-wheel-cache-qg6eqwf7/wheels/8a/37/88/91a343e9492df6e624dec040fe83576d55a129326ed5056476
Successfully built ndn-compute
Installing collected packages: ndn-compute
  Attempting uninstall: ndn-compute
    Found existing installation: ndn-compute 0.1
    Not uninstalling ndn-compute at /Users/jacob/Documents/2025 Winter/COM SCI 214/ndn-compute/pkg/ndn_compute, outside environment /Users/jacob/Documents/2025 Winter/COM SCI 214/ndn-compute/venv
    Can't uninstall 'ndn-compute'. No files were found to uninstall.
Successful

### Cluster Configuration

In [4]:
# Number of worker containers to start; also passed as the partition count
# when the toy distributed filesystem is created.
NUM_WORKERS = 3
# Passed as `num_copies` when the toy distributed filesystem is created.
REPLICATION_FACTOR = 2

### Security Setup

In [5]:
import os
import shutil
from ndn_compute_key_creator import create_keys

# The key material counts as complete only when all three subdirectories exist.
if not all(os.path.isdir(f'sec_data/{subdir}') for subdir in ('certs', 'driver', 'worker')):
    # Throw away any partial state first so the keys are regenerated from scratch.
    if os.path.isdir('sec_data'):
        shutil.rmtree('sec_data')

    create_keys('sec_data/')

### Generate Data

In [6]:
from ndn_compute_jsonl_generator import generate_large_jsonl
from ndn_compute_fs_creator import create_fs_from_directory

In [7]:
import os
import shutil

# Start from a clean slate so files from an earlier run cannot linger.
# Pure-Python equivalent of `rm -rf`, so the cell does not depend on a POSIX shell.
shutil.rmtree('generated_data', ignore_errors=True)

# Generate flat files: one JSONL event log per application.
# Maps application name -> approximate size of its log in MB.
app_log_sizes_mb = {'appA': 200, 'appB': 500}

for app_name, size_mb in app_log_sizes_mb.items():
    app_dir = f'generated_data/flat/{app_name}'
    os.makedirs(app_dir, exist_ok=True)  # equivalent of `mkdir -p`
    generate_large_jsonl(filename=f'{app_dir}/events.log.jsonl', target_size_mb=size_mb)

Generating JSONL file of approximately 200MB...
Progress: 22.76% complete
Records written: 100,000
Current file size: 45.51MB
Progress: 45.51% complete
Records written: 200,000
Current file size: 91.02MB
Progress: 68.27% complete
Records written: 300,000
Current file size: 136.54MB
Progress: 91.03% complete
Records written: 400,000
Current file size: 182.06MB

File generation complete!
Final file size: 200.00MB
Total records written: 439,417
Generating JSONL file of approximately 500MB...
Progress: 9.10% complete
Records written: 100,000
Current file size: 45.51MB
Progress: 18.21% complete
Records written: 200,000
Current file size: 91.03MB
Progress: 27.31% complete
Records written: 300,000
Current file size: 136.55MB
Progress: 36.41% complete
Records written: 400,000
Current file size: 182.06MB
Progress: 45.51% complete
Records written: 500,000
Current file size: 227.57MB
Progress: 54.62% complete
Records written: 600,000
Current file size: 273.10MB
Progress: 63.72% complete
Records w

In [8]:
# Distribute files into a toy distributed filesystem

!mkdir -p generated_data/distributed
# Reads the flat files generated above and writes the result under
# generated_data/distributed, using one partition per worker.
# NOTE(review): the unit of chunk_size=64 is not visible from this notebook — confirm
# against ndn_compute_fs_creator.
create_fs_from_directory(in_dir="generated_data/flat",
                         out_dir="generated_data/distributed",
                         num_partitions=NUM_WORKERS,
                         num_copies=REPLICATION_FACTOR,
                         chunk_size=64
                         )

generated_data/flat/appB/events.log.jsonl
generated_data/flat/appA/events.log.jsonl


### Starting the cluster

In [9]:
import time

from ndn_compute_cluster_manager import (
    run_ndn_compute_cluster,
    stop_ndn_compute_cluster,
)

# Bring up the cluster with one worker per configured NUM_WORKERS.
# rebuild=True is passed on every run of this cell.
run_ndn_compute_cluster(num_workers=NUM_WORKERS, rebuild=True)

# Remember (in whole seconds) when startup was triggered, so a later cell
# can wait for the containers to settle.
cluster_start_time = int(time.time())

Building images...
Building driver image: ndn-compute/driver:latest
Building worker image: ndn-compute/worker:latest
Creating network...
Network ndn_compute_net already exists
Starting containers...
Starting NFD container...
Found existing NFD container, removing...
NFD container started with ID: 8cae49c76195b50b7d745f48dee52ef7b51de1e9645b2aeeeb594c1302679806
Starting driver container...
Found existing driver container, removing...
Driver container started with ID: bafb75f847389288c08f6c48994bb0a2d9c23c664fd535db0ca7711a144a6cbd
Starting worker container: worker1 with IP 192.168.1.20...
Found existing worker1 container, removing...
Worker container worker1 started with ID: 2201cb3a5aa8ebc904e69076b68355824f3cfa250af5e953f584a85bf7bf52e1
Starting worker container: worker2 with IP 192.168.1.21...
Found existing worker2 container, removing...
Worker container worker2 started with ID: f0ef1c8c65df910669c5120789cb8462e99686f383e6fdf10bd1b6f75bf30367
Starting worker container: worker3 with 

In [None]:
# The containers need time to start up and stabilize.
# Grace period in seconds, measured from cluster_start_time; raise it if the
# containers need longer on your machine.
STARTUP_GRACE_SECONDS = 10

ready_time = cluster_start_time + STARTUP_GRACE_SECONDS
current_time = int(time.time())
if current_time < ready_time:
    # Sleep only for the part of the grace period that has not already elapsed.
    time.sleep(ready_time - current_time)

In [10]:
# Make sure your cluster is running
# Lists the running containers; expect one NFD, one driver, and NUM_WORKERS workers.
!docker ps

CONTAINER ID   IMAGE                       COMMAND                  CREATED         STATUS                  PORTS                          NAMES
f35a91ba9bfc   ndn-compute/worker:latest   "python -m ndn_compu…"   1 second ago    Up Less than a second                                  worker3
f0ef1c8c65df   ndn-compute/worker:latest   "python -m ndn_compu…"   1 second ago    Up Less than a second                                  worker2
2201cb3a5aa8   ndn-compute/worker:latest   "python -m ndn_compu…"   1 second ago    Up 1 second                                            worker1
bafb75f84738   ndn-compute/driver:latest   "python -m ndn_compu…"   2 seconds ago   Up 1 second             0.0.0.0:5214->5214/tcp         driver1
8cae49c76195   ndn-compute/nfd:latest      "/usr/bin/nfd --conf…"   2 seconds ago   Up 1 second             6363/tcp, 9696/tcp, 6363/udp   nfd1


IMPORTANT: You should see one driver, one NFD, and all of your worker containers listed with an `Up` status.

### Using the engine

In [11]:
from ndn_compute_client import NdnComputeClient

In [13]:
# Connect to the driver over HTTP; 5214 is the port the driver container
# publishes on the host (see the `docker ps` output).
client = NdnComputeClient('http://localhost:5214')

#### Component Sanity Checks

In [14]:
# Trivial test to make sure driver and worker are talking to each other
# The cell should display 17.
client.add(8, 9)

17

In [15]:
# Test to make sure result store and large transfers are working
# The CRC-32 of the received bytes is printed so it can be compared by hand
# with the value logged on the worker side.
import zlib
random_bytes = client.urandom() # This may take a while
print(zlib.crc32(random_bytes)) # Does it match what was computed worker-side? (See docker logs)

3256280040


#### Run Distributed Computations on the Example Dataset

In [16]:
# Create dataset from remote file
# NOTE(review): the path presumably mirrors the layout under generated_data/flat
# (appB/events.log.jsonl) — confirm.
dataset = client.create_dataset("appB/events.log.jsonl")

In [17]:
# Do a transformation, like filter
# The predicate keeps only rows that are purchase events made on a tablet with
# the safari browser.
pred = lambda row: row['event_type'] == 'purchase' and row['device'] == 'tablet' and row['browser'] == 'safari'
# NOTE(review): collect() presumably triggers the computation and returns the
# matching rows to the notebook — confirm against the client library.
ipad_purchases = dataset.filter(pred).collect()

In [18]:
# Preview the first few rows of the filtered result
ipad_purchases.head()

Unnamed: 0,id,timestamp,user_id,event_type,device,browser,location,session_duration,metadata
0,8z87q3HkhxAL4hrK,2024-05-01 11:29:19.802,nTDwhXs2,purchase,tablet,safari,"{'country': 'FR', 'city': 'Sydney', 'latitude'...",927,"{'platform_version': '3.2.5', 'user_agent': 'h..."
1,y1KqAlvaK6MVlHIh,2024-08-20 11:29:19.802,5JSSq8DX,purchase,tablet,safari,"{'country': 'JP', 'city': 'Sydney', 'latitude'...",1432,"{'platform_version': '10.5.5', 'user_agent': '..."
2,uMkTgxEKx2chFWLO,2024-09-04 11:29:19.804,yVBAZzYy,purchase,tablet,safari,"{'country': 'JP', 'city': 'Tokyo', 'latitude':...",3593,"{'platform_version': '6.8.6', 'user_agent': 'a..."
3,Rlhq2QGC6WyA60Yt,2024-11-29 11:29:19.804,hlOiPyxl,purchase,tablet,safari,"{'country': 'DE', 'city': 'Paris', 'latitude':...",1482,"{'platform_version': '9.1.0', 'user_agent': 'z..."
4,6TwLcnkNdRB6keLL,2024-03-19 11:29:19.805,sSlW7w15,purchase,tablet,safari,"{'country': 'DE', 'city': 'Sydney', 'latitude'...",1356,"{'platform_version': '8.6.7', 'user_agent': 'G..."


In [19]:
# Stress test: chain two map/cache stages over the appA dataset.
# Note that `dataset` is rebound here from the appB file to the appA file.
dataset = client.create_dataset("appA/events.log.jsonl")
# Transform data
# NOTE(review): despite the name `id_lens`, the collected output suggests the
# function is applied to every field value (each column comes back as a string
# length), not just to the `id` column — confirm.
id_lens = dataset.map(lambda r: len(str(r)))
id_lens_cached = id_lens.cache() # Cache it to materialize results

# Transform data again
id_lens_plus_one = id_lens_cached.map(lambda r: r + 1)
id_lens_plus_one_cached = id_lens_plus_one.cache() # Cache it to materialize results, but using previous cache

In [20]:
# Collect the result of the second cached stage into `df`
df = id_lens_plus_one_cached.collect()

In [21]:
# The row count should equal the number of records generated for appA
# (439,417 in the recorded run).
df.shape

(439417, 9)

In [22]:
# Preview the result; in the recorded run each value looks like the string
# length of the original field plus one (e.g. a 16-character id gives 17).
df.head()

Unnamed: 0,id,timestamp,user_id,event_type,device,browser,location,session_duration,metadata
0,17,27,9,7,8,7,101,5,166
1,17,27,9,7,7,8,100,5,166
2,17,27,9,9,7,8,99,4,166
3,17,27,9,6,7,7,103,5,166
4,17,27,9,6,7,8,102,5,166


### Cleanup

In [23]:
# Stop and remove the NFD, driver, and worker containers, then remove the
# Docker network that was created for the cluster.
stop_ndn_compute_cluster()

Stopping NDN compute cluster...
Stopping NFD container...
NFD container stopped and removed.
Stopping driver container...
Driver container stopped and removed.
Stopping worker1 container...
worker1 container stopped and removed.
Stopping worker2 container...
worker2 container stopped and removed.
Stopping worker3 container...
worker3 container stopped and removed.
worker4 container not found.
Removing network ndn_compute_net...
Network ndn_compute_net removed.
NDN compute cluster shutdown complete. Stopped 3 workers.
