# Getting Started 
with the NDN Distributed Processing Engine

Note: run this notebook inside a virtual environment (venv or Conda) so that the packages installed below stay isolated from your system Python.

### Get Dependencies

In [1]:
# Get submodules: check out the git submodules of this repository, initializing any
# that have not been set up yet and recursing into nested submodules.
!git submodule update --init --recursive

Note: installing the local packages below should also pull in their pip dependencies (for example `python-ndn`) automatically.

In [1]:
# Install packages
!bash -c 'cd ..; for dir in ./pkg/*/; do [ -d "$dir" ] && pip install --find-links=./pkg "$dir"; done'

Looking in links: ./pkg
Processing ./pkg/ndn_compute
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: ndn-compute
  Building wheel for ndn-compute (pyproject.toml) ... [?25ldone
[?25h  Created wheel for ndn-compute: filename=ndn_compute-0.1-py3-none-any.whl size=24835 sha256=4d5d935617194625f3949fb31515f2cc6564869cd2b5df39b383b36acf26d1d3
  Stored in directory: /private/var/folders/x8/vn51j39n4ggf0j7pk0zk6vjh0000gn/T/pip-ephem-wheel-cache-yk39yz7a/wheels/a5/b5/9c/2f295dc3901a3c993c666f8c266e58a95f2800ba25612473db
Successfully built ndn-compute
Installing collected packages: ndn-compute
  Attempting uninstall: ndn-compute
    Found existing installation: ndn-compute 0.1
    Uninstalling ndn-compute-0.1:
      Successfully uninstalled ndn-compute-0.1
Successfully installed ndn-compute-0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m

### Security Setup

In [7]:
import os
import shutil
from ndn_compute_key_creator import create_keys

# Key material is considered complete only when every expected sub-directory exists.
required_subdirs = ('certs', 'driver', 'worker')
keys_ready = all(os.path.isdir(f'../sec_data/{name}') for name in required_subdirs)

if not keys_ready:
    # Remove any partial key directory before regenerating everything.
    if os.path.isdir('../sec_data'):
        shutil.rmtree('../sec_data')

    create_keys('../sec_data/')

### Generate Data

In [8]:
from ndn_compute_jsonl_generator import generate_large_jsonl
from ndn_compute_fs_creator import create_fs_from_directory

In [9]:
# Generate flat files: one synthetic JSONL event log per toy application.
import os

# Approximate target size (in MB) of the log generated for each application.
FLAT_LOG_SIZES_MB = {'appA': 200, 'appB': 500}

for app_name, size_mb in FLAT_LOG_SIZES_MB.items():
    app_dir = f'../generated_data/flat/{app_name}'
    # Portable equivalent of `mkdir -p`: create the directory (and parents) if missing.
    os.makedirs(app_dir, exist_ok=True)
    generate_large_jsonl(filename=f'{app_dir}/events.log.jsonl', target_size_mb=size_mb)

Generating JSONL file of approximately 200MB...
Progress: 22.76% complete
Records written: 100,000
Current file size: 45.51MB
Progress: 45.51% complete
Records written: 200,000
Current file size: 91.03MB
Progress: 68.27% complete
Records written: 300,000
Current file size: 136.55MB
Progress: 91.03% complete
Records written: 400,000
Current file size: 182.06MB

File generation complete!
Final file size: 200.00MB
Total records written: 439,417
Generating JSONL file of approximately 500MB...
Progress: 9.10% complete
Records written: 100,000
Current file size: 45.51MB
Progress: 18.21% complete
Records written: 200,000
Current file size: 91.03MB
Progress: 27.31% complete
Records written: 300,000
Current file size: 136.55MB
Progress: 36.41% complete
Records written: 400,000
Current file size: 182.06MB
Progress: 45.52% complete
Records written: 500,000
Current file size: 227.58MB
Progress: 54.62% complete
Records written: 600,000
Current file size: 273.10MB
Progress: 63.72% complete
Records w

In [10]:
# Distribute files into a toy distributed filesystem
import os

FLAT_DIR = "../generated_data/flat"                # input: the flat files generated earlier
DISTRIBUTED_DIR = "../generated_data/distributed"  # output: the partitioned layout

# Portable equivalent of `mkdir -p`: create the output directory (and parents) if missing.
os.makedirs(DISTRIBUTED_DIR, exist_ok=True)

create_fs_from_directory(
    in_dir=FLAT_DIR,
    out_dir=DISTRIBUTED_DIR,
    num_partitions=2,  # NOTE(review): presumably one partition per worker — confirm
    num_copies=1,
    chunk_size=64,     # NOTE(review): unit is not visible from this notebook — confirm
)

../generated_data/flat/appB/events.log.jsonl
../generated_data/flat/appA/events.log.jsonl


### Starting the cluster

Run `docker-compose up` in a separate terminal, from the `ndn-compute` repository root, so that the containers' log output stays visible in the foreground while you work through the rest of this notebook.

In [2]:
# Start the cluster from a separate terminal rather than from this cell, so that its
# output stays visible in the foreground:
# !docker-compose up

# Make sure your cluster is running: list the compose services and their status
!docker-compose ps

NAME      IMAGE                 COMMAND                  SERVICE   CREATED             STATUS         PORTS
driver1   ndn-compute-driver    "python -m ndn_compu…"   driver    3 minutes ago       Up 3 minutes   0.0.0.0:5214->5214/tcp, :::5214->5214/tcp
nfd1      ndn-compute-nfd       "/usr/bin/nfd --conf…"   nfd       About an hour ago   Up 3 minutes   6363/tcp, 9696/tcp, 6363/udp
worker1   ndn-compute-worker1   "python -m ndn_compu…"   worker1   3 minutes ago       Up 3 minutes   
worker2   ndn-compute-worker2   "python -m ndn_compu…"   worker2   3 minutes ago       Up 3 minutes   


IMPORTANT: the driver, NFD, and worker container(s) should all be listed with an `Up` status before you continue.

### Using the engine

In [1]:
from ndn_compute_client import NdnComputeClient

In [2]:
# HTTP address of the driver; 5214 is the port published by the driver container.
DRIVER_URL = 'http://localhost:5214'
client = NdnComputeClient(DRIVER_URL)

#### Component Sanity Checks

In [5]:
# Trivial end-to-end test to make sure driver and worker are talking to each other;
# the expected result is 17.
client.add(8, 9)

17

In [17]:
# Sanity-check the result store and large transfers by fetching random bytes via the client
import zlib

random_bytes = client.urandom()  # This may take a while
checksum = zlib.crc32(random_bytes)
# Does it match what was computed worker-side? (See docker logs)
print(checksum)

3354918877


#### Run Distributed Computations on Example dataset

In [3]:
# Create dataset from remote file. The name appears to mirror the file's path relative to
# ../generated_data/flat, where appB/events.log.jsonl was generated earlier.
dataset = client.create_dataset("appB/events.log.jsonl")

In [4]:
# Do a transformation, like filter: keep only purchase events made on a tablet with Safari,
# then collect() the matching rows back into this notebook.
# NOTE(review): the predicate is handed to the remote engine; how it is serialized is not
# visible here, so it is kept as a plain single-expression lambda.
pred = lambda row: row['event_type'] == 'purchase' and row['device'] == 'tablet' and row['browser'] == 'safari'
ipad_purchases = dataset.filter(pred).collect()

In [5]:
# Preview the first few collected rows
ipad_purchases.head()

Unnamed: 0,id,timestamp,user_id,event_type,device,browser,location,session_duration,metadata
0,SJ4pOPr6MJIjPbVW,2024-04-30 16:38:42.701,8IgVthq7,purchase,tablet,safari,"{'country': 'JP', 'city': 'Tokyo', 'latitude':...",3297,"{'platform_version': '8.7.4', 'user_agent': 'J..."
1,tryk1tLK1L0krcSG,2024-12-19 16:38:42.703,iTl4HFp0,purchase,tablet,safari,"{'country': 'JP', 'city': 'New York', 'latitud...",326,"{'platform_version': '1.8.2', 'user_agent': 'b..."
2,WWwDFPubXDrB8gQ7,2024-10-17 16:38:42.703,thgUyh9g,purchase,tablet,safari,"{'country': 'JP', 'city': 'Paris', 'latitude':...",2179,"{'platform_version': '10.4.4', 'user_agent': '..."
3,gBJ3FgDRMtdCwLRR,2024-07-24 16:38:42.706,DcjXeWME,purchase,tablet,safari,"{'country': 'FR', 'city': 'Sydney', 'latitude'...",361,"{'platform_version': '4.2.4', 'user_agent': 'l..."
4,DnQxtLPtzf9t4aPq,2024-03-08 16:38:42.706,bEdrEKEC,purchase,tablet,safari,"{'country': 'FR', 'city': 'London', 'latitude'...",3019,"{'platform_version': '5.9.6', 'user_agent': 'l..."


In [6]:
# Stress test: chain two map transformations over the appA log, caching after each step.
# NOTE: this rebinds `dataset`, which previously referred to the appB log.
dataset = client.create_dataset("appA/events.log.jsonl")
# Transform data: replace each value with the length of its string representation.
# NOTE(review): despite the `id_lens` name, the recorded output further down suggests the
# lambda is applied to every field, not only `id` — confirm.
id_lens = dataset.map(lambda r: len(str(r)))
id_lens_cached = id_lens.cache() # Cache it to materialize results

# Transform data again: add one to every value produced by the previous step
id_lens_plus_one = id_lens_cached.map(lambda r: r + 1)
id_lens_plus_one_cached = id_lens_plus_one.cache() # Cache it to materialize results, but using previous cache

In [7]:
# Pull the final (twice-mapped, cached) result back into the notebook
df = id_lens_plus_one_cached.collect()

In [8]:
# The row count should equal the "Total records written" reported when the appA log was generated
df.shape

(439417, 9)

In [9]:
# Each value should be the string length of the original field, plus one
df.head()

Unnamed: 0,id,timestamp,user_id,event_type,device,browser,location,session_duration,metadata
0,17,27,9,9,7,7,99,4,166
1,17,27,9,9,8,8,101,4,166
2,17,27,9,7,7,8,100,4,166
3,17,27,9,10,7,5,99,5,167
4,17,27,9,6,7,8,100,4,166
