# Orchestrator Notebook
This notebook runs all the pipeline processes in sequence.



## Project root setup

This section sets the current working directory to the project root and adds it to `sys.path`, so that modules can be run as `python -m src...` and imported as `src.*`.

In [1]:
import os
import pathlib
import sys


def find_project_root(start=None, marker="src"):
    """
    Locate the project root: the nearest directory at or above `start`
    that contains a sub-directory named `marker`.

    Parameters
    ----------
    start : path-like, optional
        Directory to start searching from. Defaults to the current working directory.
    marker : str
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    pathlib.Path
        The resolved project root. If no ancestor contains `marker`, the parent
        of `start` is returned (the previous "one level up" behaviour).
    """
    here = pathlib.Path(os.getcwd() if start is None else start).resolve()
    # Check the starting folder first so that re-running this cell after the
    # chdir below keeps the same root instead of climbing one more level.
    for candidate in (here, *here.parents):
        if (candidate / marker).is_dir():
            return candidate
    # No marker found anywhere: fall back to one level up from the start folder
    return here.parent


# Project root = closest folder (CWD or one of its ancestors) that holds 'src/'
ROOT = find_project_root()

# Change current working directory to the project root
os.chdir(ROOT)

# Add the project root to sys.path (so 'import src.*' works)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print("CWD:", os.getcwd())
print("sys.executable:", sys.executable)




CWD: /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project
sys.executable: /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13


## Configuration and modules definition

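Defines the modules to run, in execution order. The planned order is listed below; steps 4–6 are not enabled in `MODULES` yet (the formatted- and trusted-zone entries are commented out, and there is no exploitation-zone entry):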
1. MinIO and ChromaDB managers
2. Data ingestion
3. Landing zone
4. Formatted zone
5. Trusted zone
6. Exploitation zone

In [2]:
# --- Configuration ---
from pathlib import Path
import os


# Pipeline steps, listed in execution order as (display name, dotted module path)
# pairs. Every pair is expanded into a dict with the keys "name", "module" and
# "args"; each step gets its own empty "args" list.
# Pairs that are commented out are steps that are currently disabled.
MODULES = [
    {"name": label, "module": dotted_path, "args": []}
    for label, dotted_path in (
        # --------- Managers ---------
        ("MinIO manager", "src.common.minio_manager"),
        ("ChromaDB manager", "src.common.chroma_client"),

        # --------- Data ingestion ---------
        ("Text ingestion", "src.data_management.data_ingestion.text_ingestion"),
        ("Image ingestion", "src.data_management.data_ingestion.image_ingestion"),
        ("Video ingestion", "src.data_management.data_ingestion.video_ingestion"),

        # --------- Landing zone ---------
        ("Landing zone", "src.data_management.landing_zone.landing_zone"),

        # # --------- Formatted zone ---------
        # ("Formatted text", "src.data_management.formatted_zone.formatted_text"),
        # ("Formatted images", "src.data_management.formatted_zone.formatted_images"),
        # ("Formatted videos", "src.data_management.formatted_zone.formatted_videos"),

        # # --------- Trusted zone ---------
        # ("Trusted text", "src.data_management.trusted_zone.trusted_text"),
        # ("Trusted images", "src.data_management.trusted_zone.trusted_images"),
    )
]


## Helpers to run modules

In [3]:
# --- Helpers to run modules ---
import subprocess, sys, shlex, time

def run_module(module: str, args=None, env=None, check=True, timeout=None, cwd=None):
    """
    Run a Python module with `python -m <module> [args...]` and stream output.

    The child is launched with the current interpreter (`sys.executable`) and
    the `-u` flag. Its stdout/stderr are neither captured nor redirected here.

    Parameters
    ----------
    module : str
        Dotted module path passed to `-m`.
    args : iterable, optional
        Extra command-line arguments; each one is converted with `str()`.
    env : mapping or iterable of (key, value) pairs, optional
        Overrides applied on top of a copy of `os.environ`. Values that are not
        already str / bytes / path-like are converted with `str()`, the same
        way `args` are.
    check : bool
        When true, a non-zero exit code raises `subprocess.CalledProcessError`.
    timeout : float, optional
        Forwarded to `subprocess.run`.
    cwd : path-like, optional
        Working directory for the child, forwarded to `subprocess.run`.

    Returns
    -------
    int
        The exit code of the child process.

    Raises
    ------
    subprocess.CalledProcessError
        If `check` is true and the child exits with a non-zero code.
    subprocess.TimeoutExpired
        Raised by `subprocess.run` if `timeout` elapses.
    """
    cmd = [sys.executable, "-u", "-m", module, *map(str, args or [])]
    print("\n$", " ".join(shlex.quote(c) for c in cmd))  # shows command to be executed in console

    child_env = os.environ.copy()
    if env:
        for key, value in dict(env).items():
            # subprocess only accepts str / bytes / path-like environment values
            if not isinstance(value, (str, bytes, os.PathLike)):
                value = str(value)
            child_env[key] = value

    # perf_counter is monotonic, so the duration is immune to system clock changes
    started = time.perf_counter()
    rc = subprocess.run(cmd, env=child_env, cwd=cwd, check=False, timeout=timeout).returncode
    elapsed = time.perf_counter() - started

    if check and rc != 0:
        # Report timing and exit code for failed steps too, before raising
        print(f"[failed with exit code {rc} after {elapsed:.1f}s]")
        raise subprocess.CalledProcessError(rc, cmd)
    print(f"[done in {elapsed:.1f}s]")
    return rc


In [4]:
# --- Run pipeline ---
from datetime import datetime

# Banner with the ISO-formatted start time of this run
pipeline_started_at = datetime.now().isoformat()
print("=== Pipeline start:", pipeline_started_at, "===")

# Execute every configured step in list order. check=True makes run_module
# raise on a non-zero exit code, so a failing step stops the loop.
for step in MODULES:
    step_banner = f"\n=== Step: {step['name']} ==="
    print(step_banner)
    step_args = step.get("args", [])  # "args" is optional per step
    run_module(module=step["module"], args=step_args, check=True, timeout=None)

# Only reached when no step raised
print("\n=== Pipeline completed successfully ===")

=== Pipeline start: 2025-10-14T09:37:26.012960 ===

=== Step: MinIO manager ===

$ /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13 -u -m src.common.minio_manager
[OK]: landing-zone/temporal_landing/
[OK]: landing-zone/persistent_landing/image_data/
[OK]: landing-zone/persistent_landing/video_data/
[OK]: landing-zone/persistent_landing/text_data/
[OK]: formatted-zone/formatted/image_data/
[OK]: formatted-zone/formatted/video_data/
[OK]: formatted-zone/formatted/text_data/
[OK]: trusted-zone/trusted/image_data/
[OK]: trusted-zone/trusted/video_data/
[OK]: trusted-zone/trusted/text_data/
[OK]: rejected-zone/rejected/image_data/
[OK]: rejected-zone/rejected/video_data/
[OK]: rejected-zone/rejected/text_data/
[done in 0.6s]

=== Step: ChromaDB manager ===

$ /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13 -u -m src.common.chroma_client
[done in 4.2s]

=== Step: Text ingestion ===

$ /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.1