In [3]:
# --- Project root & sys.path setup (for notebooks inside /src) ---
# Ensure the notebook runs modules like `python -m src...` by setting CWD and sys.path
import os, sys, pathlib

# Compute the project root by going one level up from the notebook's folder
ROOT = pathlib.Path(os.getcwd()).resolve().parents[0]  # assumes notebook is in /src/

# Change current working directory to the project root
os.chdir(ROOT)

# Add the project root to sys.path (so 'import src.*' works)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print("CWD:", os.getcwd())
print("sys.executable:", sys.executable)




CWD: /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project
sys.executable: /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13


# Orchestrator Notebook
This notebook runs all the pipeline processes in sequence.



## Configuration and modules definition

Defines each module in order to be executed. The order is the following:
1. MinIO and ChromaDB managers
2. Data ingestion
3. Landing zone
4. Formatted zone
5. Trusted zone
6. Exploitation zone

In [6]:
# --- Configuration ---
from pathlib import Path
import os


MODULES = [
    # --------- Managers ---------
    {
        "name": "MinIO manager",
        "module": "src.common.minio_manager",  
        "args": []
    },
    {
        "name": "ChromaDB manager",
        "module": "src.common.chroma_client", 
        "args": []
    },

    # # --------- Data ingestion ---------
    # {
    #     "name": "Text ingestion",
    #     "module": "src.data_management.data_ingestion.text_ingestion",
    #     "args": []
    # },
    # {
    #     "name": "Image ingestion",
    #     "module": "src.data_management.data_ingestion.image_ingestion",
    #     "args": []
    # },
    # {
    #     "name": "Video ingestion",
    #     "module": "src.data_management.data_ingestion.video_ingestion",
    #     "args": []
    # },

    # # --------- Landing zone --------- 
    # {
    #     "name": "Landing zone",
    #     "module": "src.data_management.landing_zone.landing_zone",
    #     "args": []
    # },


    # # --------- Formatted zone --------- 
    # {
    #     "name": "Formatted text",
    #     "module": "src.data_management.formatted_zone.formatted_text",
    #     "args": []
    # },
    # {
    #     "name": "Formatted images",
    #     "module": "src.data_management.formatted_zone.formatted_images",
    #     "args": []
    # },
    # {
    #     "name": "Formatted videos",
    #     "module": "src.data_management.formatted_zone.formatted_videos",
    #     "args": []
    # },


    # # --------- Trusted zone --------- 
    # {
    #     "name": "Trusted text",
    #     "module": "src.data_management.trusted_zone.trusted_text",
    #     "args": []
    # },
    # {
    #     "name": "Trusted images",
    #     "module": "src.data_management.trusted_zone.trusted_images",
    #     "args": []
    # }    
]


In [7]:
# --- Helpers to run modules ---
import subprocess, sys, shlex, time

def run_module(module: str, args=None, env=None, log_file=None, check=True, timeout=None):
    """Run a Python module with `python -m <module> [args...]` and stream output."""
    if args is None: args = []
    cmd = [sys.executable, "-m", module, *args]
    print("\n$", " ".join(shlex.quote(c) for c in cmd))
    start = time.time()
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env=env
    )
    # Stream to notebook and optional log file
    with (open(log_file, "a", encoding="utf-8") if log_file else open(os.devnull, "w")) as lf:
        for line in proc.stdout:
            print(line, end="")
            if log_file:
                lf.write(line)
        ret = proc.wait(timeout=timeout)
        duration = time.time() - start
        print(f"\n[INFO] Module finished with code {ret} in {duration:.1f}s")
        if log_file:
            lf.write(f"\n[INFO] Module finished with code {ret} in {duration:.1f}s\n")
    if check and ret != 0:
        raise RuntimeError(f"Module {module} exited with code {ret}")
    return ret


In [10]:
# --- Run pipeline ---
from datetime import datetime

print("=== Pipeline start:", datetime.now().isoformat(), "===")

for step in MODULES:
    print(f"\n=== Step: {step['name']} ===")
    run_module(
        step["module"],
        args=step.get("args", []),
        log_file=str(None),  # set to None to disable file logging
        check=True,
        timeout=None
    )
print("\n=== Pipeline completed successfully ===")

=== Pipeline start: 2025-10-12T01:22:19.196582 ===

=== Step: MinIO manager ===

$ /home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13 -m src.common.minio_manager
/home/paloli/Documentos/MDS/ADSDB/ADSDB-Project/.venv/bin/python3.13: Error while finding module specification for 'src.common.minio_manager' (ModuleNotFoundError: No module named 'src')

[INFO] Module finished with code 1 in 0.0s


RuntimeError: Module src.common.minio_manager exited with code 1