# **1) Setup**

In [1]:
from pathlib import Path
import os, sys, platform, json, subprocess

In [3]:
# Project root is taken to be the parent of the current working directory.
# `.parent` (rather than `parents[0]`) cannot raise IndexError when the
# working directory is itself a filesystem root.
ROOT = Path.cwd().resolve().parent

# Put ROOT at the front of sys.path when it contains `src`.
# The second condition keeps the cell idempotent: re-running it does not
# stack duplicate entries at the front of sys.path.
if (ROOT / "src").exists() and sys.path[:1] != [str(ROOT)]:
    sys.path.insert(0, str(ROOT))

print("Project root:", ROOT)
print("Python:", sys.version)
print("OS:", platform.platform())

Project root: D:\IIT BBS\Job Resources\fiolabs\diagram-intel
Python: 3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]
OS: Windows-10-10.0.26100-SP0


In [4]:
# .env and configs reachability
env_present = (ROOT / ".env").exists()
env_example_present = (ROOT / ".env.example").exists()
print(".env exists:", env_present, " | .env.example:", env_example_present)

# Report, one line per file, whether each expected config exists under ROOT.
config_files = (
    "configs/paths.yaml",
    "configs/base.yaml",
    "configs/pipeline.yaml",
    "configs/models.yaml",
)
for rel_path in config_files:
    print(f"{rel_path} exists:", (ROOT / rel_path).exists())

.env exists: False  | .env.example: True
configs/paths.yaml exists: True
configs/base.yaml exists: True
configs/pipeline.yaml exists: True
configs/models.yaml exists: True


In [5]:
# Poppler (pdftocairo) check
import subprocess, shutil

# Resolve the executable first so a missing install is reported explicitly
# instead of depending on which exception subprocess raises.
pdftocairo_exe = shutil.which("pdftocairo")

if pdftocairo_exe is None:
    print("pdftocairo NOT FOUND. If using conda: conda install -c conda-forge poppler")
else:
    try:
        # Timeout keeps a hung binary from blocking the notebook.
        out = subprocess.run(
            [pdftocairo_exe, "-v"], capture_output=True, text=True, timeout=30
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # e.g. not executable, or it never returned.
        print("pdftocairo found at", pdftocairo_exe, "but could not be run:", repr(exc))
    else:
        # Take the first line of stderr, falling back to stdout when stderr is empty.
        msg = (out.stderr or out.stdout or "").splitlines()
        first_line = msg[0] if msg else "no output"
        print("pdftocairo OK:", out.returncode == 0, "|", first_line)
        if out.returncode != 0:
            # A bare "OK: False | no output" gives nothing to debug with.
            print("  exit code:", out.returncode, "| executable:", pdftocairo_exe)


pdftocairo OK: False | no output


In [6]:
# Torch/Transformers + CUDA
import torch, transformers

cuda_ok = torch.cuda.is_available()

# Library versions and CUDA visibility, one line each.
for label, value in (
    ("Torch:", torch.__version__),
    ("Transformers:", transformers.__version__),
    ("CUDA available:", cuda_ok),
):
    print(label, value)

# Only name the device when CUDA is usable.
if cuda_ok:
    print("CUDA device 0:", torch.cuda.get_device_name(0))


  from .autonotebook import tqdm as notebook_tqdm


Torch: 2.8.0+cpu
Transformers: 4.55.4
CUDA available: False


In [7]:
# Merged config + registry
from src.config.loader import load_cfg
from omegaconf import OmegaConf
cfg = load_cfg()
# Only a 500-character preview of the YAML dump is printed.
print(OmegaConf.to_yaml(cfg)[:500], "...\n")

reg_path = ROOT / "models" / "registry.json"
print("registry.json exists:", reg_path.exists())
if reg_path.exists():
    # Explicit UTF-8 so the read does not depend on the platform default encoding.
    registry_preview = json.dumps(
        json.loads(reg_path.read_text(encoding="utf-8")), indent=2
    )
    print(registry_preview[:800], "...")
else:
    # Previously the read ran unconditionally and crashed on a missing file.
    print("Skipping registry preview:", reg_path, "is missing")


paths:
  data_root: ./data
  input_pdfs: ${paths.data_root}/input_pdfs
  raw: ${paths.data_root}/raw
  interim: ${paths.data_root}/interim
  processed: ${paths.data_root}/processed
  exports: ${paths.data_root}/exports
  model_cache: ./models/cache
runtime:
  device: ${env:DEVICE, "cpu"}
  precision: ${env:PRECISION, "float16"}
  workers: 4
  seed: 42
  dpi: 900
  tile:
    micro_size: 512
    meso_size: 1024
    macro_size: 2048
    overlap: 0.15
logging:
  level: ${env:LOG_LEVEL, "INFO"}
phase ...

registry.json exists: True
{
  "qwen2_vl": {
    "repo_id": "Qwen/Qwen2-VL-7B-Instruct",
    "local_path": "D:\\IIT BBS\\Job Resources\\fiolabs\\diagram-intel\\models\\cache\\Qwen2-VL-7B-Instruct",
    "status": "present"
  },
  "qwen2_vl_2b": {
    "repo_id": "Qwen/Qwen2-VL-2B-Instruct",
    "local_path": "D:\\IIT BBS\\Job Resources\\fiolabs\\diagram-intel\\models\\cache\\Qwen2-VL-2B-Instruct",
    "status": "present"
  },
  "llava_v16_mistral_7b": {
    "repo_id": "llava-hf/llava-v1.6-mi

In [None]:
# Qwen2-VL sanity on CPU (robust "{}" parse from last line)

from transformers import AutoProcessor
import torch, json
from PIL import Image
import numpy as np
from pathlib import Path

def blank_image(w=64,h=64):
    """Build a solid white RGB PIL image that is ``w`` pixels wide and ``h`` pixels tall."""
    # Array layout is (rows, cols, channels), hence height before width.
    white_pixels = np.full((h, w, 3), 255, dtype=np.uint8)
    return Image.fromarray(white_pixels)

def _load_vision_language_model(local_path, dtype):
    """Load the checkpoint at ``local_path`` with the first loader class that works.

    Tries ``AutoModelForImageTextToText``, then ``AutoModelForVision2Seq``, then
    ``Qwen2VLForConditionalGeneration``. If either of the first two fails
    (at import or at load), the error is printed and the next class is tried.
    A failure of the last class propagates to the caller.
    """
    load_kwargs = dict(torch_dtype=dtype, device_map=None, trust_remote_code=True)

    try:
        from transformers import AutoModelForImageTextToText
        return AutoModelForImageTextToText.from_pretrained(local_path, **load_kwargs)
    except Exception as exc:
        # Print the reason instead of hiding it, so a fallback is visible.
        print(f"AutoModelForImageTextToText failed ({exc!r}); trying AutoModelForVision2Seq")

    try:
        from transformers import AutoModelForVision2Seq
        return AutoModelForVision2Seq.from_pretrained(local_path, **load_kwargs)
    except Exception as exc:
        print(f"AutoModelForVision2Seq failed ({exc!r}); trying Qwen2VLForConditionalGeneration")

    from transformers import Qwen2VLForConditionalGeneration
    return Qwen2VLForConditionalGeneration.from_pretrained(local_path, **load_kwargs)


def _last_json_line(text):
    """Return the parsed last line of ``text`` that is a valid ``{...}`` JSON value, else None."""
    for line in reversed(text.strip().splitlines()):
        candidate = line.strip()
        if candidate.startswith("{") and candidate.endswith("}"):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                # Looks like JSON but is not; keep scanning earlier lines.
                continue
    return None


def qwen_json_echo(local_path: str, max_new_tokens=64, strict=False):
    """Sanity-check a local vision-language checkpoint on CPU.

    Loads the processor and model from ``local_path``, sends a blank white
    image plus a prompt asking for ``{}`` on a single line, prints the last
    200 characters of the decoded output, and returns the parsed JSON from the
    last line that looks like a JSON object.

    Args:
        local_path: Directory of the checkpoint to load.
        max_new_tokens: Generation budget passed to ``model.generate``.
        strict: When False (default), a parse failure prints the full output
            and returns ``{}``. That is indistinguishable from a correct
            ``{}`` echo, so pass ``strict=True`` to get a ``ValueError``
            instead.

    Returns:
        The value parsed from the last JSON-looking line, or ``{}`` when no
        such line exists and ``strict`` is False.

    Raises:
        ValueError: If no JSON line is found and ``strict`` is True.
    """
    device = "cpu"
    dtype  = torch.float32

    processor = AutoProcessor.from_pretrained(local_path, trust_remote_code=True)

    model = _load_vision_language_model(local_path, dtype)
    model.eval()

    img = blank_image()
    # Ask for a single-line JSON only.
    prompt = "Return only {} on a single line. Do not add any other text."

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # 1) Get templated text. tokenize=False is explicit so this is a string
    #    regardless of the processor's default.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # 2) Tokenize with images
    inputs = processor(text=[text], images=[img], return_tensors="pt")
    inputs = {k: v.to(device) for k,v in inputs.items()}

    with torch.no_grad():
        out_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    out = processor.batch_decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print("Full output tail:\n", out[-200:])

    # Parse the **last** line that looks like JSON
    parsed = _last_json_line(out)
    if parsed is not None:
        return parsed

    # Fallback: show full output for inspection
    print("Could not find a clean JSON line. Full output:\n", out)
    if strict:
        raise ValueError("No JSON object line found in model output")
    return {}


In [13]:
# Run on 2B (lighter on CPU)
import json

# Look up the local checkpoint directory for the 2B model in the registry.
registry_file = ROOT / "models/registry.json"
reg = json.loads(registry_file.read_text())
qwen2b_entry = reg["qwen2_vl_2b"]
qwen2b_path = str(Path(qwen2b_entry["local_path"]))

print("Testing Qwen2-VL-2B @", qwen2b_path)
echo_result = qwen_json_echo(qwen2b_path)
print(echo_result)

Testing Qwen2-VL-2B @ D:\IIT BBS\Job Resources\fiolabs\diagram-intel\models\cache\Qwen2-VL-2B-Instruct


Loading checkpoint shards: 100%|██████████| 2/2 [00:40<00:00, 20.16s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Full output tail:
 system
You are a helpful assistant.
user
Return only {} on a single line. Do not add any other text.
assistant
{}
{}
