In [8]:
# ✅ Section 1: Install deps & log environment (Colab-safe)
!pip -q uninstall -y numpy torch torchvision sentence-transformers transformers accelerate pillow huggingface-hub langchain-community langchain-core langchain-text-splitters pypdf
!pip -q install -U "numpy==1.26.4" "requests==2.32.4" \
                  "sentence-transformers>=3.0.1" "transformers>=4.44.0" \
                  "accelerate>=0.34.2" "pillow>=10.3.0" \
                  "torch>=2.2.0" "torchvision>=0.17.0" "huggingface-hub>=0.24.0" \
                  "langchain-community>=0.0.33" "langchain-core>=0.1.27" "langchain-text-splitters>=0.0.1" "pypdf>=4.1.0"

import sys, platform, json, os, pathlib, datetime
import numpy as np
import torch, torchvision
import PIL
import sentence_transformers, transformers
import langchain_community, langchain_core, langchain_text_splitters

env = {
    "timestamp": datetime.datetime.now().isoformat(timespec="seconds"),
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "numpy": np.__version__,
    "torch": torch.__version__,
    "torch_cuda": torch.cuda.is_available(),
    "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
    "torchvision": torchvision.__version__,
    "pillow": PIL.__version__,
    "sentence_transformers": sentence_transformers.__version__,
    "transformers": transformers.__version__,
    "langchain_community": langchain_community.__version__,
    "langchain_core": langchain_core.__version__,
    # "langchain_text_splitters": langchain_text_splitters.__version__, # Removed as it causes an AttributeError
}
print(json.dumps(env, indent=2))
with open("env_multimodal.json","w") as f: json.dump(env, f, indent=2)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m{
  "timestamp": "2025-09-26T04:59:33",
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "numpy": "1.26.4",
  "torch": "2.8.0+cu128",
  "torch_cuda": false,
  "device": "CPU",
  "torchvision": "0.23.0+cu128",
  "pillow": "11.3.0",
  "sentence_transformers": "5.1.1",
  "transformers": "4.56.2",
  "

In [3]:
# SECTION 2 — FOLDERS & UPLOADS

from pathlib import Path
IMG_DIR = Path("mm_images"); IMG_DIR.mkdir(exist_ok=True)
TXT_DIR = Path("corpus");    TXT_DIR.mkdir(exist_ok=True)

# Upload images and (optionally) PDFs/TXTs
try:
    from google.colab import files
    print("Upload 5–10 PNG/JPG images (multi-select):")
    up_imgs = files.upload()
    for name, data in (up_imgs or {}).items():
        if name.lower().endswith((".png",".jpg",".jpeg",".webp",".bmp",".tif",".tiff")):
            (IMG_DIR / name).write_bytes(data)

    print("\n(Optional) Upload PDFs/TXTs (or skip if you already have ./corpus):")
    up_txt = files.upload()
    for name, data in (up_txt or {}).items():
        (TXT_DIR / name).write_bytes(data)

    print("Saved images:", [p.name for p in IMG_DIR.iterdir() if p.is_file()])
    print("Saved text files:", [p.name for p in TXT_DIR.iterdir() if p.is_file()])
except Exception as e:
    print("Colab file upload not available:", e)


Upload 5–10 PNG/JPG images (multi-select):


Saving main_table.png to main_table.png
Saving framework.png to framework.png
Saving DecisionTreeClassifier.png to DecisionTreeClassifier.png
Saving Bernoulli.png to Bernoulli.png
Saving AdaBoostClassifier.png to AdaBoostClassifier.png
Saving ablation1.png to ablation1.png
Saving ablation4.png to ablation4.png
Saving ablation2.png to ablation2.png

(Optional) Upload PDFs/TXTs (or skip if you already have ./corpus):


Saving NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf to NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
Saving NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf to NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf
Saving NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf to NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf
Saved images: ['DecisionTreeClassifier.png', 'ablation4.png', 'Bernoulli.png', 'AdaBoostClassifier.png', 'ablation1.png', 'framework.png', 'main_table.png', 'ablation2.png']
Saved text files: ['NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf', 'NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf

In [9]:
# SECTION 3 — LOAD & CHUNK TEXT

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

docs = []
for p in sorted(TXT_DIR.iterdir()):
    if not p.is_file():
        continue
    ext = p.suffix.lower()
    try:
        if ext == ".pdf":
            docs.extend(PyPDFLoader(str(p)).load())
        elif ext in [".txt",".md",".text"]:
            docs.extend(TextLoader(str(p), encoding="utf-8").load())
    except Exception as e:
        print(f"[WARN] Could not read {p.name}: {e}")

# Fallback tiny sample if no docs
if not docs:
    (TXT_DIR / "sample.txt").write_text(
        "This document references Chart 1 and discusses trend 1. It compares with Chart 2.",
        encoding="utf-8"
    )
    docs.extend(TextLoader(str(TXT_DIR / "sample.txt"), encoding="utf-8").load())

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(docs)

import pandas as pd
text_corpus = pd.DataFrame({
    "doc_id":   [f"doc{i}" for i in range(len(chunks))],
    "source":   [c.metadata.get("source") or c.metadata.get("file_path") or "uploaded" for c in chunks],
    "text":     [c.page_content for c in chunks],
})
print("✅ Text chunks:", len(text_corpus))
display(text_corpus.head(3))

✅ Text chunks: 1066


Unnamed: 0,doc_id,source,text
0,doc0,corpus/NeurIPS-2024-can-large-language-model-a...,Can Large Language Model Agents Simulate\nHuma...
1,doc1,corpus/NeurIPS-2024-can-large-language-model-a...,8University of Michigan 9Santa Fe Institute 10...
2,doc2,corpus/NeurIPS-2024-can-large-language-model-a...,"havior? In this paper, we focus on one critica..."


In [10]:
# SECTION 4 — LOAD IMAGES & OPTIONAL CAPTIONS

from PIL import Image
import pandas as pd

img_rows = []
for p in sorted(IMG_DIR.iterdir()):
    if p.suffix.lower() in [".png",".jpg",".jpeg",".webp",".bmp",".tif",".tiff"]:
        img_rows.append({"image_id": p.stem, "path": str(p), "caption": p.stem})
df_imgs = pd.DataFrame(img_rows)
print("✅ Images found:", len(df_imgs))
display(df_imgs.head(3) if len(df_imgs) else df_imgs)

# Optional: auto-caption (falls back to filename on any error)
CAPTION = True
if CAPTION and len(df_imgs):
    try:
        from transformers import pipeline
        cap_pipe = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning",
                            device=0 if torch.cuda.is_available() else -1)
        new_caps = []
        for row in df_imgs.itertuples():
            try:
                pred = cap_pipe(Image.open(row.path))[0]["generated_text"]
            except Exception:
                pred = row.caption
            new_caps.append(pred)
        df_imgs["caption"] = new_caps
        print("📝 Auto-captions generated.")
    except Exception as e:
        print("Captioning not available; keeping filenames as captions. Reason:", e)


✅ Images found: 8


Unnamed: 0,image_id,path,caption
0,AdaBoostClassifier,mm_images/AdaBoostClassifier.png,AdaBoostClassifier
1,Bernoulli,mm_images/Bernoulli.png,Bernoulli
2,DecisionTreeClassifier,mm_images/DecisionTreeClassifier.png,DecisionTreeClassifier


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


📝 Auto-captions generated.


In [11]:
# SECTION 5 — EMBEDDINGS

from sentence_transformers import SentenceTransformer
import numpy as np
from PIL import Image

# Text-only embedding
st_text = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
text_vecs_minilm = st_text.encode(
    text_corpus["text"].tolist(),
    batch_size=32, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
)

# CLIP (shared space for text & images)
st_clip = SentenceTransformer("clip-ViT-B-32")

# Images -> CLIP
pil_images = [Image.open(p).convert("RGB") for p in df_imgs["path"].tolist()] if len(df_imgs) else []
img_vecs_clip = st_clip.encode(
    pil_images, batch_size=16, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
) if len(pil_images) else np.zeros((0,512), dtype="float32")

# Text -> CLIP (for image->text)
text_vecs_clip = st_clip.encode(
    text_corpus["text"].tolist(),
    batch_size=32, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
)

def topk_cosine(q_vec, mat, k=5):
    if mat.shape[0] == 0:
        return []
    sims = mat @ q_vec
    idx = np.argsort(-sims)[:k]
    return [(int(i), float(sims[i])) for i in idx]

def encode_text_minilm(q: str):
    return st_text.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]

def encode_text_clip(q: str):
    return st_clip.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

0_CLIPModel/pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

0_CLIPModel/model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [12]:
def retrieve_text_by_text(query, k=5):
    q = encode_text_minilm(query)
    hits = topk_cosine(q, text_vecs_minilm, k=k)
    return [(text_corpus.iloc[i].doc_id, hits[j][1]) for j,(i,_) in enumerate(hits)]

def retrieve_images_by_text(query, k=5):
    if len(df_imgs)==0: return []
    q = encode_text_clip(query)
    hits = topk_cosine(q, img_vecs_clip, k=k)
    return [(df_imgs.iloc[i].image_id, hits[j][1]) for j,(i,_) in enumerate(hits)]

def retrieve_by_image(image_id, k=5):
    if len(df_imgs)==0: return [], []
    row = df_imgs[df_imgs.image_id==image_id]
    if row.empty: return [], []
    idx = row.index[0]
    q_vec = img_vecs_clip[idx]
    text_hits = topk_cosine(q_vec, text_vecs_clip, k=k)
    img_hits  = topk_cosine(q_vec, img_vecs_clip,  k=k+1)  # may include itself
    img_hits  = [(i,s) for (i,s) in img_hits if i!=idx][:k]
    text_pairs = [(text_corpus.iloc[i].doc_id, text_hits[j][1]) for j,(i,_) in enumerate(text_hits)]
    img_pairs  = [(df_imgs.iloc[i].image_id,  img_hits[j][1])  for j,(i,_) in enumerate(img_hits)]
    return text_pairs, img_pairs

print("Text->Text sample:", retrieve_text_by_text("trend in chart 1", k=3))
print("Text->Images sample:", retrieve_images_by_text("trend in chart 1", k=3))
if len(df_imgs):
    print("Image->Docs/Images sample:", retrieve_by_image(df_imgs.iloc[0].image_id, k=3))


Text->Text sample: [('doc721', 0.28421539068222046), ('doc197', 0.2832450270652771), ('doc199', 0.27931708097457886)]
Text->Images sample: [('DecisionTreeClassifier', 0.24721695482730865), ('Bernoulli', 0.24139735102653503), ('AdaBoostClassifier', 0.23495811223983765)]
Image->Docs/Images sample: ([('doc448', 0.2896120548248291), ('doc684', 0.28658750653266907), ('doc511', 0.2837580740451813)], [('Bernoulli', 0.9123044013977051), ('DecisionTreeClassifier', 0.908728837966919), ('main_table', 0.7293365597724915)])


In [13]:

def assemble_prompt(query, text_hits, image_hits):
    tbits = []
    for doc_id, _ in text_hits:
        row = text_corpus[text_corpus.doc_id==doc_id].iloc[0]
        tbits.append(f"[{doc_id}] {row.text[:350].replace('\n',' ')}")
    ibits = []
    for image_id, _ in image_hits:
        row = df_imgs[df_imgs.image_id==image_id].iloc[0]
        ibits.append(f"[{image_id}] {row.caption}")
    return f"""System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: {query}
Evidence:
- Text: {' | '.join(tbits) if tbits else '(none)'}
- Images: {' | '.join(ibits) if ibits else '(none)'}
Answer:"""

q_demo = "Which chart shows an upward trend and what does the related text say?"
p_demo = assemble_prompt(q_demo, retrieve_text_by_text(q_demo, k=3), retrieve_images_by_text(q_demo, k=3))
print(p_demo[:800] + "\n...")


System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: Which chart shows an upward trend and what does the related text say?
Evidence:
- Text: [doc906] Figure 5: Richelieu modules benefit different LLMs. The solid line represents the experimental results for Richelieu, while the dashed line corresponds to Cicero. Different colors are used for different LLMs. The horizontal axis represents the logarithm of the number of training sessions, and the vertical axis denotes the rate. Table 3: Ablation st | [doc79] except for the last round. In other words, when the amount sent increases, the amount returned is also likely to increase. And when the amount sent remains unchanged, the amount returned also tends to be unchanged. This reflects the stable relationship between
...


In [14]:

DO_GENERATE = True  # set False to skip generation

if DO_GENERATE:
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tok = AutoTokenizer.from_pretrained(model_id)
    m   = AutoModelForCausalLM.from_pretrained(model_id)
    gen = pipeline("text-generation", model=m, tokenizer=tok, max_new_tokens=200,
                   device=0 if torch.cuda.is_available() else -1)
    print("\n--- Generated ---")
    print(gen(p_demo)[0]["generated_text"])

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu



--- Generated ---
System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: Which chart shows an upward trend and what does the related text say?
Evidence:
- Text: [doc906] Figure 5: Richelieu modules benefit different LLMs. The solid line represents the experimental results for Richelieu, while the dashed line corresponds to Cicero. Different colors are used for different LLMs. The horizontal axis represents the logarithm of the number of training sessions, and the vertical axis denotes the rate. Table 3: Ablation st | [doc79] except for the last round. In other words, when the amount sent increases, the amount returned is also likely to increase. And when the amount sent remains unchanged, the amount returned also tends to be unchanged. This reflects the stable relationship between trust and reciprocity in humans. Specifically, the “Returned/3×Sent Ratio” in Figure 6 is | [doc640] among the agents over time, with lower entropy indicating higher consensus. The

In [15]:
import pandas as pd
rows = []
queries = [
    "Summarize the trend in Chart 1",
    "Which figure discusses baseline performance?",
    "Find a figure related to evaluation metrics",
]
for q in queries:
    t_hits = retrieve_text_by_text(q, 3)
    i_hits = retrieve_images_by_text(q, 3)
    rows.append({
        "query": q,
        "text@3": [d for d,_ in t_hits],
        "images@3": [i for i,_ in i_hits],
        "n_text": len(t_hits),
        "n_images": len(i_hits),
    })
df_preview = pd.DataFrame(rows)
display(df_preview)
df_preview.to_csv("trackB_multimodal_preview.csv", index=False)
print("Saved: trackB_multimodal_preview.csv")


Unnamed: 0,query,text@3,images@3,n_text,n_images
0,Summarize the trend in Chart 1,"[doc197, doc199, doc205]","[DecisionTreeClassifier, Bernoulli, AdaBoostCl...",3,3
1,Which figure discusses baseline performance?,"[doc662, doc897, doc619]","[ablation2, ablation4, ablation1]",3,3
2,Find a figure related to evaluation metrics,"[doc651, doc195, doc1041]","[ablation4, ablation2, main_table]",3,3


Saved: trackB_multimodal_preview.csv


In [16]:
import json, datetime
cfg = {
  "timestamp": datetime.datetime.now().isoformat(timespec="seconds"),
  "models": {
    "text_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "clip": "clip-ViT-B-32",
    "generator": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
  },
  "counts": {
    "n_text_chunks": int(len(text_corpus)),
    "n_images": int(len(df_imgs))
  },
  "paths": {"images_dir": str(IMG_DIR), "corpus_dir": str(TXT_DIR)}
}
json.dump(cfg, open("trackB_run_config.json","w"), indent=2)
print("Saved: trackB_run_config.json")


Saved: trackB_run_config.json
