In [1]:
#@title 0) Setup & Load Images Metadata
import os, pandas as pd
from PIL import Image



meta = [
    {"image_id": "img1", "path": "/content/img_1.png", "caption": "Blue candlestick chart showing stock price movement over time."},
    {"image_id": "img2", "path": "/content/img_2.png", "caption": "Line chart illustrating upward trend in stock index."},
    {"image_id": "img3", "path": "/content/img_3.png", "caption": "Financial candlestick chart with volume overlay."},
    {"image_id": "img4", "path": "/content/img_4.png", "caption": "Combined line and candlestick chart of stock & trend indicator."}
]

df_imgs = pd.DataFrame(meta)
print("✅ Loaded image metadata:", len(df_imgs))
display(df_imgs.head())


✅ Loaded image metadata: 4


Unnamed: 0,image_id,path,caption
0,img1,/content/img_1.png,Blue candlestick chart showing stock price mov...
1,img2,/content/img_2.png,Line chart illustrating upward trend in stock ...
2,img3,/content/img_3.png,Financial candlestick chart with volume overlay.
3,img4,/content/img_4.png,Combined line and candlestick chart of stock &...


In [2]:
# 1) Load text corpus (from Track A CSV)
import pandas as pd
text_corpus = pd.read_csv("/content/corpus_chunks.csv")
text_corpus = text_corpus[['doc_id', 'text']]
print("✅ Loaded text corpus:", text_corpus.shape)
display(text_corpus.head(3))

# 2) Load CLIP model (for both text + images)
from transformers import CLIPModel, CLIPProcessor
import torch
import numpy as np

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

# 3) Build text embeddings
text_emb = {}
for _, row in text_corpus.iterrows():
    inputs = processor(text=row.text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        emb = clip_model.get_text_features(**inputs).squeeze().cpu().numpy()
    emb = emb / (np.linalg.norm(emb) + 1e-9)
    text_emb[row.doc_id] = emb.astype("float32")

print("✅ Text embeddings built:", len(text_emb))

# 4) Build image embeddings
img_emb = {}
for _, row in df_imgs.iterrows():
    image = Image.open(row['path']).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs).squeeze().cpu().numpy()
    emb = emb / (np.linalg.norm(emb) + 1e-9)
    img_emb[row.image_id] = emb.astype("float32")

print("✅ Image embeddings built:", len(img_emb))

# 5) Cosine similarity + query encoder
def cosine(a,b):
    return float(a @ b / (np.linalg.norm(a)+1e-9) / (np.linalg.norm(b)+1e-9))

def encode_text(q):
    inputs = processor(text=q, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        emb = clip_model.get_text_features(**inputs).squeeze().cpu().numpy()
    emb = emb / (np.linalg.norm(emb) + 1e-9)
    return emb.astype("float32")


✅ Loaded text corpus: (10, 2)


Unnamed: 0,doc_id,text
0,Dave2025_MASStock,This work surveys multi-agent architectures fo...
1,Dave2025_MASStock,Experiments compare multi-agent systems on Tes...
2,FinVision2024,The FinVision framework introduces a multimoda...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Text embeddings built: 5
✅ Image embeddings built: 4


In [3]:
# 3) Retrieval Modes

def retrieve_text(query, k=3):
    q = encode_text(query)
    scores = [(doc, cosine(q, text_emb[doc])) for doc in text_emb]
    scores = sorted(scores, key=lambda x: -x[1])[:k]
    return [(doc, round(score, 3), text_corpus.loc[text_corpus['doc_id']==doc, 'text'].values[0]) for doc, score in scores]

def retrieve_image_by_text(query, k=3):
    q = encode_text(query)
    scores = [(img, cosine(q, img_emb[img])) for img in img_emb]
    scores = sorted(scores, key=lambda x: -x[1])[:k]
    return [(img, round(score, 3), df_imgs.loc[df_imgs['image_id']==img, 'caption'].values[0]) for img, score in scores]

def retrieve_by_image(image_id, k=3):
    q = img_emb[image_id]
    # Image -> Text
    t_scores = [(doc, cosine(q, text_emb[doc])) for doc in text_emb]
    t_scores = sorted(t_scores, key=lambda x: -x[1])[:k]
    top_text = [(doc, round(score, 3), text_corpus.loc[text_corpus['doc_id']==doc, 'text'].values[0]) for doc, score in t_scores]
    # Image -> Image
    i_scores = [(img, cosine(q, img_emb[img])) for img in img_emb if img != image_id]
    i_scores = sorted(i_scores, key=lambda x: -x[1])[:k]
    top_imgs = [(img, round(score, 3), df_imgs.loc[df_imgs['image_id']==img, 'caption'].values[0]) for img, score in i_scores]
    return top_text, top_imgs

# 🔹 Demo Run
print("\nText → Docs")
for r in retrieve_text("How does sentiment correlate with stock prices?", 3):
    print(r)

print("\nText → Images")
for r in retrieve_image_by_text("NASDAQ candlestick chart", 3):
    print(r)

print("\nImage → Docs & Images (using img2)")
t_hits, i_hits = retrieve_by_image("img2", 3)
print("Image→Text results:")
for r in t_hits: print(r)
print("Image→Image results:")
for r in i_hits: print(r)



Text → Docs
('FinVision2024', 0.83, 'The FinVision framework introduces a multimodal multi-agent approach to stock prediction. Agents use candlestick chart embeddings and textual news to improve decision making.')
('FSA_RAG2023', 0.816, 'We propose retrieval-augmented large language models for financial sentiment analysis. External knowledge improves factual grounding in responses to market-related queries.')
('SentimentSurvey2024', 0.8, 'Financial Sentiment Analysis is divided into two main streams: method development, including lexicon-based and machine learning approaches, and market applications such as forecasting and risk assessment.')

Text → Images
('img4', 0.268, 'Combined line and candlestick chart of stock & trend indicator.')
('img2', 0.267, 'Line chart illustrating upward trend in stock index.')
('img1', 0.248, 'Blue candlestick chart showing stock price movement over time.')

Image → Docs & Images (using img2)
Image→Text results:
('FinVision2024', 0.297, 'The FinVision f

In [4]:
# 4) Prompt Assembly
def assemble_prompt(query, text_hits, image_hits):
    # text_hits is [(doc_id, score, snippet), ...]
    # image_hits is [(img_id, score, caption), ...]
    tbits = [f"[{doc}] {snippet}" for doc, _, snippet in text_hits]
    ibits = [f"[{img}] {caption}" for img, _, caption in image_hits]

    return f"""System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: {query}
Evidence:
- Text: {' | '.join(tbits)}
- Images: {' | '.join(ibits)}
Answer:
"""

# 🔹 Demo Run
q = "How does sentiment correlate with Tesla stock price?"
print(assemble_prompt(q, retrieve_text(q, 2), retrieve_image_by_text(q, 2)))


System: Answer using ONLY the evidence below. Cite [doc_id] or [image_id].
Query: How does sentiment correlate with Tesla stock price?
Evidence:
- Text: [Dave2025_MASStock] This work surveys multi-agent architectures for stock market forecasting. We examine reinforcement learning, attention-based networks, and simulation-driven trading strategies. The focus is on how agents cooperate in real-time trading. | [FinVision2024] The FinVision framework introduces a multimodal multi-agent approach to stock prediction. Agents use candlestick chart embeddings and textual news to improve decision making.
- Images: [img4] Combined line and candlestick chart of stock & trend indicator. | [img2] Line chart illustrating upward trend in stock index.
Answer:



In [5]:
print('A) Text-only -> retrieve text + images-by-text')
q1 = "How does sentiment correlate with Tesla stock price?"
print(f"Query: {q1}\n")

print("Text results:")
for r in retrieve_text(q1, 3):
    print(r)

print("\nImages-from-text results:")
for r in retrieve_image_by_text(q1, 3):
    print(r)

print('\nB) Image-only -> retrieve related docs & similar images')
t_hits, i_hits = retrieve_by_image('img2', 3)

print("\nDocs related to img2:")
for r in t_hits:
    print(r)

print("\nSimilar images to img2:")
for r in i_hits:
    print(r)


A) Text-only -> retrieve text + images-by-text
Query: How does sentiment correlate with Tesla stock price?

Text results:
('Dave2025_MASStock', 0.799, 'This work surveys multi-agent architectures for stock market forecasting. We examine reinforcement learning, attention-based networks, and simulation-driven trading strategies. The focus is on how agents cooperate in real-time trading.')
('FinVision2024', 0.778, 'The FinVision framework introduces a multimodal multi-agent approach to stock prediction. Agents use candlestick chart embeddings and textual news to improve decision making.')
('FSA_RAG2023', 0.752, 'We propose retrieval-augmented large language models for financial sentiment analysis. External knowledge improves factual grounding in responses to market-related queries.')

Images-from-text results:
('img4', 0.291, 'Combined line and candlestick chart of stock & trend indicator.')
('img2', 0.289, 'Line chart illustrating upward trend in stock index.')
('img1', 0.282, 'Blue cand