In [5]:
import os, torch, torch._dynamo
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from titans_pytorch import MemoryAsContextTransformer
import torch.nn.functional as F

torch._dynamo.config.suppress_errors = True
device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# ── auth / model ────────────────────────────────────────────────────────────────
load_dotenv();  login(os.getenv("ACCESS_TOKEN"))
model_id   = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer  = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
llm        = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map=device, use_auth_token=True
).eval()



In [6]:
# ── memory transformer ─────────────────────────────────────────────────────────
mem_tx = MemoryAsContextTransformer(
    num_tokens=len(tokenizer),
    dim=llm.config.hidden_size,
    depth=2,
    segment_len=128,
    neural_memory_segment_len=16,
    num_persist_mem_tokens=64,
    num_longterm_mem_tokens=128,
    neural_memory_kwargs=dict(
        heads=8, dim_head=256, pre_rmsnorm=True, post_rmsnorm=True,
        qk_rmsnorm=True, attn_pool_chunks=True, momentum=True, momentum_order=1,
    ),
    use_flex_attn=True,
    sliding_window_attn=False,
).to(device)

In [7]:
import cv2
from ultralytics import YOLO

# ── detect objects in video ────────────────────────────────────────────────────
cap   = cv2.VideoCapture("zebra.mp4")
yolo  = YOLO("yolov8n.pt")
sent  = []
f_idx = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    f_idx += 1
    res = yolo(frame, verbose=False)[0]
    objs = {yolo.model.names[int(c)] for c in res.boxes.cls}
    sent.append(f"Frame {f_idx}: " + (", ".join(sorted(objs)) if objs else "no objects detected"))

cap.release()

# ── build doc from detections ──────────────────────────────────────────────────
doc    = ". ".join(sent) + "."
chunks = [s.strip() + "." for s in doc.split(". ") if s]
chunk_embs = []

for ch in chunks:
    ids  = tokenizer(ch, return_tensors="pt").input_ids.to(device)
    loss = mem_tx(ids, return_loss=True)
    loss.backward()

    with torch.no_grad():
        emb = mem_tx.token_emb(ids).mean(dim=1).squeeze(0)
        chunk_embs.append(emb)


In [7]:
# ── document ────────────────────────────────────────────────────────────
doc = "Artificial intelligence (AI) is rapidly transforming the modern world. From personalized recommendations on streaming platforms to autonomous vehicles navigating complex environments, AI technologies are embedded into the fabric of daily life. Businesses are increasingly investing in AI to streamline operations, gain insights from data, and improve customer experiences. This shift is not only technological but also cultural, as organizations adapt to new ways of thinking and working. A key driver of this transformation is machine learning (ML), which allows systems to learn from data and improve over time without explicit programming. Among the most impactful ML techniques are deep learning and reinforcement learning. These methods have led to breakthroughs in fields such as computer vision, natural language processing, and robotics. However, they also introduce new challenges, such as the need for vast amounts of training data and computational resources. Ethical concerns surrounding AI are also growing. Issues such as algorithmic bias, lack of transparency, and potential job displacement are prompting discussions among policymakers, researchers, and the public. Efforts are underway to create regulatory frameworks and ethical guidelines to ensure AI is developed and deployed responsibly. Transparency, fairness, and accountability are emerging as key principles in the field. Despite the challenges, the future of AI holds great promise. Innovations in hardware, such as neuromorphic chips, and advances in software, like transformer-based models, are pushing the boundaries of what AI systems can achieve. As interdisciplinary collaboration continues to grow, AI is likely to become even more integral to solving complex global problems in healthcare, climate science, education, and beyond."
chunks = [s.strip() + "." for s in doc.split(". ") if s]
chunk_embs = []

for ch in chunks:
    ids  = tokenizer(ch, return_tensors="pt").input_ids.to(device)
    loss = mem_tx(ids, return_loss=True)
    loss.backward()

    with torch.no_grad():
        emb = mem_tx.token_emb(ids).mean(dim=1).squeeze(0)   # (dim,)
        chunk_embs.append(emb)


In [4]:
from torch.nn.functional import cosine_similarity

def summarise(question: str,
              *,
              k: int = 3,
              max_new_tokens: int = 128,
              temperature: float = 0.7) -> str:
    """Retrieve top-k relevant chunks and let LLaMA answer."""
    # ---- embed the question in the same space -------------------------------
    q_ids = tokenizer(question,
                      add_special_tokens=False,
                      return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        q_emb = mem_tx.token_emb(q_ids).mean(dim=1)          # (1, dim)

        # ---- similarity against all chunk embeddings ------------------------
        sims = torch.stack([
            cosine_similarity(q_emb, emb.unsqueeze(0), dim=-1)[0]
            for emb in chunk_embs
        ])                                                    # (num_chunks,)

        topk = sims.topk(min(k, len(chunks))).indices.tolist()

    # ---- build text prompt for LLaMA ----------------------------------------
    retrieved = "\n\n".join(chunks[i] for i in topk)
    prompt = f"""### Context
{retrieved}

### Question
{question}

### Answer
"""

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        out_ids = llm.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
        )

    completion = out_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()

print(summarise("what animal is this?"))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


a monkey. 

Frame 301: no objects detected.

Frame 171: no objects detected.

Frame 265: bench.

### Explanation
In Frame 301, no objects are detected, which means the environment is empty. The question is then asked about the animal that this environment contains. Since no objects are detected, the answer is not based on the objects themselves, but rather on the fact that the environment is empty. In this case, the answer is a monkey, as monkeys are typically found in trees or other elevated environments.


In [None]:
import textwrap

captions = []
for end in range(60, len(sent) + 1, 60):
    # build context from frames 1..end
    context = ". ".join(sent[:end]) + "."
    prompt = f"""### Context
{context}

### Caption
"""
    # tokenize and generate
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        out_ids = llm.generate(
            input_ids,
            max_new_tokens=50,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )
    # decode and store the caption
    caption = tokenizer.decode(out_ids[0][input_ids.shape[1]:], skip_special_tokens=True).strip()
    captions.append(caption)

print(captions)

# reopen video and prepare writer
cap2 = cv2.VideoCapture("zebra.mp4")
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
fps   = cap2.get(cv2.CAP_PROP_FPS)
w     = int(cap2.get(cv2.CAP_PROP_FRAME_WIDTH))
h     = int(cap2.get(cv2.CAP_PROP_FRAME_HEIGHT))
out   = cv2.VideoWriter("zebra_with_captions.mp4", fourcc, fps, (w, h))

frame_idx = 0
while True:
    ret, frame = cap2.read()
    if not ret:
        break

    # pick caption based on which 60-frame segment
    seg = min(frame_idx // 100, len(captions) - 1)
    text = captions[seg]

    # wrap the caption to fit the frame width
    max_chars_per_line = 20 
    wrapped_lines = textwrap.wrap(text, width=max_chars_per_line)

    # overlay each line at the bottom, stacking upwards
    line_height = 45  # vertical space per line
    for i, line in enumerate(reversed(wrapped_lines)):
        y = h - 20 - i * line_height
        cv2.putText(
            frame,
            line,
            (10, y),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.5,
            (255, 255, 255),
            2,
            cv2.LINE_AA
        )

    out.write(frame)
    frame_idx += 1

cap2.release()
out.release()

['A simple stop sign with a black and white design. The sign has a circular frame with a white background. The frame contains a black and white image of a zebra. The image is centered and the zebra is shown in a simple pose.', '"Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop sign and zebra. Stop', '"Frame 1: stop sign, zebra. Frame 2: stop sign, zebra. Frame 3: stop sign, zebra. Frame 4: stop sign, zebra. Frame 5: stop sign, zebra', 'A person is standing next to a stop sign with a zebra on it. The stop sign is flanked by two zebra symbols. The caption reads "A classic combination: a person and a stop sign with a zebra on it. A', '"Stop signs and zebras are the best friends anyone could ask for. They\'re always together, and they\'re always on the same team. They may not be able to speak to each other, but they can certainly talk to us."', 'The image is of a stop sign with a zebra. T