In [None]:
import os
import tqdm
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

import numpy as np
import torch
import umap
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from internvl import InternVL
from clip_retrieval import RAGWithCLIP

# Retrieval encoder used to embed frames and questions.
# Alternative checkpoints (commented out; swap in by replacing the call below):
#   clip = RAGWithCLIP("openai/clip-vit-base-patch32", source='huggingface')
#   clip = RAGWithCLIP("openai/clip-vit-large-patch14-336", source='huggingface')
#   clip = RAGWithCLIP("RN50/openai", source='open_clip')
#   clip = RAGWithCLIP("ViT-B-16/dfn2b", source='open_clip')
clip = RAGWithCLIP(
    "ViT-H-14-378-quickgelu/dfn5b",
    source='open_clip',
)

# Vision-language model that answers the question from the retrieved frames.
# Alternative checkpoints (commented out):
#   vlm = InternVL('OpenGVLab/InternVL2-1B')
#   vlm = InternVL('OpenGVLab/InternVL2-8B')
vlm = InternVL('OpenGVLab/InternVL2-40B')


In [None]:
def get_reduced_embeddings(frame_features):
    """Reduce frame feature vectors to a low-dimensional embedding with UMAP.

    Args:
        frame_features: A ``torch.Tensor`` or any array-like accepted by
            ``np.asarray``. Presumably one row per frame -- TODO confirm
            against what ``clip.frame_features`` holds.

    Returns:
        The result of ``umap.UMAP().fit_transform`` on the features.
    """
    if isinstance(frame_features, torch.Tensor):
        # detach() lets tensors that still track gradients be converted.
        features = frame_features.detach().cpu().numpy()
    else:
        # Non-tensor input used to leave `features` unbound and crash.
        features = np.asarray(frame_features)

    reducer = umap.UMAP()
    # reducer = TSNE()
    return reducer.fit_transform(features)

def _draw_trajectory(embedding):
    """Join consecutive embedding points with faint segments on the current axes."""
    for i in tqdm.tqdm(range(len(embedding) - 1)):
        plt.plot(embedding[i:i+2, 0], embedding[i:i+2, 1], c='black', alpha=0.2, linewidth=1)


def plot_semantic_map(embedding, scores, top_k=5):
    """Show the embedded frames twice: colored by frame index and by score.

    Args:
        embedding: 2-D array indexed as ``embedding[:, 0]`` / ``embedding[:, 1]``
            for the x / y coordinates, one row per frame.
        scores: NumPy array with one score per row of ``embedding``
            (it must support ``argsort``).
        top_k: How many of the highest-scoring frames to highlight in red
            in the right-hand panel. Defaults to 5; 0 highlights nothing.
    """
    plt.figure(figsize=(20, 3))

    # Left panel: color encodes the position of the frame in the sequence.
    plt.subplot(1, 2, 1)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=np.arange(len(embedding)), cmap='viridis', s=50, alpha=0.2)
    _draw_trajectory(embedding)
    plt.title('Frame features colored by frame index')
    plt.axis('off')

    # Right panel: color encodes the score of each frame.
    plt.subplot(1, 2, 2)

    # Mark the points with the top-k highest scores. The start index is
    # clamped so that top_k=0 selects nothing (a plain [-0:] slice would
    # select everything) and top_k > len(scores) selects all points.
    order = scores.argsort()
    top_indices = order[max(len(order) - top_k, 0):]
    plt.scatter(embedding[top_indices, 0], embedding[top_indices, 1], c='red', s=100, alpha=1)

    plt.scatter(embedding[:, 0], embedding[:, 1], c=scores, cmap='viridis', s=50, alpha=0.2)
    _draw_trajectory(embedding)

    plt.title('Frame features colored by CLIP similarity')
    plt.axis('off')
    plt.show()


In [None]:
import os
import json
import glob
import argparse
from pathlib import Path
import sys
import numpy as np

# Number of retrieved frames passed to the VLM for each question.
topk = 5

# Benchmark file to evaluate; get_frames / get_video_path branch on this path.
dataset = "../data/open-eqa-v0.json"
# dataset = "../data/caree-eqa-v0.json"

def get_frames(episode_history):
    """Return the sorted RGB frame paths of an episode.

    The lookup depends on the module-level ``dataset`` path.

    Args:
        episode_history: Episode identifier, interpolated into the glob pattern.

    Returns:
        Sorted list of paths matching ``*-rgb.png`` in the episode's frame
        directory; empty when nothing matches.

    Raises:
        AssertionError: If ``dataset`` is not one of the known benchmarks
            (same error as ``get_video_path``; this case used to fail with
            an unbound-variable error).
    """
    if "open-eqa-v0" in dataset:
        frames = sorted(glob.glob(f"/share/open-eqa/frames/{episode_history}/*-rgb.png"))
    elif "caree-eqa-v0" in dataset:
        # NOTE(review): this branch globs the open-eqa frame directory, while
        # get_video_path reads caree videos from /share/care-e -- confirm the
        # intended location of the caree frames.
        frames = sorted(glob.glob(f"/share/open-eqa/frames/{episode_history}/*-rgb.png"))
    else:
        raise AssertionError("Unknown dataset")

    return frames


def get_video_path(episode_history):
    """Return the video file path of an episode.

    The layout depends on the module-level ``dataset`` path.

    Args:
        episode_history: Episode identifier as a string or path-like object,
            e.g. ``"caree-v0/caree-1"`` for the caree benchmark.

    Returns:
        The video path as a string.

    Raises:
        AssertionError: If ``dataset`` is not one of the known benchmarks.
    """
    # Convert Path to string so the split below works for path-like input too.
    episode = str(episode_history)

    if "open-eqa-v0" in dataset:
        return f"/share/open-eqa/videos/{episode}-0.mp4"
    if "caree-eqa-v0" in dataset:
        #  "episode_history": "caree-v0/caree-1"  ->  only the last component is used
        return f"/share/care-e/testing_videos/{episode.split('/')[-1]}.mp4"

    # Explicit raise instead of `assert False`, which is removed under `python -O`.
    raise AssertionError("Unknown dataset")

# Load the benchmark and keep only the first 50 items.
# The context manager closes the file handle once the JSON is parsed.
with open(dataset) as dataset_file:
    eqa_data = json.load(dataset_file)[:50]

last_episode_history = None
for item in eqa_data:
    q = item["question"]
    g = item["answer"]

    episode_history = item["episode_history"]

    # Frames are re-encoded only when the episode differs from the previous
    # item, so consecutive questions about the same episode reuse the features.
    if last_episode_history != episode_history:
        frames = get_frames(episode_history)
        print(f"Encoding {len(frames)} frames for {episode_history}")
        clip.encode_frames(frames, batch_size=8)
        last_episode_history = episode_history
        # embedding = get_reduced_embeddings(clip.frame_features)

    for k, v in item.items():
        print(f"{k:20}\t{v}")

    # Similarity between every encoded frame and the question; only consumed
    # by the (commented-out) semantic-map plot below.
    text_features = clip.get_text_features([q])
    scores = ((clip.frame_features @ text_features.T).T).cpu().numpy().squeeze()
    # plot_semantic_map(embedding, scores)

    # Episodes whose id contains 'hm3d' pass window=0, sigma=0; all others use
    # window=15, sigma=4. NOTE(review): presumably temporal smoothing settings
    # of clip.search -- confirm against RAGWithCLIP.
    (window, sigma) = (0, 0) if 'hm3d' in episode_history else (15, 4)
    image_paths = clip.search(q, top_k=topk, window=window, sigma=sigma, do_visualization=True)

    # One "<image>" placeholder per retrieved frame.
    image_tokens = "<image>" * len(image_paths)
    prompt = f"""
You are an intelligent question answering agent. I will ask you questions about an indoor space and you must provide an answer.
You will be shown a set of images that have been collected from a single location.
Given a user query, you must output direct and concise one-sentence answer to the question asked by the user.
Images: {image_tokens}
User Query: {q}"""

    answer, _ = vlm(prompt=prompt, image_paths=image_paths)
    print(f'Question:    \t{q}')
    print(f'Ground truth:\t{g}')
    print(f'Prediction:  \t{answer}')
    print('= '*128)