In [15]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from transformers import pipeline
from huggingface_hub import login
import random
from datasets import Dataset

import torch

# torch.cuda.empty_cache()

detections_file = "../data/10k_yolo_detections.csv"
samples = 100

detections_data = pd.read_csv(detections_file)
data = detections_data

indices = data.sample(n=samples, random_state = np.random.seed(42)).index.tolist()

# Convert the list of indices to a DataFrame
indices_df = pd.DataFrame(indices, columns=['Index'])

# Save the DataFrame to a CSV file
indices_df.to_csv('indices.csv', index=False)

print("Indices saved to 'indices.csv'")

filtered_df = data.drop(indices)
filtered_indices = filtered_df.index.tolist()

dataset = Dataset.from_pandas(data)

in_context_example_indices = np.random.choice(filtered_indices, 3)

dataset[in_context_example_indices]

Indices saved to 'indices.csv'


{'Image ID': [415885, 543551, 560463],
 'Question ID': [415885005, 543551004, 560463001],
 'Question': ['Is the man holding the tennis ball?',
  'Where is this picture taken?',
  'What color is the brick walkway?'],
 'Answer': ['no', 'australia', 'green'],
 'Plausible answers': ["{'no'}",
  "{'australia', 'sydney', 'seattle', 'by large lake', 'outside', 'bay', 'on shore', 'san francisco', 'by boardwalk'}",
  "{'green', 'jade'}"],
 'Question Type': ['is the man', 'none of the above', 'what color is the'],
 'Answer Type': ['yes/no', 'other', 'other'],
 'Image file': ['../data/validation/images/COCO_val2014_000000415885.jpg',
  '../data/validation/images/COCO_val2014_000000543551.jpg',
  '../data/validation/images/COCO_val2014_000000560463.jpg'],
 'Generated Caption': ['a man in white shorts and a blue shirt is playing tennis',
  'a bus is parked on the side of the road',
  'a man holding a tennis racket'],
 'Generated Detections': ["[{'xmin': 331.1903076171875, 'ymin': 344.2440185546875,

In [34]:
pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf", device='cuda')

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.66s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 60.88 MiB is free. Process 48379 has 56.70 GiB memory in use. Including non-PyTorch memory, this process has 22.37 GiB memory in use. Of the allocated memory 21.96 GiB is allocated by PyTorch, and 5.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
templates = ["Based on the image caption (generated by BLIP model): {caption}, and Object detections (generated by YOLOv5): {detection}, Answer the question '{question}' \n Answer: ",
             "Based on the image caption (generated by BLIP model): {caption} and Object detections (generated by YOLOv5): {detection}. \n In plain simple English language without any emoticons or icons or font colors or punctuation marks. I strongly state do not repeat the question, prompt used, disclaimer, explanantion or anything apart from answer, that is just provide the answer in a single word in lowercase, the question: '{question}'. Remember if you are unable to answer the question based on the caption provided by BLIP, mention 'NA'.\nAnswer: "
    ]

def apply_template(template: str, texts: str):
    templated_text = []

    if isinstance(texts, str):
        texts = [texts]
    
    for text in texts:
        tmpltd_txt = template
        tmpltd_txt = tmpltd_txt.replace("{dialogue}",text)
        templated_text.append(tmpltd_txt)

    return templated_text



def make_prompt(in_context_example_indices, test_example_index, template):
    ### WRITE YOUR CODE HERE 
    icl_captions = dataset[in_context_example_indices]['Generated Caption']
    icl_detections = dataset[in_context_example_indices]['Generated Detections']
    icl_questions = dataset[in_context_example_indices]['Question']
    icl_answers = dataset[in_context_example_indices]['Answer']

    prompt = r""
    for caption, detection, question, answer in zip(icl_captions, icl_detections, icl_questions, icl_answers):
        print(caption, detection, question, answer)
        tmpltd_txt = template
        tmpltd_txt = tmpltd_txt.replace("{caption}",caption)
        tmpltd_txt = tmpltd_txt.replace("{detection}",detection)
        tmpltd_txt = tmpltd_txt.replace("{question}",question)
        prompt += f"\n{tmpltd_txt}\n" + f"{answer}\n\n\n"

    test_caption = dataset[test_example_index]['Generated Caption']
    test_detection = dataset[test_example_index]['Generated Detections']
    test_question = dataset[test_example_index]['Question']
    test_answer = dataset[test_example_index]['Answer']

    tmpltd_txt = template
    tmpltd_txt = tmpltd_txt.replace("{caption}",test_caption)
    tmpltd_txt = tmpltd_txt.replace("{detection}",test_detection)
    tmpltd_txt = tmpltd_txt.replace("{question}",test_question)
    
    prompt += f"\n{tmpltd_txt}\n"

    return prompt

In [None]:
results = []
for each_index in indices:
    in_context_example_indices = np.random.choice(filtered_indices, 3)
    template_prompt = make_prompt(in_context_example_indices, each_index, templates[0])
    generated_txt = pipe(template_prompt, max_new_tokens=3)[0]['generated_text']
    results.append(generated_txt)
    break

In [24]:
templates[0][0]

"Based on the image caption (generated by BLIP model): {caption}, and Object detections (generated by YOLOv5): {detection}, Answer the question '{question}' \n Answer: "