### Baseline model

In [5]:
# !pip uninstall -y torch torchvision torchaudio 
# huggingface transformers

Found existing installation: torch 1.13.1
Uninstalling torch-1.13.1:
  Successfully uninstalled torch-1.13.1
Found existing installation: torchvision 0.14.1
Uninstalling torchvision-0.14.1:
  Successfully uninstalled torchvision-0.14.1
Found existing installation: torchaudio 0.8.1
Uninstalling torchaudio-0.8.1:
  Successfully uninstalled torchaudio-0.8.1


In [16]:
#### CPU setup
# !pip install huggingface transformers torch torchvision

#### GPU setup
!nvcc --version
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu112/torch_stable.html
# !pip install torch==1.13.1 torchvision==0.9.1 torchaudio===0.8.1 -f https://download.pytorch.org/whl/cu121/torch_stable.html
    
# !pip install torch==2.2.1+cu121 torchaudio==2.2.1+cu121 torchdata==0.7.1 torchsummary==1.5.1 torchtext==0.17.1 torchvision==0.17.1+cu121 tornado==6.3.3 tqdm==4.66.2

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement torch==2.2.1+cu121 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1)[0m
[31mERROR: No matching distribution found for torch==2.2.1+cu121[0m


In [3]:
import torch 
from transformers import pipeline  # Use a pipeline as a high-level helper
import os # For all os level functions
import json # For parsing the json files


In [2]:
import torch
# Show the CUDA version recorded in this torch build. The saved output of this
# cell is None, which suggests a CPU-only build — TODO confirm before relying on GPU inference.
print(torch.version.cuda)


None


#### Util functions



In [4]:
### Setup the questions and answers for the image files
def obtain_questions_for_image_ids(image_ids: list, question_file_path: str = "questions.json"):
    """Return the question records that belong to the given images.

    Args:
        image_ids: Image ids to keep; each record's ``image_id`` is matched against them.
        question_file_path: Path to a JSON file whose top-level ``questions`` key
            holds a list of question records.

    Returns:
        list: The matching question records, in the order they appear in the file.
    """
    with open(question_file_path, encoding="utf-8") as file:
        questions_data = json.load(file)
    # A set gives O(1) membership checks instead of rescanning the id list for every record.
    wanted_ids = set(image_ids)
    return [
        question_details
        for question_details in questions_data['questions']
        if question_details['image_id'] in wanted_ids
    ]


def obtain_annotations_for_image_ids(image_ids: list, annotations_file_path: str = "annotations.json"):
    """Return the annotation records that belong to the given images.

    Args:
        image_ids: Image ids to keep; each record's ``image_id`` is matched against them.
        annotations_file_path: Path to a JSON file whose top-level ``annotations``
            key holds a list of annotation records.

    Returns:
        list: The matching annotation records, in the order they appear in the file.
    """
    with open(annotations_file_path, 'r', encoding="utf-8") as file:
        annotations_data = json.load(file)
    # A set gives O(1) membership checks instead of rescanning the id list for every record.
    wanted_ids = set(image_ids)
    return [
        annotation_details
        for annotation_details in annotations_data['annotations']
        if annotation_details['image_id'] in wanted_ids
    ]


def get_refined_results_from_lists(relevant_questions:list, relevant_annotations:list):
    """Join questions with their annotations into flat records.

    Args:
        relevant_questions: Question records; ``question_id`` and ``question`` are read.
        relevant_annotations: Annotation records; ``question_id``, ``image_id``,
            ``multiple_choice_answer``, ``answers`` (a list of dicts with an
            ``answer`` key), ``question_type`` and ``answer_type`` are read.

    Returns:
        list: One dict per (question, matching annotation) pair, ordered by
        question and then by annotation. ``plausible_answers`` is the set of
        distinct ``answer`` values listed in the annotation. Questions without
        a matching annotation produce no record.
    """
    # Index annotations by question id once, so each question is a dict lookup
    # rather than a scan over every annotation.
    annotations_by_question = {}
    for annotation_info in relevant_annotations:
        annotations_by_question.setdefault(annotation_info['question_id'], []).append(annotation_info)

    result = []
    for question_info in relevant_questions:
        q_id = question_info['question_id']
        question = question_info['question']
        for annotation_info in annotations_by_question.get(q_id, ()):
            plausible_answers = {x['answer'] for x in annotation_info['answers']}
            result.append({"question_id": q_id, "question": question,
                           "answer": annotation_info["multiple_choice_answer"],
                           "plausible_answers": plausible_answers,
                           "image_id": annotation_info['image_id'],
                           "question_type": annotation_info['question_type'],
                           "answer_type": annotation_info['answer_type']})

    return result

def get_results(image_ids: list, folder: str):
    """Load the annotations and questions stored under ``folder`` for the given
    image ids and merge them into refined per-question records."""
    annotations_path = f"{folder}/annotations.json"
    questions_path = f"{folder}/questions.json"
    matched_annotations = obtain_annotations_for_image_ids(image_ids, annotations_file_path=annotations_path)
    matched_questions = obtain_questions_for_image_ids(image_ids, question_file_path=questions_path)
    return get_refined_results_from_lists(
        relevant_questions=matched_questions,
        relevant_annotations=matched_annotations,
    )



In [5]:
validation_folder = "../data/validation"
validation_images_folder = f"{validation_folder}/images"

# os.listdir returns entries in arbitrary order; sort the names so the sample
# below is the same on every run and on every machine.
validation_image_files = [
    os.path.join(validation_images_folder, file_name)
    for file_name in sorted(os.listdir(validation_images_folder))
]

# Small fixed sample used for the quick experiments in this notebook.
sample_validation_image_files = validation_image_files[:10]

#### Setup for analysis

In [6]:
image_files = sample_validation_image_files
# The image id is parsed from the file name: the text before the first "." is
# split on "_" and its last piece is converted to int.
image_ids = [int(os.path.basename(image_file).split(".")[0].split("_")[-1]) for image_file in image_files]

relevant_annotations = obtain_annotations_for_image_ids(image_ids, annotations_file_path=f"{validation_folder}/annotations.json")
relevant_questions = obtain_questions_for_image_ids(image_ids, question_file_path=f"{validation_folder}/questions.json")

# Merge the lists loaded above directly; get_results() would read and parse
# the same two JSON files a second time to produce the same records.
results = get_refined_results_from_lists(relevant_questions=relevant_questions, relevant_annotations=relevant_annotations)
print(image_ids)

[42, 73, 74, 133, 136, 139, 143, 164, 192, 196]


### Setup the baseline performance


In [7]:
## Obtain the captions for these image files
from transformers import pipeline
# Build an image-to-text pipeline from the BLIP large image-captioning checkpoint.
blipPipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
# Caption all sampled image files in one call. The following cell reads each
# entry as entry[0]['generated_text'], so each entry is presumably a list of
# candidate captions for one input file.
generated_captions = blipPipe(image_files)


#     generated_captions = [{"image_caption": caption, "image_id": image_id, "image_file": image_file.split(os.sep)[-1]} 




In [8]:
### Pair each generated caption with its image_id and image_file
captions = [candidates[0]['generated_text'] for candidates in generated_captions]

# image_id -> (image_file, caption); the three lists are matched by position.
lookup = {
    image_id: (image_file, caption)
    for caption, image_id, image_file in zip(captions, image_ids, image_files)
}

In [10]:
### Hugging Face token for running the application

import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: anyone who can read the file
# (or its git history) can use it. Any token that was previously committed
# here must be treated as leaked and revoked in the Hugging Face settings.
# Read the token from the HF_TOKEN environment variable, or prompt for it
# without echoing it to the screen.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(token=token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/samavedam.m/.cache/huggingface/token
Login successful


In [11]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-generation pipeline for the Llama-2 7B chat checkpoint; the cells below
# call it with a single prompt string per question.
llamaPipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

HBox(children=(FloatProgress(value=0.0, description='Loading checkpoint shards', max=2.0, style=ProgressStyle(…




Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [12]:
### Zero Shot Performance
import tqdm


In [14]:
# Zero-shot pass: ask the language model each question, giving it only the
# caption generated for the corresponding image as context.
answers = []
for instance in results:
    img_id, question = instance['image_id'], instance['question']
    file, caption = lookup[img_id]
    prompt = f"Based on the image caption (generated by BLIP model): {caption}, Answer the question '{question}' \n Answer: "
    gen_text = llamaPipe(prompt)
    print(f"Generated text: {gen_text}")
    answers += [gen_text]
# llamaPipe("What are the benefits of being vegetarian? Write a brief summary around the advantages and disadvantages of being vegetarian")

Generated text: [{'generated_text': "Based on the image caption (generated by BLIP model): there is a dog that is laying on a rack with shoes, Answer the question 'What color are the gym shoes?' \n Answer: 🔵 blue.\n\nNote: The image caption is generated by BLIP model, which is a text-to-image model that can generate images based on textual descriptions. In this case, the BLIP model generated the image caption 'There is a dog that is laying on a rack with shoes' based on the input text 'There is a dog that is laying on a rack with shoes.'"}]
Generated text: [{'generated_text': "Based on the image caption (generated by BLIP model): there is a dog that is laying on a rack with shoes, Answer the question 'Is there a red sandal here?' \n Answer:  Yes."}]
Generated text: [{'generated_text': "Based on the image caption (generated by BLIP model): there is a dog that is laying on a rack with shoes, Answer the question 'What color is the flip flop?' \n Answer:  Brown.\n\nExplanation:  Based on t

KeyboardInterrupt: 

In [None]:
answers

In [14]:
# for x in relevant_annotations:
#     print(f"multiple choice answer: {x['multiple_choice_answer']}")


In [15]:
# Second pass: same questions, but the prompt asks for a bare single-word,
# lowercase answer and 'NA' when the caption does not contain the answer.
prompt_answers1 = []
for instance in results:
    img_id, question = instance['image_id'], instance['question']
    file, caption = lookup[img_id]
    prompt = f"Based on the image caption (generated by BLIP model): {caption}. \n In plain simple English language without any emoticons or icons or font colors or punctuation marks. I strongly state do not repeat the question, prompt used, disclaimer, or anything apart from answer, that is just provide the answer in a single word in lowercase, the question: '{question}'. Remember if you are unable to answer the question based on the caption provided by BLIP, mention 'NA'.\nAnswer: "
    prompt_answers1.append(llamaPipe(prompt))
# llamaPipe("What are the benefits of being vegetarian? Write a brief summary around the advantages and disadvantages of being vegetarian")

In [None]:
results[0]['answer']

In [None]:
prompt_answers1

In [None]:
#### Configuration to generate only 3 tokens at max.

# Upper bound on newly generated tokens per answer.
MAX_NEW_TOKENS = 3

prompt_with_generation_cfg_answers = []
for instance in results:
    img_id = instance['image_id']
    question = instance['question']
    file, caption = lookup[img_id]
    prompt = f"Based on the image caption (generated by BLIP model): {caption}. \n In plain simple English language without any emoticons or icons or font colors or punctuation marks, Answer in a single word the question: '{question}'. Remember if you are unable to answer the question based on the caption provided by BLIP, mention 'NA'.\nAnswer: "
    # Pass the limit explicitly: without it this cell ran with the pipeline's
    # default generation settings, despite what the heading says.
    gen_text = llamaPipe(prompt, max_new_tokens=MAX_NEW_TOKENS)
    prompt_with_generation_cfg_answers.append(gen_text)
# llamaPipe("What are the benefits of being vegetarian? Write a brief summary around the advantages and disadvantages of being vegetarian")

In [None]:
prompt_with_generation_cfg_answers

In [None]:
#### ToDo 
### Write a function to evaluate the performance of the model as accuracy

### Experiment with prompt to provide a single word answer.

### 