# Import libraries

In [None]:
!pip install bitsandbytes>=0.39.0 accelerate>=0.20.0
!pip install transformers

In [None]:
import inspect
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Instantiate model

In [None]:
# Load the BLIP-2 processor and model, following the model card at
# https://huggingface.co/Salesforce/blip2-flan-t5-xxl
BLIP2_CHECKPOINT = "Salesforce/blip2-flan-t5-xxl"

# The processor prepares the (image, text) inputs and decodes generated ids.
processor = Blip2Processor.from_pretrained(BLIP2_CHECKPOINT)

# device_map="auto" and load_in_8bit=True rely on the accelerate and
# bitsandbytes packages installed in the first cell.
model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    device_map="auto",
    load_in_8bit=True,
)

# Mount drive

In [None]:
# Mount Google Drive at /content/drive; every dataset and output path used
# below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store inferences

In [None]:
# Create the output directory for prediction CSVs (including missing parents).
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs('/content/drive/MyDrive/stance_detection_datasets/inferences/predictions', exist_ok=True)

# Import datasets

## COVID-19

In [None]:
# Load the COVID-19 test split with captions (presumably generated by
# Qwen-VL-Chat, per the file name).
constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat = pd.read_csv('/content/drive/MyDrive/stance_detection_datasets/inferences/constrain22_dataset_covid19_test_captioned_Qwen-VL-Chat.csv')

In [None]:
# Load the COVID-19 test split with grounding information (presumably produced
# by Qwen-VL-Chat, per the file name).
constraint22_dataset_covid19_test_grounded_Qwen_VL_Chat = pd.read_csv('/content/drive/MyDrive/stance_detection_datasets/inferences/constrain22_dataset_covid19_test_grounded_Qwen-VL-Chat.csv')

In [None]:
# Place the captioned and grounded frames side by side (aligned on the row
# index), then remove repeated columns: transpose, drop duplicate rows (i.e.
# columns whose values are identical to an earlier column), transpose back.
# NOTE(review): the duplicate check compares column *values*, not column names,
# and the round trip through .T may leave every column with object dtype —
# confirm this is acceptable for downstream use.
constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat = pd.concat([constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat, constraint22_dataset_covid19_test_grounded_Qwen_VL_Chat], axis=1).T.drop_duplicates().T

In [None]:
# Discard every row that has a missing value in any column, then renumber the
# index from 0 so positional and label-based access agree.
constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat = constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat.dropna().reset_index(drop=True)

# Set prompts and define a function to call the model

In [None]:
def prompt_with_caption(entity, caption):
    """Build the role-classification prompt for one entity of a meme.

    The prompt quotes the meme description (delimited by triple double quotes),
    asks for the role of ``entity`` and lists the four allowed answers.

    Args:
        entity: Name of the entity whose role is being asked about.
        caption: Textual description of the meme; may span several lines.

    Returns:
        The prompt as a single string with no leading indentation on any
        template line.
    """
    # De-indent the template BEFORE substituting the values. Cleaning after
    # interpolation breaks when the caption contains a newline: its
    # continuation lines have zero indent, so the common margin becomes 0 and
    # every template line keeps its 4-space source indentation.
    template = inspect.cleandoc("""
    Description of this meme: \"\"\"
    {caption}
    \"\"\"
    What is the role of {entity} in this meme?
    hero: presented in a positive light.
    villain: portrayed negatively, e.g., in an association with adverse traits like wickedness, cruelty, hypocrisy, etc.
    victim: portrayed as suffering the negative impact of someone else’s actions.
    other: not a hero, a villain, or a victim.
    Constraint: Without using any other words, answer either hero, villain, victim, other.""")
    return template.format(entity=entity, caption=caption)

In [None]:
# Use BLIP-2 for the inference. via
# https://huggingface.co/Salesforce/blip2-flan-t5-xxl
def get_prediction(image, prompt):
    """Run BLIP-2 on one image/prompt pair and return the decoded answer.

    Args:
        image: Path (or file object) of the image, as accepted by
            ``PIL.Image.open``.
        prompt: Text prompt passed to the processor together with the image.

    Returns:
        The generated text with special tokens removed.
    """
    # Open the file in a context manager so its handle is released as soon as
    # the pixel data has been copied; this function is called once per dataset
    # row, so unclosed handles would otherwise accumulate. convert() returns a
    # new image, which stays usable after the file is closed.
    with Image.open(image) as img:
        raw_image = img.convert('RGB')
    inputs = processor(raw_image, prompt, return_tensors="pt").to("cuda")
    # Cap the answer length; only a single role word is expected.
    out = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(out[0], skip_special_tokens=True)

# Define function to clean responses

In [None]:
def remap(x):
    """Normalise a free-text model answer to one of the four role labels.

    The answer is lower-cased. An exact label is returned as is. Otherwise a
    label is returned only when it is the sole label mentioned in the text and
    is not negated ("not <label>" / "not a <label>" / "not an other").

    Args:
        x: Raw answer string.

    Returns:
        'hero', 'villain', 'victim' or 'other', or None when the answer is
        ambiguous, negated, or mentions no label.
    """
    roles = ('hero', 'villain', 'victim', 'other')
    answer = x.lower()

    # Fast path: the answer is exactly one of the labels.
    if answer in roles:
        return answer

    for role in roles:
        if role not in answer:
            continue
        # Any competing label makes the answer ambiguous.
        if any(rival in answer for rival in roles if rival != role):
            continue
        # Reject explicit negations of the label.
        article = 'an' if role == 'other' else 'a'
        if f'not {role}' in answer or f'not {article} {role}' in answer:
            continue
        return role

    return None

# Call the `get_prediction` function and save inferences

GROUNDING [PRESENT] & PROMPT [CAPTION] & CAPTION [Qwen_VL_Chat]

In [None]:
# Run BLIP-2 on every row of the merged COVID-19 test set (grounded image +
# caption prompt), save the raw answers, then clean and score them.

# Work on a deep copy so the merged source frame is left untouched.
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat = constraint22_dataset_covid19_test_captioned_Qwen_VL_Chat.copy(deep=True)
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_images = covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['image_with_grounding'].values
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_entities = covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['entity'].values
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_captions = covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['caption'].values

# One model call per row: build the prompt from (entity, caption) and query the grounded image.
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = [get_prediction(image, prompt_with_caption(entity, caption)) for image, entity, caption in zip(covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_images, covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_entities, covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat_captions)]

# Persist the RAW model answers before any cleaning, so they can be re-parsed later.
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat.to_csv('/content/drive/MyDrive/stance_detection_datasets/inferences/predictions/constraint22_dataset_covid19_test_grounding[PRESENT]_caption[Qwen-VL-Chat]_prompt[CAPTION]_prediction[BLIP_2].csv', index=False)

# Map free-text answers onto the four labels; unparseable answers become None.
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].apply(remap)
print(covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].value_counts())
# Number of answers that could not be mapped to a label.
print(covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].isna().sum())

# Unparseable answers are replaced by a uniformly random label. The generator
# is seeded so the reported metrics are reproducible across runs.
ROLE_LABELS = ['hero', 'villain', 'victim', 'other']
fallback_rng = np.random.default_rng(42)
covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].apply(lambda x: x if x is not None else fallback_rng.choice(ROLE_LABELS))

# Macro-F1 and per-class report against the gold 'role' column.
print(f1_score(covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['role'].values, covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].values, labels=ROLE_LABELS, average='macro'))
print(classification_report(covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['role'].values, covid19_test_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].values, labels=ROLE_LABELS))

other      287
hero       255
villain    129
victim      87
Name: prediction, dtype: int64
0
0.5093047782893898
              precision    recall  f1-score   support

        hero       0.55      0.75      0.64       189
     villain       0.64      0.43      0.51       190
      victim       0.64      0.30      0.41       189
       other       0.40      0.61      0.48       190

    accuracy                           0.52       758
   macro avg       0.56      0.52      0.51       758
weighted avg       0.56      0.52      0.51       758

