# Import libraries

In [None]:
!pip install bitsandbytes>=0.39.0 accelerate>=0.20.0
!pip install transformers

In [None]:
import inspect
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Instantiate model

In [None]:
# Load the BLIP-2 processor and model weights, following the model card at
# https://huggingface.co/Salesforce/blip2-flan-t5-xxl
processor = Blip2Processor.from_pretrained(
    "Salesforce/blip2-flan-t5-xxl"
)
# The model is requested in 8-bit with automatic device placement.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xxl",
    load_in_8bit=True,
    device_map="auto",
)

# Mount drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used in
# this notebook all live under that mount point.
# NOTE(review): this import only resolves inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store inferences

In [None]:
# Make sure the folder that receives the prediction CSVs exists;
# exist_ok=True turns this into a no-op when it is already there.
os.makedirs(
    '/content/drive/MyDrive/stance_detection_datasets/inferences/predictions',
    exist_ok=True,
)

# Import datasets

## DISARM

In [None]:
# Read the captioned DISARM test CSV from Drive.
DISARM_test_all_captioned_Qwen_VL_Chat = pd.read_csv(
    '/content/drive/MyDrive/stance_detection_datasets/inferences/DISARM_test_all_captioned_Qwen-VL-Chat.csv'
)

In [None]:
# Read the grounded DISARM test CSV from Drive.
DISARM_test_all_grounded_Qwen_VL_Chat = pd.read_csv(
    '/content/drive/MyDrive/stance_detection_datasets/inferences/DISARM_test_all_grounded_Qwen-VL-Chat.csv'
)

In [None]:
# Place the captioned and grounded tables side by side, then keep only the
# first copy of every column whose contents are duplicated.
# The previous `.T.drop_duplicates().T` round-trip selected the same columns but
# coerced every column to `object` dtype; masking the columns keeps their dtypes.
_DISARM_side_by_side = pd.concat(
    [DISARM_test_all_captioned_Qwen_VL_Chat, DISARM_test_all_grounded_Qwen_VL_Chat],
    axis=1,
)
# Use a plain boolean array so that repeated column labels cannot trigger
# label alignment errors during selection.
_DISARM_first_occurrence = ~_DISARM_side_by_side.T.duplicated().to_numpy()
DISARM_test_all_captioned_Qwen_VL_Chat = _DISARM_side_by_side.loc[:, _DISARM_first_occurrence].copy()

# Set prompts and define a function to call the model

In [None]:
def prompt_with_caption(entity, caption):
    """Build the yes/no harmfulness prompt for one meme.

    Args:
        entity: The target entity the question asks about.
        caption: Textual description of the meme; may span several lines.

    Returns:
        The prompt string with no leading indentation on any template line.
    """
    # Dedent the template BEFORE substituting values. If the caption were
    # interpolated first, a multi-line caption would contribute zero-indent
    # lines, cleandoc would compute a common margin of 0, and every other line
    # of the prompt would keep its 4-space source indentation.
    template = inspect.cleandoc("""
    Description of this meme: \"\"\"
    {caption}
    \"\"\"
    Does this meme harmfully target {entity}?
    yes: a social entity is subjected to some form of ill-treatment such as mental abuse, psycho-physiological injury, proprietary damage, emotional disturbance, or public image damage, based on their background (bias, social background, educational background, etc.) by a meme author.
    no: any benign mention (or depiction) of a social entity via humour, limerick, harmless pun or any content that does not cause distress.
    Constraint: Without using any other words, answer either yes or no.""")
    return template.format(entity=entity, caption=caption)

In [None]:
# Run one BLIP-2 inference. Adapted from the model card at
# https://huggingface.co/Salesforce/blip2-flan-t5-xxl
def get_prediction(image, prompt):
    """Generate the model's text answer for a single image/prompt pair.

    Args:
        image: Anything ``PIL.Image.open`` accepts (here: an image file path).
        prompt: Text prompt handed to the processor together with the image.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # Image.open leaves the underlying file open; the context manager releases
    # it as soon as the independent RGB copy has been made, so a long run over
    # many images does not accumulate open file handles.
    with Image.open(image) as source_image:
        raw_image = source_image.convert('RGB')
    inputs = processor(raw_image, prompt, return_tensors="pt").to("cuda")
    # Short answers are expected (yes/no), so generation is capped at 30 new tokens.
    out = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(out[0], skip_special_tokens=True)

# Define function to clean responses

In [None]:
def remap(x):
    """Map a free-text model answer to a class label.

    Args:
        x: Raw model output. Non-string values (e.g. None or NaN) are
            treated as unparseable instead of raising.

    Returns:
        'harmful' if the answer contains the word "yes" but not "no",
        'not harmful' if it contains the word "no" but not "yes",
        otherwise None (ambiguous or unparseable).
    """
    if not isinstance(x, str):
        return None
    # Match whole words only: substring checks would read "no" inside words
    # such as "unknown" or "not" and mislabel the answer.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in x.lower()).split())
    says_yes = 'yes' in words
    says_no = 'no' in words
    if says_yes and not says_no:
        return 'harmful'
    if says_no and not says_yes:
        return 'not harmful'
    return None

# Call the `get_prediction` function and save inferences

GROUNDING [PRESENT] & PROMPT [CAPTION] & CAPTION [Qwen_VL_Chat]

In [None]:
# Work on a deep copy so the merged source table is left untouched.
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat = DISARM_test_all_captioned_Qwen_VL_Chat.copy(deep=True)
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_images = DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['image_with_grounding'].values
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_entities = DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['target'].values
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_captions = DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['caption'].values

# One model call per row: grounded image + caption-based prompt for the row's target entity.
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = [
    get_prediction(image, prompt_with_caption(entity, caption))
    for image, entity, caption in zip(
        DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_images,
        DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_entities,
        DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat_captions,
    )
]

# Persist the RAW model answers before they are normalised to class labels.
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat.to_csv('/content/drive/MyDrive/stance_detection_datasets/inferences/predictions/DISARM_test_all_grounding[PRESENT]_caption[Qwen-VL-Chat]_prompt[CAPTION]_prediction[BLIP-2].csv', index=False)

# Normalise answers to 'harmful' / 'not harmful' (None when unparseable),
# then report the label distribution and the number of unparseable answers.
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].apply(remap)
print(DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].value_counts())
print(DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].isna().sum())

# Unparseable answers fall back to a random label. The generator is seeded so
# the reported metrics are reproducible across runs (the global, unseeded
# np.random state previously made them vary whenever a fallback occurred).
_fallback_rng = np.random.default_rng(42)
DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'] = DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].apply(
    lambda x: x if pd.notna(x) else _fallback_rng.choice(['harmful', 'not harmful'])
)

# Macro-F1 and the per-class report against the gold 'labels' column.
print(f1_score(DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['labels'].values, DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].values, labels=['harmful', 'not harmful'], average='macro'))
print(classification_report(DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['labels'].values, DISARM_test_all_grounding_present_prompt_with_caption_Qwen_VL_Chat['prediction'].values, labels=['harmful', 'not harmful']))

harmful        396
not harmful    216
Name: prediction, dtype: int64
0
0.5332909058988764
              precision    recall  f1-score   support

     harmful       0.55      0.69      0.61       316
 not harmful       0.54      0.40      0.46       296

    accuracy                           0.55       612
   macro avg       0.54      0.54      0.53       612
weighted avg       0.54      0.55      0.54       612

