IDEFICS Notebook: https://github.com/huggingface/notebooks/blob/main/examples/idefics/finetune_image_captioning_peft.ipynb

## Install and import necessary libraries

In [None]:
!pip install tqdm
!pip install -q datasets
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q bitsandbytes sentencepiece accelerate loralib
!pip install -q -U git+https://github.com/huggingface/peft.git

In [None]:
import os
import time
from getpass import getpass

import pandas as pd
import torch
import torchvision.transforms as transforms
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from tqdm import tqdm
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig

### Load quantized model

In [3]:
# Huggingface login token - CHANGE TO YOUR TOKEN!
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = "HuggingFaceM4/idefics-9b"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
# The modules listed in `llm_int8_skip_modules` are excluded from quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

# `token=True` reuses the credentials stored by `huggingface-cli login`; it
# replaces the deprecated `use_auth_token` argument and is now passed to both
# loads so they authenticate the same way.
processor = AutoProcessor.from_pretrained(checkpoint, token=True)
# Take-off the quantization_config arg if you want to load the original model
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
    token=True,
)

### Inference

In [5]:
def check_inference(model, processor, prompts, max_new_tokens=50):
    """Generate a completion for ``prompts``, print it and return it.

    Args:
        model: Model exposing ``generate(**inputs, ...)`` and a ``device``
            attribute.
        processor: Processor exposing ``tokenizer``, ``__call__`` (to encode
            ``prompts``) and ``batch_decode``.
        prompts: Prompt passed unchanged to ``processor``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        stripped.
    """
    tokenizer = processor.tokenizer

    # Forbid the image placeholder tokens from appearing in the generated text.
    bad_words = ["<image>", "<fake_token_around_image>"]
    bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    eos_token_id = tokenizer.convert_tokens_to_ids("</s>")

    # Move the inputs to the device the model lives on instead of relying on a
    # notebook-level `device` global that may be stale or undefined.
    inputs = processor(prompts, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=False,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)
    # Return the text as well: callers that collect results would otherwise
    # only ever receive None.
    return generated_text

In [6]:
# Zero-shot prompting check: one image URL followed by the emotion question.
url = "https://uploads7.wikiart.org/images/aaron-siskind/acolman-1-1955.jpg"
prompts = [
    # Optional leading instruction segment, currently disabled:
    # "Instruction: provide an answer to the question. Use the image to answer.\n",
    url,
    (
        "Question: Choose one emotion from: "
        "Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger. "
        "Answer:"
    ),
]
# The expected answer is a single emotion word, so cap generation at 5 tokens.
check_inference(model, processor, prompts, max_new_tokens=5)

Question: Choose one emotion from: Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger. Answer: Fear.


In [7]:
# Same zero-shot emotion question, asked about a second image.
url = "https://uploads1.wikiart.org/images/alexandre-benois/peter-i-on-a-walk-in-the-summer-garden(1).jpg"
prompts = [
    # Optional leading instruction segment, currently disabled:
    # "Instruction: provide an answer to the question. Use the image to answer.\n",
    url,
    (
        "Question: Choose one emotion from: "
        "Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger. "
        "Answer:"
    ),
]
# The expected answer is a single emotion word, so cap generation at 5 tokens.
check_inference(model, processor, prompts, max_new_tokens=5)

Question: Choose one emotion from: Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger. Answer: Sadness.


In [9]:
# Read the preprocessed dataset into a DataFrame.
# Point `file_path` at wherever your copy of the CSV lives.
file_path = "/content/final_data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Run zero-shot inference for every image URL in the `Link` column and time
# the whole pass.
results = []
start_time = time.time()

tqdm.pandas()

instruction = "Instruction: provide an answer to the question. Use the image to answer.\n"
question = "Question: Choose one emotion from: Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger. Answer:"

for url in tqdm(data['Link'], total=data.shape[0]):
    # The URL must be its own prompt element. When it is embedded inside a
    # longer text string the processor treats the whole string as plain text
    # and no image ever reaches the model.
    prompt = [instruction, url, question]
    generated_text = check_inference(model, processor, prompt)
    results.append(generated_text)

end_time = time.time()
elapsed_time = end_time - start_time

In [None]:
# Store one generated answer per row in a new `generated_emotion` column.
data["generated_emotion"] = results