In [None]:
import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

checkpoint = "HuggingFaceM4/idefics-9b-instruct"

# Load the weights in bfloat16, then move the model onto the selected device.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/99.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/7.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def check_inference(model, processor, prompts, max_new_tokens=10):
    """Generate a completion for ``prompts`` and return the first decoded sequence.

    Args:
        model: Object exposing ``generate(**kwargs)`` and a ``device`` attribute.
        processor: Callable on ``prompts`` (with ``return_tensors="pt"``); must also
            expose ``tokenizer`` and ``batch_decode``.
        prompts: Batch of prompts in whatever format ``processor`` accepts.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The first element of ``processor.batch_decode(...)`` for the generated ids,
        decoded with ``skip_special_tokens=True``.
    """
    tokenizer = processor.tokenizer

    # These placeholder tokens are passed as bad_words_ids so generation cannot emit them.
    # The ids are always computed: the previous `if len(bad_words) > 0` guard would have
    # left `bad_words_ids` unbound (NameError at generate) had the list ever been empty.
    bad_words = ["<image>", "<fake_token_around_image>"]
    bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    eos_token_id = tokenizer.convert_tokens_to_ids("</s>")

    # Put the inputs on the model's own device instead of depending on a notebook-level
    # `device` global that may be missing or stale.
    inputs = processor(prompts, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

## Run Inference on 1k_batch

In [None]:
import pandas as pd
from tqdm import tqdm
import time
import numpy as np
import re

In [None]:
# Mount Google Drive under /content/drive so the dataset path used below resolves.
# force_remount=True is passed explicitly; presumably it re-establishes an existing
# mount instead of reusing it — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load Sample Dataset
data = '/content/drive/My Drive/EC_Project/data/1k_batch.csv'

# A fixed seed keeps the evaluated subset identical across kernel restarts; without it
# every run scores a different random 100 rows and the metrics are not comparable.
SAMPLE_SEED = 42
SAMPLE_SIZE = 100

sample_df = (
    pd.read_csv(data)
    .sample(frac=1, random_state=SAMPLE_SEED)  # shuffle the batch reproducibly
    .head(SAMPLE_SIZE)
    .reset_index(drop=True)  # 0..n-1 index so list-based column assignment lines up
)

In [None]:
# Run one-shot inference for every (utterance, image URL) pair in the sample.
results = []
start_time = time.time()

# Registers `progress_apply` on pandas objects. Nothing in this cell uses it; it is kept
# in case another cell relies on the registration.
tqdm.pandas()

# Demonstration turn (user text, image, answer). It is identical for every row, so it is
# built once outside the loop.
# NOTE(review): the demonstration user turn is closed with "<end_of_utterance>", but the
# query turn built in the loop is not closed before "\nAssistant:" — confirm this is
# intentional for the IDEFICS instruct prompt format.
few_shot_turn = [
    "User: The fact that there is nothing makes me frustrated. There's nothing here, just a blank canvas that lacks any sort of imagination or creativity. Choose one emotion from: Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger, Disgust.",
    "https://uploads4.wikiart.org/images/robert-ryman/convert-1988.jpg",
    "<end_of_utterance>",
    "\nAssistant: Anger.<end_of_utterance>",
]

# Iterate over the two needed columns directly; the row index is not used.
for utterance, url in tqdm(zip(sample_df['utterance'], sample_df['Link']), total=sample_df.shape[0]):
    prompts = [
        few_shot_turn + [
            "\nUser:",
            url,
            f"{utterance} Choose one emotion from: Amusement, Awe, Contentment, Excitement, Fear, Sadness, Anger, Disgust.",
            "\nAssistant:",
        ],
    ]
    generated_text = check_inference(model, processor, prompts)
    print(generated_text)
    results.append(generated_text)

end_time = time.time()
elapsed_time = end_time - start_time
# Report the timing that was previously measured but never shown.
print(f"Inference finished in {elapsed_time:.1f}s for {len(results)} samples")

# Raw generations, one per row in sample_df order.
sample_df['predicted_anger_shot'] = results

NameError: name 'time' is not defined

## Clean up the results

In [None]:
def extract_second_assistant_emotion(text):
    """Return the emotion named in the second "Assistant: " turn of ``text``.

    The text is split on the literal "Assistant: ". The segment that follows the
    second occurrence is searched for one of the eight emotion labels, and the
    label that appears earliest in that segment is returned.

    Args:
        text: Decoded generation. Non-string values (e.g. None/NaN) are accepted.

    Returns:
        One of 'Amusement', 'Awe', 'Contentment', 'Excitement', 'Fear', 'Sadness',
        'Anger', 'Disgust', or None when ``text`` is not a string, has fewer than
        two "Assistant: " occurrences, or names no emotion in that segment.
    """
    # Missing values carry no answer; returning None avoids a TypeError in re.split.
    if not isinstance(text, str):
        return None

    parts = re.split(r'Assistant: ', text)

    # parts[0] precedes the first marker, so the second turn is parts[2].
    if len(parts) <= 2:
        return None
    second_assistant_part = parts[2]

    emotions = ['Amusement', 'Awe', 'Contentment', 'Excitement', 'Fear', 'Sadness', 'Anger', 'Disgust']

    # Whole-word search: the leftmost label in the text wins (not the first in the list),
    # and "Awe" no longer matches inside words such as "Awesome".
    pattern = r'\b(?:' + '|'.join(emotions) + r')\b'
    match = re.search(pattern, second_assistant_part)
    return match.group(0) if match else None

In [None]:
# Parse the labels from the raw generations in `results`, not from the column itself.
# This cell overwrites 'predicted_anger_shot', so parsing the column in place would lose
# every label on a second run (the cleaned values contain no "Assistant: " marker).
# The assignment aligns on the index, so it still lines up if rows were dropped from sample_df.
sample_df['predicted_anger_shot'] = pd.Series(results).apply(extract_second_assistant_emotion)

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [None]:
# Drop rows where 'predicted_anger_shot' or 'emotion' is NaN (no parsable prediction or
# no ground-truth label). The explicit copy gives later column assignments an independent
# frame to write to rather than a possible view of the pre-filter data.
sample_df = sample_df.dropna(subset=['predicted_anger_shot', 'emotion']).copy()

In [None]:
# Lower-case the predicted labels before scoring.
# NOTE(review): presumably the 'emotion' ground-truth column is already lower-case,
# as the class labels in the report output suggest — confirm against the CSV.
sample_df['predicted_anger_shot'] = sample_df['predicted_anger_shot'].str.lower()

In [None]:
# Ground-truth and predicted labels shared by every metric below.
y_true = sample_df['emotion']
y_pred = sample_df['predicted_anger_shot']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

# Fix the class order explicitly so each confusion-matrix row maps to a known label,
# instead of relying on the key order of the report dict.
class_labels = sorted(set(y_true) | set(y_pred))
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)

# Per-class accuracy = correct predictions / true samples of that class.
# A class that was only ever predicted has a zero row total; np.divide with `where`
# leaves NaN there without the 0/0 RuntimeWarning that plain `/` emits.
row_totals = conf_matrix.sum(axis=1)
class_accuracies = np.divide(
    conf_matrix.diagonal(),
    row_totals,
    out=np.full(len(class_labels), np.nan),
    where=row_totals > 0,
)

for class_label, class_accuracy in zip(class_labels, class_accuracies):
    # Report keys are the string form of each label.
    report[str(class_label)]['accuracy'] = class_accuracy

report_df = pd.DataFrame(report).transpose().round(2)
report_str = report_df.to_string()

# Print the results
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Classification Report:")
print(report_str)


Accuracy: 0.76
F1 Score: 0.7842226523697111
Precision: 0.8900193548387096
Recall: 0.76
Classification Report:
              precision  recall  f1-score  support  accuracy
amusement          0.48    1.00      0.65    12.00      1.00
anger              0.00    0.00      0.00     0.00       NaN
awe                1.00    0.52      0.69    23.00      0.52
contentment        0.94    0.83      0.88    35.00      0.83
disgust            0.50    1.00      0.67     5.00      1.00
excitement         1.00    0.60      0.75     5.00      0.60
fear               1.00    0.64      0.78    11.00      0.64
sadness            1.00    0.89      0.94     9.00      0.89
accuracy           0.76    0.76      0.76     0.76      0.76
macro avg          0.74    0.68      0.67   100.00       NaN
weighted avg       0.89    0.76      0.78   100.00       NaN


  class_accuracies = conf_matrix.diagonal() / conf_matrix.sum(axis=1)
