In [6]:
import json
import os

import pandas as pd
from tqdm import tqdm
from transformers import pipeline

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so the secret never has to be pasted into (and saved with) the notebook.
# Falls back to "" when the variable is unset, which was the previous value.
TOKEN = os.environ.get("HF_TOKEN", "")

# Helper function to generate result path dictionaries concisely
def make_result_path_dict(model, task_dict):
    """Map condition names to result-file paths for one model.

    Args:
        model: Directory name placed directly after ``output/``.
        task_dict: Mapping whose values are ``{condition_name: subpath}``
            dicts. The outer keys are not used.

    Returns:
        dict: ``"baseline1"`` followed by every condition name, each mapped to
        ``output/<model>/n/<subpath>/result.jsonl``. The ``/n/`` segment is
        emitted literally.
    """
    root = f"output/{model}/n"
    paths = {"baseline1": f"{root}/baseline_1/result.jsonl"}
    for variant_map in task_dict.values():
        paths.update(
            (name, f"{root}/{rel}/result.jsonl")
            for name, rel in variant_map.items()
        )
    return paths

# Condition-name -> sub-directory tables, one per task. The llava and minigpt4
# tables were identical copies, so each table is defined once here and reused
# for both models; that way the two models cannot drift apart by accident.
_SENTIMENT_VARIANTS = {"Sentiment": {
    "Negative_baseline2": "Sentiment/Negative/baseline_2",
    "Positive_baseline2": "Sentiment/Positive/baseline_2",
    "Neutral_baseline2": "Sentiment/Neutral/baseline_2",
    "Negative_our": "Sentiment/Negative/constrained_eps_32_batch_8",
    "Positive_our": "Sentiment/Positive/constrained_eps_32_batch_8",
    "Neutral_our": "Sentiment/Neutral/constrained_eps_32_batch_8",
}}

_FORMALITY_VARIANTS = {"Formality": {
    "formal_baseline2": "Formality/Formal/baseline_2",
    "informal_baseline2": "Formality/Informal/baseline_2",
    "formal_our": "Formality/Formal/constrained_eps_32_batch_8",
    "informal_our": "Formality/Informal/constrained_eps_32_batch_8",
}}

_POLITICS_VARIANTS = {"Politics": {
    "left_baseline2": "Politics/Left/baseline_2",
    "right_baseline2": "Politics/Right/baseline_2",
    "left_our": "Politics/Left/constrained_eps_32_batch_8",
    "right_our": "Politics/Right/constrained_eps_32_batch_8",
}}

_LANGUAGE_VARIANTS = {"Language": {
    "English_baseline2": "Language/English/baseline_2",
    "French_baseline2": "Language/French/baseline_2",
    "Spanish_baseline2": "Language/Spanish/baseline_2",
    "English_our": "Language/English/constrained_eps_32_batch_8",
    "French_our": "Language/French/constrained_eps_32_batch_8",
    "Spanish_our": "Language/Spanish/constrained_eps_32_batch_8",
}}

# llava dictionaries
llava_sentiment_result_path_dictionary = make_result_path_dict("llava", _SENTIMENT_VARIANTS)
llava_formality_result_path_dictionary = make_result_path_dict("llava", _FORMALITY_VARIANTS)
llava_politics_result_path_dictionary = make_result_path_dict("llava", _POLITICS_VARIANTS)
llava_language_result_path_dictionary = make_result_path_dict("llava", _LANGUAGE_VARIANTS)

# minigpt4 dictionaries
minigpt4_sentiment_result_path_dictionary = make_result_path_dict("minigpt4", _SENTIMENT_VARIANTS)
minigpt4_formality_result_path_dictionary = make_result_path_dict("minigpt4", _FORMALITY_VARIANTS)
minigpt4_politics_result_path_dictionary = make_result_path_dict("minigpt4", _POLITICS_VARIANTS)
minigpt4_language_result_path_dictionary = make_result_path_dict("minigpt4", _LANGUAGE_VARIANTS)

def evaluate_result(result_path, labels, classifier):
    """Classify every generated output in a JSONL file and summarise per label.

    Args:
        result_path: Path to a JSONL file. Blank lines and records without an
            ``"output"`` key are skipped.
        labels: Classifier label names to report on.
        classifier: Callable such that ``classifier(text)[0]`` is a list of
            ``{"label": ..., "score": ...}`` dicts for ``text``.

    Returns:
        dict: ``{label: {"label_num": ..., "score": ...}}``. ``label_num`` is
        the fraction of outputs whose top-scoring prediction is ``label``;
        ``score`` is the mean score the classifier assigned to ``label``.
        Both are rounded to 3 decimals and are 0.0 when there is nothing to
        average (no outputs, or the label was never returned).
    """
    # Read outputs from file, parsing each line exactly once
    outputs = []
    with open(result_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            record = json.loads(line)
            if 'output' in record:
                outputs.append(record['output'])

    # Initialize tracking dicts
    scores = {label: [] for label in labels}
    label_counts = {label: 0 for label in labels}

    # Process each output
    for output in outputs:
        preds = classifier(output)[0]
        top_pred = max(preds, key=lambda p: p['score'])
        # The classifier may emit labels we are not reporting on; such a
        # winner still counts toward the denominator but toward no label.
        if top_pred['label'] in label_counts:
            label_counts[top_pred['label']] += 1
        for pred in preds:
            if pred['label'] in scores:
                scores[pred['label']].append(pred['score'])

    # Calculate results, guarding both divisions against empty input
    total = len(outputs)
    return {
        label: {
            "label_num": round(label_counts[label] / total, 3) if total else 0.0,
            "score": (
                round(sum(scores[label]) / len(scores[label]), 3)
                if scores[label] else 0.0
            ),
        }
        for label in labels
    }

# Per-task classifier configuration: the model name handed to `pipeline`, the
# label names reported for that task, and how many labels that is.
models = {
    "Sentiment": {
        "model": "cardiffnlp/twitter-roberta-base-sentiment-latest",
        "label_len": 3,
        "labels": ["negative", "neutral", "positive"],
    },
    "Formality": {
        "model": "s-nlp/roberta-base-formality-ranker",
        "label_len": 2,
        "labels": ["formal", "informal"],
    },
    "Language": {
        "model": "papluca/xlm-roberta-base-language-detection",
        "label_len": 3,
        "labels": ["en", "es", "fr"],
    },
    "Politics": {
        "model": "m-newhauser/distilbert-political-tweets",
        "label_len": 2,
        "labels": ["Republican", "Democrat"],
    },
}

In [3]:
# Note: Change the task_name and path_dictionary to evaluate the results for different tasks
task_name = "Sentiment"
path_dictionary = llava_sentiment_result_path_dictionary
########################################################

# Look the task up with .get() so an unknown name gives a readable error
# instead of a bare KeyError. Raising (rather than calling exit()) reports the
# problem without shutting the notebook kernel down.
task_config = models.get(task_name)
if not task_config or not task_config.get("model"):
    raise ValueError(f"No model found for task: {task_name}")
model = task_config["model"]
labels = task_config["labels"]

# An empty TOKEN means no token was configured, so pass None instead of "".
# NOTE(review): newer transformers releases deprecate `use_auth_token` in
# favour of `token` - confirm against the installed version before renaming.
classifier = pipeline(model=model, top_k=None, use_auth_token=TOKEN or None)

# Build the per-image path dictionaries by replacing the '/n/' placeholder
# with '/coco_<i>/'.
# NOTE(review): the previous comment said "images 1-10", but range(1, 3) only
# covers images 1 and 2. Use range(1, 11) if all ten should be evaluated.
image_indices = range(1, 3)
full_result_path_dictionary = {
    i: {k: v.replace('/n/', f'/coco_{i}/')
        for k, v in path_dictionary.items()}
    for i in image_indices
}

# Evaluate results
result_dict = {}
for i, paths in tqdm(full_result_path_dictionary.items(), desc="Evaluating images"):
    result_dict[i] = {
        name: evaluate_result(path, labels, classifier)
        for name, path in paths.items()
    }

# One row per (image, condition): image index, condition name, then the
# 'label_num' value of every task label in order. The loop variable is named
# `label_stats` so it no longer overwrites the `labels` list defined above.
flattened_data = [
    [i, condition] + [label_stats[label]['label_num'] for label in labels]
    for i, result in result_dict.items()
    for condition, label_stats in result.items()
]

# Create DataFrame with dynamic columns based on task labels
columns = ['i', 'Condition'] + labels
df = pd.DataFrame(flattened_data, columns=columns)
# Average only the label columns: 'i' is an image identifier, so its mean is
# meaningless and is no longer included in the summary table.
mean_df = df.groupby('Condition')[labels].mean().reset_index()
print(mean_df)