In [3]:
import json

def load_cartoons_from_json(path):
    """Open the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [4]:
def get_cartoon_by_contest_number(cartoons, contest_number):
    """Return the first cartoon whose 'contest_number' matches, or None.

    Both sides are compared by their string form, so an integer 658 and the
    string "658" are treated as the same contest. This mirrors the later
    definition of this helper in the notebook.

    Args:
        cartoons: Iterable of cartoon dicts.
        contest_number: Contest number to look for (int or str).

    Returns:
        The matching cartoon dict, or None when nothing matches.
    """
    wanted = str(contest_number)
    for cartoon in cartoons:
        if str(cartoon.get('contest_number')) == wanted:
            return cartoon
    return None

In [8]:
import json
import os
from openai import OpenAI
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# The OpenAI SDK client is pointed at DeepSeek's endpoint via base_url.
# Fail fast when the key is missing: with api_key=None the SDK falls back to
# the OPENAI_API_KEY environment variable, which would send an OpenAI
# credential to a different provider.
_deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
if not _deepseek_api_key:
    raise RuntimeError("DEEPSEEK_API_KEY is not set; add it to the environment or to a .env file.")
client = OpenAI(api_key=_deepseek_api_key, base_url="https://api.deepseek.com")

def load_json_data(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as source:
        parsed = json.loads(source.read())
    return parsed

def get_cartoon_by_contest_number(cartoons_data, contest_number):
    """Look up a cartoon dict by contest number, comparing string forms; None if absent."""
    matches = (
        entry
        for entry in cartoons_data
        if str(entry.get('contest_number')) == str(contest_number)
    )
    # The first hit wins; None signals that no entry matched.
    return next(matches, None)

def _as_text(value):
    """Flatten a description field (string, iterable of parts, or None) into one string."""
    if value is None:
        return ''
    if isinstance(value, str):
        # Joining a plain string would insert a space between every character.
        return value
    try:
        return ' '.join(str(part) for part in value)
    except TypeError:
        # Not iterable (e.g. a number): fall back to its string form.
        return str(value)


def build_prompt_for_caption_pair(cartoon_data, caption_a_text, caption_b_text, support_caption_letter, add_true_answer_bool=False):
    """Builds the detailed prompt for analyzing a pair of captions for a given cartoon.

    Args:
        cartoon_data: Cartoon dict. 'contest_number' is required; 'image_location',
            'image_description', 'image_uncanny_description' and 'observations'
            are optional. The two description fields may be a string, a list of
            strings, or missing/None.
        caption_a_text: Text shown as "Caption A".
        caption_b_text: Text shown as "Caption B".
        support_caption_letter: Letter inserted into the optional instruction
            telling the model which caption to support.
        add_true_answer_bool: When true AND ``cartoon_data`` has a 'caption'
            key, the supporting instruction is appended to the prompt.

    Returns:
        The prompt as a single newline-joined string.

    Raises:
        KeyError: If ``cartoon_data`` has no 'contest_number' key.
    """
    prompt_lines = [
        f"You are a cartoon analyst evaluating humor in New Yorker Caption Contest #{cartoon_data['contest_number']}.",
        "Below is a detailed description of the cartoon, including its visual scene, unusual/uncanny elements, and key observations. Your task is to act as if you're looking directly at the cartoon—not just reading about it. Reason step by step: Think step by step:",
        "1- Understand the visual setting and what makes it strange or surprising.",
        "2- Identify who is most likely speaking in the cartoon.",
        "3- Reconstruct the story or situation behind the scene—what might be going on between the characters?",
        "4- Analyze the humor in each caption: look for metaphors, cultural references, and wordplay.",
        "Finally, decide which caption is funnier, and justify your choice as if you were analyzing the cartoon visually.",
        f"Scene: {cartoon_data.get('image_location', 'unknown')}",
        f"Description: {_as_text(cartoon_data.get('image_description'))}",
        f"Uncanny Element: {_as_text(cartoon_data.get('image_uncanny_description'))}",
        f"Observations: {cartoon_data.get('observations', '')}",
        f"Caption A: {caption_a_text}",
        f"Caption B: {caption_b_text}",
        "Question: Based on what you \"see\" in the cartoon, which caption is funnier?",
        "Justify your choice with detailed reasoning based on visual analysis, speaker context, and linguistic play."
    ]

    # The leading "\n" leaves a blank line between the question and the instruction.
    if add_true_answer_bool and 'caption' in cartoon_data:
        prompt_lines.append(f"\nYour answer should support Caption {support_caption_letter} as the funnier caption with strong reasoning.")

    return "\n".join(prompt_lines)

def analyze_cartoon_caption_pairs(cartoons_path, contest_number, add_true_answer_bool=False):
    """
    Analyzes caption pairs for a given cartoon contest number.

    For each caption pair (except the first one in numeric key order, which is
    skipped) a prompt is built, sent to the LLM, and the model's reasoning
    text is collected.

    Args:
        cartoons_path: Path to the JSON dataset file.
        contest_number: Contest number of the cartoon to analyze.
        add_true_answer_bool: Forwarded to build_prompt_for_caption_pair.

    Returns:
        A ``(prompts, llm_responses)`` tuple of two equal-length lists. Both
        lists are empty when the contest is not found or has no
        'caption_choices' dictionary, so callers can always unpack two values.
        When an API call fails, the corresponding response entry is an
        "API Error ..." string instead of model output.

    Raises:
        ValueError: If a 'caption_choices' key cannot be converted with int().
    """
    all_cartoons_data = load_json_data(cartoons_path)

    target_cartoon = get_cartoon_by_contest_number(all_cartoons_data, contest_number)
    if not target_cartoon:
        print(f"Error: Cartoon with contest number {contest_number} not found.")
        return [], []

    caption_choices = target_cartoon.get('caption_choices')
    if not isinstance(caption_choices, dict):
        print(f"Error: No 'caption_choices' dictionary found for contest {contest_number}.")
        return [], []

    # Sort keys numerically so pairs are processed in order.
    sorted_caption_choice_keys = sorted(caption_choices, key=int)

    # Store prompts and responses
    prompts = []
    llm_responses = []

    # The first pair in sorted order is skipped.
    # NOTE(review): the reason is not visible in this notebook - confirm it is intended.
    for pair_key in sorted_caption_choice_keys[1:]:
        caption_pair_data = caption_choices[pair_key]

        # Expected structure: [[caption_a, caption_b], 'A' or 'B']
        if not (isinstance(caption_pair_data, list) and len(caption_pair_data) == 2 and
                isinstance(caption_pair_data[0], list) and len(caption_pair_data[0]) == 2 and
                isinstance(caption_pair_data[1], str) and caption_pair_data[1] in ['A', 'B']):
            print(f"Warning: Skipping malformed caption pair with key '{pair_key}'.")
            continue

        caption_A, caption_B = caption_pair_data[0]
        support_letter = caption_pair_data[1]  # This is the letter (A or B) that should be supported

        prompt = build_prompt_for_caption_pair(
            target_cartoon,
            caption_A,
            caption_B,
            support_letter,
            add_true_answer_bool
        )

        prompts.append(prompt)

        try:
            response = client.chat.completions.create(
                model="deepseek-reasoner",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                stream=False
            )
            # The reasoning text is what gets stored, not message.content.
            llm_reasoning = response.choices[0].message.reasoning_content
            llm_responses.append(llm_reasoning)
        except Exception as e:
            # Keep prompts and responses aligned by recording the failure in place.
            print(f"Error calling API for pair {pair_key}: {e}")
            llm_responses.append(f"API Error for pair {pair_key}: {e}")

    return prompts, llm_responses



In [11]:
# Example usage
cartoons_file_path = 'dataset_all_upd.json'
start_index = 261  # 0-based list position to start from (i.e. the 262nd entry)


def _save_responses_for_contest(dataset_path, contest_num, responses):
    """Attach ``responses`` to the matching cartoon and rewrite the dataset file.

    The file is reloaded first so the write is based on its current on-disk
    content. The file is rewritten, and the confirmation printed, even when no
    cartoon matches ``contest_num``. Returns the reloaded dataset.
    """
    current_dataset = load_json_data(dataset_path)

    for entry in current_dataset:
        if entry.get('contest_number') == contest_num:
            # Keys start at "1" because analyze_cartoon_caption_pairs skips the
            # first caption pair.
            # NOTE(review): if that function also skips a malformed pair, later
            # keys no longer line up with the 'caption_choices' keys - confirm.
            entry['responses'] = {str(idx + 1): text for idx, text in enumerate(responses)}
            break

    with open(dataset_path, 'w', encoding='utf-8') as f:
        json.dump(current_dataset, f, ensure_ascii=False, indent=2)

    print(f"Updated dataset with responses for contest {contest_num}")
    return current_dataset


# Load the dataset to get the length and determine the range
dataset = load_json_data(cartoons_file_path)

# Results of the most recently analyzed contest; also keeps the display loop
# below safe when the range is empty.
prompts, responses = [], []

for i in range(start_index, len(dataset)):
    contest_num = dataset[i].get('contest_number')

    if contest_num is None:
        print(f"Skipping entry at index {i} - no contest number found")
        continue

    print(f"Processing contest {contest_num} (index {i})...")
    result = analyze_cartoon_caption_pairs(cartoons_file_path, contest_num, add_true_answer_bool=True)
    # Accept both an empty list and a (prompts, responses) pair from the analyzer.
    prompts, responses = result if result else ([], [])

    if prompts and responses:  # Only persist when we got results
        dataset = _save_responses_for_contest(cartoons_file_path, contest_num, responses)
    else:
        print(f"No valid responses for contest {contest_num}")

    print("="*80)

# Show the prompts and responses of the most recently analyzed contest. They are
# reused from the loop above instead of being requested again, which would
# repeat the API calls and overwrite the responses that were just saved.
for i, (prompt, response) in enumerate(zip(prompts, responses)):
    print(f"Prompt {i+1}:\n{prompt}\n")
    print(f"Response {i+1}:\n{response}\n")
    print("="*80)

Processing contest 658 (index 261)...
Updated dataset with responses for contest 658
Processing contest 22 (index 262)...
Updated dataset with responses for contest 22
Processing contest 182 (index 263)...
Updated dataset with responses for contest 182
Processing contest 178 (index 264)...
Updated dataset with responses for contest 178
Processing contest 79 (index 265)...
Updated dataset with responses for contest 79
Processing contest 688 (index 266)...
Updated dataset with responses for contest 688
Processing contest 166 (index 267)...
Updated dataset with responses for contest 166
Processing contest 528 (index 268)...
Updated dataset with responses for contest 528
Processing contest 495 (index 269)...
Updated dataset with responses for contest 495
Processing contest 224 (index 270)...
Updated dataset with responses for contest 224
Processing contest 415 (index 271)...
Updated dataset with responses for contest 415
Processing contest 223 (index 272)...
Updated dataset with responses 