In [1]:
from datasets import load_dataset
from PIL import Image
import io
ds = load_dataset("jmhessel/newyorker_caption_contest", "explanation")
train_data = ds["train"]



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(train_data.features)

{'image': Image(mode=None, decode=True, id=None), 'contest_number': Value(dtype='int32', id=None), 'image_location': Value(dtype='string', id=None), 'image_description': Value(dtype='string', id=None), 'image_uncanny_description': Value(dtype='string', id=None), 'entities': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'questions': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'caption_choices': Value(dtype='string', id=None), 'from_description': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'n_tokens_label': Value(dtype='int32', id=None), 'instance_id': Value(dtype='string', id=None)}


In [3]:
def entity_extractor(entity_list: list):
    entities = []
    for entity in entity_list:
        entity_name = entity.split("/")[-1]
        entities.append(entity_name)
    return entities



In [4]:
def get_by_contest_number(contest_number : int, train_data):
    subset = train_data.filter(lambda e : e["contest_number"] == contest_number)
    return subset


In [None]:
import base64
def image_to_bytes(img):
    buffered = io.BytesIO()
    img.save(buffered, format = "PNG")
    img_bytes = buffered.getvalue()
    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
    return img_b64

In [None]:



def get_input(row):

    img = row["image"]
    desc = row["image_description"]
    number = row["contest_number"]
    print(number)
    loc = row["image_location"]
    uncan = row["image_uncanny_description"]
    entities = entity_extractor(row["entities"])

    input_text = (
        f"Contest Number: {number} \n" + 
        "Description: " + desc + "\n" +
        "Location: " + loc + "\n" +
        "Uncanny description: " + uncan + "\n" +
        "Entities: " 
        + ", ".join(entities)
    )

    return input_text, img

In [None]:


train_data["image"][4].save("image_cache/image_0.jpg")
print(train_data["image_description"][4])
print(train_data["contest_number"][4])
print(entity_extractor(train_data["entities"][4]))
print(train_data["image_location"][4])
print(train_data["image_uncanny_description"][4])
print(train_data["caption_choices"][4])

3 baseball players appear to be playing baseball in a living room.  A fourth person sits on a coach behind them, watching and talking on the phone
711
['Baseball', 'Telephone_call', 'Obliviousness']
the living room
It's unusual to see games of baseball being played inside houses
All his pitches have been inside.


In [64]:
from openai import OpenAI
from dotenv import load_dotenv
import os


model = "gpt-4o-mini"

instructions = (
    "Given the description, extracted entities, location, uncanny description, and the image provided below, "
    "identify key figures, characters, location or the described situation in the cartoon. For each, generate:\n"
    "- 5 synonyms\n"
    "- 5 antonyms or contraries\n"
    "- 5 word groups that include the item (e.g., paper → paper cut)\n"
    "- 5 things that evoke a similar feeling or vibe (be creative)\n"
    "- 5 idioms (e.g., dog → raining cats and dogs)\n"
    "- 5 widely recognizable references from general culture (e.g., rabbit → Aesop’s fables)\n\n"
    "Do not exaggerate the general culture references—keep them specific and broadly recognizable. "
    "Avoid overly generic entries.\n\n"

)

format_instruction = (
    "Return the result strictly as a JSON object. For each entity name and the central situation, use it as the object key. "
    "Include the contest number provided as the first key of the object"
    "Your response MUST include all the named entities, location, and the depicted action, event or situation in the cartoon."
    "Under each, include keys: synonyms, antonyms, word_groups, same_vibe, idioms, cultural_references. "
    "Example format for a cartoon in which a cat is drinking hot sauce in a kitchen:\n"
    "{\n"
    ' "contest number": 321'
    '  "cat": {\n'
    '    "synonyms": [...],\n'
    '    "antonyms": [...],\n'
    '    "word_groups": [...],\n'
    '    "same_vibe": [...],\n'
    '    "idioms": [...],\n'
    '    "cultural_references": [...]\n'
    '  },\n'
    '  "hot sauce": { ... }\n'
    '  "drinking hot sauce: { .... }'
    '  "kitchen: {.....} '
    "}"
)





def get_response(input_text, img, client):
    response = client.responses.create(
        model = model,
        input =[
            {
            "role" : "user",
            "content" : [
                {"type" : "input_text", "text" : instructions +  "\n" + format_instruction},
                {"type" : "input_text", "text" : input_text },
                {"type" : "input_image" , "image_url" : f"data:image/png;base64,{image_to_bytes(img)}" },
                
                ],
            }
        ],
    )

    return response.output[0].content[0].text


In [70]:
import json
from pathlib import Path
from tqdm import tqdm
results = []

load_dotenv()
api_key=os.getenv("OPENAI_API_KEY")
print(api_key)
client = OpenAI(api_key=api_key)

for row in tqdm(train_data.select(range(0,3))):
    (input_text, img)  = get_input(row)
    raw  = get_response(input_text,img,client)
    cleaned = raw.strip("```json\n").rstrip("```")
    parsed = json.loads(cleaned)
    results.append(parsed)


Path("responses.json").write_text(json.dumps(results, indent=2))


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


  0%|          | 0/3 [00:00<?, ?it/s]

130


  0%|          | 0/3 [00:04<?, ?it/s]


KeyboardInterrupt: 