In [50]:
# Event ontology: each supported event type mapped to the argument roles it accepts.
# The dict is interpolated into the extraction prompts further down the notebook.
event_arg = {
    "life.die": [
        "agent", "victim", "instrument", "place",
    ],
    "movement.transport": [
        "destination", "origin", "instrument", "agent", "artifact/person",
    ],
    "transaction.transfermoney": [
        "giver", "recipient", "money",
    ],
    "conflict.attack": [
        "attacker", "instrument", "place", "target",
    ],
    "conflict.demonstrate": [
        "demonstrator", "instrument", "police", "place",
    ],
    "contact.meet": [
        "participant", "place",
    ],
    "contact.phone-write": [
        "participant", "instrument", "place",
    ],
    "justice.arrestjail": [
        "agent", "person", "instrument", "place",
    ],
}

In [51]:
# Short natural-language hint per event type, interpolated into the extraction
# prompts to help the model decide which event type applies.
# Values carry no leading/trailing whitespace so the prompt text stays uniform.
ontology_tips = {
    "life.die": "it occurs when the life of a person entity ends",
    "movement.transport": "it occurs when an artifact or a person is moved from one place to another",
    "transaction.transfermoney": "it refers to the giving, receiving, borrowing, or lending money",
    "conflict.attack": "a violent physical act causing harm or damage",
    "conflict.demonstrate": "it occurs when a large number of people come together in a public area to protest or demand some sort of official action",
    "contact.meet": "it occurs when two or more people interact with one another face-to-face at a single location",
    "contact.phone-write": "it occurs when two or more people directly engage in discussion but not face-to-face",
    "justice.arrestjail": "it occurs when the movement of a person is constrained by a state actor"
}

In [None]:
import os
import shutil
import json
import spacy

# spaCy pipeline used below to segment the article into sentences.
nlp = spacy.load("en_core_web_sm")

# Article to process: change this name for each file.
file_name = "VOA_EN_NW_2016.12.27.3653223"
input_path = os.path.join("m2e2_rawdata", "article", f"{file_name}.rsd.txt")

with open(input_path, "r", encoding="utf-8") as article_file:
    text = article_file.read()

doc = nlp(text)

# One stripped string per sentence found by spaCy, in document order.
sentences = []
for sent in doc.sents:
    sentences.append(sent.text.strip())


#####################################
# Root folder for everything this notebook writes, with one sub-folder for the
# sentence-split article and one for the copied images.
output_root = "golden_output"
article_out = os.path.join(output_root, "article")
image_out = os.path.join(output_root, "image")

# ---- Clean the output dirs each time ----
# Both folders are deleted (if present) and recreated, so each run starts empty.
for out_dir in (article_out, image_out):
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

# ---- Reinitialize metadata files ----
# The JSON files are reset to an empty list rather than a zero-byte file: later
# cells read them back with json.load(), which raises on empty input if the
# extraction loop never got to write a result.
for json_name in ("text_only.json", "image_only.json"):
    with open(os.path.join(output_root, json_name), "w", encoding="utf-8") as f:
        f.write("[]")

# corref.txt is plain text, so it is simply truncated.
with open(os.path.join(output_root, "corref.txt"), "w", encoding="utf-8") as f:
    f.write("")

###########################################
##### IMAGE COPYING AND CAPTIONS JSON #####
###########################################
# Folder holding the source images for all articles.
src_image_dir = os.path.join("m2e2_rawdata", "image", "image")

# An image is copied when its file name starts with "<file_name>_".
image_prefix = file_name + "_"
for img_file in os.listdir(src_image_dir):
    if not img_file.startswith(image_prefix):
        continue
    src = os.path.join(src_image_dir, img_file)
    dst = os.path.join(image_out, img_file)
    shutil.copy2(src, dst)
    print(f"Copied: {img_file}")

print("All related images copied.")

# Destination of the sentence-per-line copy of the article.
output_file = os.path.join(article_out, file_name)

caption_file = os.path.join("m2e2_rawdata", "image", "image_url_caption.json")
captions_list = []
url_list = []

with open(caption_file, "r", encoding="utf-8") as f:
    data = json.load(f)

if file_name not in data:
    print(f"No captions found for {file_name}")
else:
    article_captions = data[file_name]
    # Keys are numeric strings; compare them as integers so that "10" follows "9".
    for idx in sorted(article_captions, key=int):
        entry = article_captions[idx]
        captions_list.append(entry["caption"])
        url_list.append(entry["url"])

########################################
# Persist the article as plain text, one sentence per line.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n" for sentence in sentences)

print(f"Processed file saved at: {output_file}")


Copied: VOA_EN_NW_2015.11.26.3074855_0.jpg
Copied: VOA_EN_NW_2015.11.26.3074855_1.jpg
Copied: VOA_EN_NW_2015.11.26.3074855_2.jpg
All related images copied.
Processed file saved at: golden_output\article\VOA_EN_NW_2015.11.26.3074855


# Text-Only Extraction

In [58]:
# Example output shown to the model in the text-extraction prompt.
# "start"/"end" are 0-based whitespace-token offsets into "sentence" (end = start + 1).
# The sentence is the one the offsets actually index ("custody" is token 8,
# "man" token 6, "officers" token 3), and the event type and roles use the same
# spelling as the ontology the prompt tells the model to use.
example_json = [
    {
        "sentence_id": "VOA_EN_NW_2015.14.05.21651_1",

        "sentence": "Two Chicago police officers take a man into custody during a protest march, Wednesday, Nov. 25, 2015, in Chicago, the day after murder charges were brought against police officer Jason Van Dyke in the killing of 17-year-old Laquan McDonald.",

        "trigger": {
            "start": 8,
            "end": 9,
            "text": "custody"
        },

        "event_type": "justice.arrestjail",

        "arguments": [
            {
                "role": "person",
                "start": 6,
                "end": 7,
                "text": "man"
            },
            {
                "role": "agent",
                "start": 3,
                "end": 4,
                "text": "officers"
            }
        ]
    }
]

In [None]:
import ollama
import ast
import json

# Parsed model outputs, one entry per sentence whose response could be parsed.
all_data = []

# Render the ontology, the tips and the example as real JSON (double quotes) so the
# prompt itself follows the formatting rules it gives the model. The example is
# stored as a one-element list while the model is asked for a single object, so
# the list wrapper is removed before rendering.
event_arg_json = json.dumps(event_arg)
ontology_tips_json = json.dumps(ontology_tips)
if isinstance(example_json, list) and example_json:
    example_text = json.dumps(example_json[0])
else:
    example_text = json.dumps(example_json)

for i in range(0, len(sentences)):

    prompt = f"""
        you are an information extraction system. Extract events and arguments for the following sentence: "{sentences[i]}"

        RULES:
        1. Only use the following ontology (event types and their argument roles): {event_arg_json}
        Here are some helpful tips to determine when to use which event: {ontology_tips_json}
        2. Output **must be a SINGLE valid JSON object - NOT an array** only, nothing else. **Do not add markdown, backslashes, escape characters, or extra text**.
        3. Only use double quotation marks (") for JSON strings.
        4. If you determine that an event has occurred, then each event must include (these cannot be empty):
            a. "sentence_id": "{file_name}_{i}"
            b. "sentence": the sentence itself
            c. "trigger": the word that signal the event, with "text", "start" (word count where it occurred), "end" (word count where it ended - start+1).
            d. "event_type": the event type string from ontology.
            e. "arguments": a list of objects (make sure you can estimate as many argument roles from ontology as possible - for ex, conflict.attack has 4), each with:
                i. "role": role from the ontology - a single role only.
                ii. "text": the argument string - a single word only.
                iii. "start": which word count the word occurred at - starting from 0.
                iv. "end": end word count - which is just start+1.
        5. If you determine an event has not occurred or if the sentence is purely descriptive, background, or connective text, i.e, it doesn't correspond to any annotated event, then output an empty json.
        6. Do **NOT** output anything other than the JSON.
        7. **Do not include explanations, notes, comments, or any text outside the JSON object.**
        8. Your output needs to only be for a single sentence id, and a single event based on the sentence. Do NOT output more than 1 sentence id.

        Here is an example output, your output should follow this exact JSON format: {example_text}
        """

    response = ollama.chat(
        model="gemma3:12b",
        messages=[{"role": "user", "content": prompt}]
        )

    raw_text = response["message"]["content"]

    # Remove a Markdown code fence if the model added one anyway.
    # str.strip("```json\n") would strip a *set of characters*, not the fence, and
    # fails as soon as there is whitespace or text around the fence, so the
    # content of the first fenced block is extracted explicitly instead.
    cleaned_text = raw_text.strip()
    if "```" in cleaned_text:
        fenced = cleaned_text.split("```")[1]
        if fenced.lower().startswith("json"):
            fenced = fenced[4:]
        cleaned_text = fenced.strip()

    try:
        # Try JSON first (preferred, safer than ast)
        python_obj = json.loads(cleaned_text)
    except json.JSONDecodeError:
        try:
            # Fallback: use ast.literal_eval for Python-like dicts
            python_obj = ast.literal_eval(cleaned_text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            print(f"[WARNING] Skipping sentence {i}: could not parse response")
            print("Raw response was:", raw_text[:200], "...\n")  # preview first 200 chars
            continue  # skip to next sentence

    all_data.append(python_obj)

    print(json.dumps(python_obj, indent=2))

    # Checkpoint after every sentence so partial progress survives an interrupted run.
    with open("golden_output/text_only.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)

# Always leave a valid JSON list on disk, even when no response could be parsed.
with open("golden_output/text_only.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, indent=2)



{
  "sentence_id": "VOA_EN_NW_2015.11.26.3074855_1",
  "sentence": "A Second Night of Chicago Protests Over Police Shooting Protesters took to the streets of Chicago for the second straight night Wednesday, one day after the city's police department released video of a black teenager being shot to death by a white officer.",
  "trigger": {
    "text": "Protests",
    "start": 5,
    "end": 6
  },
  "event_type": "conflict.demonstrate",
  "arguments": [
    {
      "role": "demonstrator",
      "text": "Protesters",
      "start": 9,
      "end": 10
    },
    {
      "role": "police",
      "text": "Police",
      "start": 7,
      "end": 8
    },
    {
      "role": "place",
      "text": "the streets of Chicago",
      "start": 12,
      "end": 16
    }
  ]
}
{
  "sentence_id": "VOA_EN_NW_2015.11.26.3074855_2",
  "sentence": "The protesters rallied in downtown Chicago, following the release of video that showed officer Jason Van Dyke shooting 17-year-old Laquan McDonald 16 times in O

In [63]:
import json 

# Post-process the text extraction results in place:
#  * un-nest responses where the model returned a list instead of a single object,
#  * drop empty results (sentences with no event) and anything that is not a dict,
#    because the alignment step calls .get() on every entry.
text_only_path = "golden_output/text_only.json"

with open(text_only_path, "r", encoding="utf-8") as f:
    data = json.load(f)

flattened_data = []
for item in data:
    if isinstance(item, list):
        flattened_data.extend(item)
    else:
        flattened_data.append(item)

cleaned_data = [obj for obj in flattened_data if isinstance(obj, dict) and obj]

with open(text_only_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

# Report the file that was actually written.
print(f"Cleaned JSON saved to {text_only_path}")

Cleaned JSON saved to cleaned_data.json


# Image-Only Extraction

In [64]:
# Example output shown to the model in the image-extraction prompt.
# The event type and role names use the same spelling as the ontology the prompt
# tells the model to use; the bounding-box entries are kept exactly as they were.
img_example_json = [
    {
        "image_id": "VOA_EN_NW_2015.11.26.3074855_1",

        "caption": "Two Chicago police officers take a man into custody during a protest march, Wednesday, Nov. 25, 2015, in Chicago, the day after murder charges were brought against police officer Jason Van Dyke in the killing of 17-year-old Laquan McDonald.",

        "event_type": "justice.arrestjail",

        "role": {
            "person": [
                [
                    "1",
                    660,
                    214,
                    1366,
                    1630
                ]
            ],
            "agent": [
                [
                    "1",
                    250,
                    191,
                    846,
                    1630
                ],
                [
                    "1",
                    1212,
                    240,
                    1782,
                    1630
                ]
            ]
        }
    }
]

In [None]:
import ollama
import ast
import json
import requests
import os

image_folder = "golden_output/image"
# Parsed model outputs, one entry per image whose response could be parsed.
all_data = []


def _image_index(image_name):
    """Return the integer after the last "_" in the file stem, or None if it is not a number."""
    stem = os.path.splitext(image_name)[0]
    try:
        return int(stem.rsplit("_", 1)[-1])
    except ValueError:
        return None


# Order images by their numeric suffix; a plain string sort would put "_10" before "_2".
# Names without a numeric suffix go last, in name order.
image_files = sorted(
    (name for name in os.listdir(image_folder) if name.lower().endswith((".jpg", ".jpeg", ".png"))),
    key=lambda name: (_image_index(name) is None, _image_index(name) or 0, name),
)

# Render the ontology, the tips and the example as real JSON (double quotes) so the
# prompt itself follows the formatting rules it gives the model. The example is
# stored as a one-element list while the model is asked for a single object, so
# the list wrapper is removed before rendering.
event_arg_json = json.dumps(event_arg)
ontology_tips_json = json.dumps(ontology_tips)
if isinstance(img_example_json, list) and img_example_json:
    img_example_text = json.dumps(img_example_json[0])
else:
    img_example_text = json.dumps(img_example_json)

for i, image_file in enumerate(image_files):
    image_path = os.path.join(image_folder, image_file)

    # Take the id and the caption position from the file name itself rather than
    # from the loop counter, so they stay paired with the right image even when
    # an index is missing or there are ten or more images.
    image_id = os.path.splitext(image_file)[0]
    caption_index = _image_index(image_file)
    if caption_index is None:
        caption_index = i
    if 0 <= caption_index < len(captions_list):
        caption = captions_list[caption_index]
    else:
        caption = ""
        print(f"[WARNING] No caption available for {image_file}")

    prompt = f"""
    You are an event extraction system. Analyze the image called {image_id} (image id) and extract events.

    RULES:
    1. Only use the following event types and their roles: {event_arg_json}
    2. Here are some helpful tips to determine when to use which event: {ontology_tips_json}
    3. Use the following caption accompanied with the image for additional information: {caption}
    4. Output **must be a SINGLE valid JSON object - NOT an array** only, nothing else. **Do not add markdown, backslashes, escape characters, or extra text**.
    5. Only use double quotation marks (") for JSON strings.
    6. If you determine that an event has occurred, then each event must include (these cannot be empty):
    - "image_id": the image id provided above.
    - "caption": the provided caption.
    - "event_type": the event type string from ontology.
    - "role": these should contain additional roles from ontology and the respective bounding box values (look at example)
    7. If you determine that an event has not occurred, then output an empty json.
    8. Do **NOT** output anything other than the JSON.
    9. **Do not include explanations, notes, comments, or any text outside the JSON object.**
    10. Your output needs to only be for a single image id, and a single event based on the image. Do NOT output more than 1 image id.

    Here is an example output, your output should follow this exact JSON format: {img_example_text}
    """

    response = ollama.chat(
        model="gemma3:12b",
        messages=[
            {
                "role": "user",
                "content": prompt,
                "images": [image_path]
            }
        ]
    )

    raw_text = response["message"]["content"]

    # Remove a Markdown code fence if the model added one anyway.
    # str.strip("```json\n") would strip a *set of characters*, not the fence, and
    # fails as soon as there is whitespace or text around the fence, so the
    # content of the first fenced block is extracted explicitly instead.
    cleaned_text = raw_text.strip()
    if "```" in cleaned_text:
        fenced = cleaned_text.split("```")[1]
        if fenced.lower().startswith("json"):
            fenced = fenced[4:]
        cleaned_text = fenced.strip()

    try:
        # Try JSON first (preferred, safer than ast)
        python_obj = json.loads(cleaned_text)
    except json.JSONDecodeError:
        try:
            # Fallback: use ast.literal_eval for Python-like dicts
            python_obj = ast.literal_eval(cleaned_text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            print(f"[WARNING] Skipping image {image_file}: could not parse response")
            print("Raw response was:", raw_text[:200], "...\n")  # preview first 200 chars
            continue  # skip to next image

    all_data.append(python_obj)

    print(json.dumps(python_obj, indent=2))

    # Checkpoint after every image so partial progress survives an interrupted run.
    with open("golden_output/image_only.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2)

# Always leave a valid JSON list on disk, even when no response could be parsed.
with open("golden_output/image_only.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, indent=2)




{
  "image_id": "VOA_EN_NW_2015.11.26.3074855_0",
  "caption": "Lamon Reccord, second from right, yells at a Chicago police officer \"Shoot me 16 times\" as he and others march through Chicago's Loop Wednesday, Nov. 25, 2015.",
  "event_type": "conflict.demonstrate",
  "role": {
    "demonstrator": [
      [
        "200",
        "450",
        "900",
        "700"
      ]
    ],
    "place": [
      [
        "0",
        "0",
        "999",
        "999"
      ]
    ],
    "police": [
      [
        "600",
        "600",
        "999",
        "999"
      ]
    ]
  }
}
{
  "image_id": "VOA_EN_NW_2015.11.26.3074855_1",
  "caption": "Two Chicago police officers take a man into custody during a protest march, Wednesday, Nov. 25, 2015, in Chicago, the day after murder charges were brought against police officer Jason Van Dyke in the killing of 17-year-old Laquan McDonald.",
  "event_type": "justice.arrestjail",
  "role": {
    "agent": [
      [
        250,
        191,
        846,
 

In [None]:
import json

# Read back the raw image extraction results.
with open("golden_output/image_only.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Un-nest one level: an entry that is itself a list contributes its elements,
# any other entry is kept as a single element.
flattened_data = []
for item in data:
    flattened_data += item if isinstance(item, list) else [item]

# Keep only truthy entries (drops empty objects and other empty values).
cleaned_data = list(filter(None, flattened_data))

with open("golden_output/image_only.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print("Cleaned and flattened JSON saved to golden_output/image_only.json")

# Cross Media Alignment

In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load JSON data
# Read the two single-modality result files that the alignment below compares.
with open("golden_output/text_only.json", "r", encoding="utf-8") as text_fh:
    text_data = json.load(text_fh)

with open("golden_output/image_only.json", "r", encoding="utf-8") as image_fh:
    image_data = json.load(image_fh)


def load_or_init_json(path):
    """Return the parsed JSON content of ``path``, or an empty list.

    An empty list is returned when the file does not exist or when its content
    is not valid JSON (including a zero-byte file).
    """
    if not os.path.exists(path):
        return []

    with open(path, "r", encoding="utf-8") as handle:
        try:
            loaded = json.load(handle)
        except json.JSONDecodeError:
            loaded = []
    return loaded

text_out_path = "golden_output/text_multimedia_event.json"
image_out_path = "golden_output/image_multimedia_event.json"
corref_out_path = "golden_output/corref.txt"

# Minimum TF-IDF cosine similarity for a sentence/caption pair to be recorded.
SIMILARITY_THRESHOLD = 0.5


def _normalize_event_type(event_type):
    """Reduce an event-type label to its lowercase letters and digits.

    Used only for comparison, so that labels differing in case or separators
    (e.g. "Justice:Arrest-Jail" and "justice.arrestjail") are treated as equal.
    """
    return "".join(ch for ch in str(event_type or "").lower() if ch.isalnum())


# Previously saved matches (if any) are extended, not replaced.
text_out_data = load_or_init_json(text_out_path)
image_out_data = load_or_init_json(image_out_path)

# Only dict entries can be queried with .get(); images additionally need a caption.
text_items = [item for item in text_data if isinstance(item, dict)]
image_items = [item for item in image_data if isinstance(item, dict) and "caption" in item]

# Collect all sentences and captions to fit TF-IDF
corpus = [item["sentence"] for item in text_items if "sentence" in item] + \
         [item["caption"] for item in image_items]

if not corpus:
    # TfidfVectorizer cannot be fitted on an empty corpus, and there is nothing to align.
    print("No sentences or captions to align; skipping similarity matching.")
else:
    vectorizer = TfidfVectorizer().fit(corpus)

    # Caption vectors do not depend on the sentence, so compute each one once.
    caption_vecs = [vectorizer.transform([item.get("caption", "")]) for item in image_items]

    # Open output file
    with open(corref_out_path, "a", encoding="utf-8") as out_file:
        for text_item in text_items:
            sentence_id = text_item.get("sentence_id")
            sentence = text_item.get("sentence", "")
            text_event_type = text_item.get("event_type", "")

            sentence_vec = vectorizer.transform([sentence])

            for image_item, caption_vec in zip(image_items, caption_vecs):
                image_id = image_item.get("image_id")
                caption = image_item.get("caption", "")
                image_event_type = image_item.get("event_type", "")

                # Skip if event_type doesn't match
                if _normalize_event_type(text_event_type) != _normalize_event_type(image_event_type):
                    continue

                similarity = cosine_similarity(sentence_vec, caption_vec)[0][0]

                print(f"Sentence: {sentence}")
                print(f"Caption: {caption}")
                print(f"Similarity: {similarity}\n")

                if similarity >= SIMILARITY_THRESHOLD:
                    out_file.write(f"{sentence_id}     {image_id}     {text_event_type}         {similarity}\n")

                    # Each matched item is stored once, however many partners it has.
                    if text_item not in text_out_data:
                        text_out_data.append(text_item)

                    if image_item not in image_out_data:
                        image_out_data.append(image_item)


with open(text_out_path, "w", encoding="utf-8") as f:
    json.dump(text_out_data, f, indent=4, ensure_ascii=False)

with open(image_out_path, "w", encoding="utf-8") as f:
    json.dump(image_out_data, f, indent=4, ensure_ascii=False)

Sentence:  A Second Night of Chicago Protests Over Police Shooting Protesters took to the streets of Chicago for the second straight night Wednesday, one day after the city's police department released video of a black teenager being shot to death by a white officer.
Caption:  Lamon Reccord, second from right, yells at a Chicago police officer "Shoot me 16 times" as he and others march through Chicago's Loop Wednesday, Nov. 25, 2015.
Similarity:  0.4918234348297119



Sentence:  The protesters rallied in downtown Chicago, following the release of video that showed officer Jason Van Dyke shooting 17-year-old Laquan McDonald 16 times in October 2014.
Caption:  Lamon Reccord, second from right, yells at a Chicago police officer "Shoot me 16 times" as he and others march through Chicago's Loop Wednesday, Nov. 25, 2015.
Similarity:  0.4033973217010498



Sentence:  Van Dyke was arrested Tuesday and charged with murder.
Caption:  Two Chicago police officers take a man into custody during a p