In [3]:
import uuid
from utils import read_file, write_file
from loguru import logger
# Give every document that has no "uuid" a freshly generated one, then write
# the data back to the file it was read from.
data = read_file("../submit_data/raw_data_batch3.json")
for tpc in data:
  # Regular documents ("docs") are handled before the "dis_T" documents;
  # a topic that lacks either list is simply skipped for that list.
  for field in ("docs", "dis_T"):
    for doc in tpc.get(field, []):
      if "uuid" in doc:
        continue
      logger.warning(f"Document without uuid: {tpc['topic_id']}")
      doc["uuid"] = str(uuid.uuid4())
write_file(data, "../submit_data/raw_data_batch3.json")



In [4]:
from utils import read_jsonl, read_file, write_file
from loguru import logger
def pair_summary(raw_data_path: str, summary_data_path: str) -> None:
  """Attach summaries to the documents of a raw-data file and save it in place.

  Every document found under a topic's 'docs' and 'dis_T' lists whose 'uuid'
  appears in the summary file gets a 'summary' field. The raw-data file is
  then overwritten with the updated content and the number of paired
  documents is logged.

  Args:
    raw_data_path: Path of the raw-data file; read with `read_file` and
      overwritten with `write_file`.
    summary_data_path: Path of a JSONL file read with `read_jsonl`; each
      record must provide 'uuid' and 'answer' keys.
  """
  raw_data = read_file(raw_data_path)
  # Record shape: {'uuid': '...', 'answer': '...'} -- the summary text is
  # stored under 'answer', not 'summary'.
  summary_data = read_jsonl(summary_data_path)
  uuid2summary = {d['uuid']: d['answer'] for d in summary_data}
  cnt = 0
  for tpc in raw_data:
    # Topics may lack either list and documents may lack a uuid (the uuid
    # backfill step in this notebook tolerates both), so neither case should
    # abort the pairing.
    for field in ('docs', 'dis_T'):
      for doc in tpc.get(field, []):
        doc_uuid = doc.get('uuid')
        if doc_uuid in uuid2summary:
          doc['summary'] = uuid2summary[doc_uuid]
          cnt += 1
  write_file(raw_data, raw_data_path)
  logger.info(f"Pairing {cnt} summaries with raw data.")
# Use the same notebook-relative location as the uuid backfill cell instead of
# a user-specific absolute path, so the notebook runs on any checkout.
pair_summary(
  raw_data_path='../submit_data/raw_data_batch3.json',
  summary_data_path='../submit_data/smy_batch3.jsonl',
)


2025-07-14 15:03:08.432 | INFO     | __main__:pair_summary:18 - Pairing 611 summaries with raw data.


In [3]:
def pair_events(raw_data_path: str, events_data_path: str) -> None:
  """Attach extracted events to the documents of a raw-data file, in place.

  Every document found under a topic's 'docs' and 'dis_T' lists whose 'uuid'
  appears in the events file gets an 'events' field. The raw-data file is
  then overwritten with the updated content and the number of paired
  documents is logged.

  Args:
    raw_data_path: Path of the raw-data file; read with `read_file` and
      overwritten with `write_file`.
    events_data_path: Path of a JSONL file read with `read_jsonl`; each
      record must provide 'uuid' and 'answer' keys.
  """
  raw_data = read_file(raw_data_path)
  events_data = read_jsonl(events_data_path) # {'uuid': '...', 'answer': '...'}
  uuid2events = {d['uuid']: d['answer'] for d in events_data}
  cnt = 0
  for tpc in raw_data:
    # Topics may lack either list and documents may lack a uuid (the uuid
    # backfill step in this notebook tolerates both), so neither case should
    # abort the pairing.
    for field in ('docs', 'dis_T'):
      for doc in tpc.get(field, []):
        doc_uuid = doc.get('uuid')
        if doc_uuid in uuid2events:
          doc['events'] = uuid2events[doc_uuid]
          cnt += 1
  write_file(raw_data, raw_data_path)
  logger.info(f"Pairing {cnt} events with raw data.")
# Use the notebook-relative submit_data location (as the other cells do)
# instead of a user-specific absolute path.
pair_events(
  raw_data_path='../submit_data/raw_data.json',
  events_data_path='../submit_data/events.jsonl',
)
                       

2025-07-11 14:57:19.974 | INFO     | __main__:pair_events:16 - Pairing 56 events with raw data.


In [4]:
from utils import write_line
import uuid
def make_pairs_from_timeline(timline_path: str, raw_data_path: str, output_path: str) -> None:
  """Write one record per ordered pair of events in each topic timeline.

  For every timeline, each event is paired with every event that precedes it
  in the timeline list (event1 is the earlier list entry, event2 the later
  one). The context of an event is the list of 'summary' values of the
  topic's 'docs' whose 'position' is contained in the event's 'position'.
  Each pair gets a fresh uuid and is emitted through `write_line`.

  Args:
    timline_path: JSONL file read with `read_jsonl`; each record provides
      'topic_id', 'topic' and 'answer' (the timeline: a list of entries with
      'event', 'position' and 'event_order').
    raw_data_path: Raw-data file read with `read_file`; a list of topics,
      each with 'topic_id' and 'docs'.
    output_path: Destination passed to `write_line` for every pair.
  """
  raw_data = read_file(raw_data_path)
  data = read_jsonl(timline_path)

  for d in data:
    topic_id = d["topic_id"]
    topic = d["topic"]
    timeline = d["answer"]
    # Topics are matched by id rather than by list index.
    ori_data = next(
        (item for item in raw_data if item["topic_id"] == topic_id), None
    )
    if ori_data is None:
      # Without the topic's documents no context can be built; skip the
      # timeline instead of failing with a TypeError on None.
      logger.warning(f"Topic {topic_id} not found in raw data; skipping its timeline.")
      continue
    if len(timeline) < 2:
      # A single event (or none) yields no pairs.
      continue

    # Build each event's context once instead of once per pair.
    contexts = [
        [
            doc["summary"]
            for doc in ori_data["docs"]
            if doc["position"] in entry["position"]
        ]
        for entry in timeline
    ]

    for i in range(1, len(timeline)):
      later = timeline[i]
      for j in range(i):
        earlier = timeline[j]
        new_d = {
          "topic_id": topic_id,
          "topic": topic,
          "uuid": str(uuid.uuid4()),
          "event1": earlier["event"],
          # Copy so that emitted records never share a list object.
          "event1_context": list(contexts[j]),
          "event1_order": earlier["event_order"],
          "event2": later["event"],
          "event2_context": list(contexts[i]),
          "event2_order": later["event_order"],
        }

        write_line(new_d, output_path)
# Build the event-pair file from the timeline and the raw data.
make_pairs_from_timeline(
  timline_path='../submit_data/timeline.jsonl',
  raw_data_path='../submit_data/raw_data.json',
  output_path='../submit_data/event_pairs.jsonl',
)

In [10]:
from glob import glob
from utils import read_jsonl, write_jsonl
from collections import defaultdict
import statistics

def group_annotation_data(input_path: str, raw_data_path: str, var_threshold: float = 500) -> tuple[list[dict], list[dict]]:
    """Merge per-model annotation scores into the event pairs and split them.

    Annotation files are every `*.jsonl` directly under `input_path`. The
    model a file belongs to is taken from the first of 'gpt', 'gemini',
    'claude' found in its path; files matching none are ignored. Each
    annotation record provides 'uuid', 'score' and 'reasoning'.

    Every record of `raw_data_path` is enriched in place with
    '<model>_score', '<model>_reasoning', 'var' (sample variance of its
    scores, 0.0 when there is a single score) and 'avg' (mean score). A
    record goes to the first list when its variance is below `var_threshold`
    or all of its scores fall into the same interval ([0, 40), [40, 60),
    [60, 100]); otherwise it goes to the second list. Records without any
    usable score have 'var' and 'avg' set to None and go to the second list.

    Args:
        input_path: Directory containing the annotation JSONL files.
        raw_data_path: JSONL file of event pairs, each with a 'uuid'.
        var_threshold: Variance strictly below this counts as agreement.

    Returns:
        A `(first_class, second_class)` tuple of enriched records, i.e.
        (models agree, models disagree or no annotation available).
    """
    model_names = ('gpt', 'gemini', 'claude')
    scores_by_model = defaultdict(lambda: dict.fromkeys(model_names))
    reasoning_by_model = defaultdict(lambda: dict.fromkeys(model_names))
    all_scores = defaultdict(list)

    # glob() returns files in arbitrary order; sort so that repeated runs
    # resolve duplicate annotations identically.
    for file in sorted(glob(input_path + '/*.jsonl')):
        # First match wins, in the order gpt -> gemini -> claude.
        model_name = next((m for m in model_names if m in file), None)
        if model_name is None:
            continue
        for d in read_jsonl(file):
            pair_id = d['uuid']
            score = d['score']
            scores_by_model[pair_id][model_name] = score
            reasoning_by_model[pair_id][model_name] = d['reasoning']
            # A missing score cannot take part in mean/variance.
            if score is not None:
                all_scores[pair_id].append(score)

    pair_var = {}
    pair_avg = {}
    for pair_id, scores in all_scores.items():
        # Sample variance needs at least two data points.
        pair_var[pair_id] = statistics.variance(scores) if len(scores) > 1 else 0.0
        pair_avg[pair_id] = statistics.mean(scores)

    def get_interval(score: float) -> int:
        """Map a score to its bucket: 1 = [0, 40), 2 = [40, 60), 3 = [60, 100], -1 otherwise."""
        if 0 <= score < 40:
            return 1
        elif 40 <= score < 60:
            return 2
        elif 60 <= score <= 100:
            return 3
        else:
            return -1

    first_class = []
    second_class = []

    raw_data = read_jsonl(raw_data_path)
    for d in raw_data:
        pair_id = d['uuid']
        # .get() (not indexing) so that pairs absent from the annotation
        # files do not create defaultdict entries or raise.
        model_scores = scores_by_model.get(pair_id, {})
        model_reasoning = reasoning_by_model.get(pair_id, {})
        for m in model_names:
            d[f'{m}_score'] = model_scores.get(m)
            d[f'{m}_reasoning'] = model_reasoning.get(m)
        d['var'] = pair_var.get(pair_id)
        d['avg'] = pair_avg.get(pair_id)

        present_scores = [model_scores.get(m) for m in model_names]
        intervals = {get_interval(s) for s in present_scores if s is not None}

        # A pair with no scores has var None: it cannot count as agreement.
        low_variance = d['var'] is not None and d['var'] < var_threshold
        if low_variance or len(intervals) == 1:
            first_class.append(d)
        else:
            second_class.append(d)

    return first_class, second_class
# f: pairs the models agree on; s: pairs that still need verification.
f, s = group_annotation_data(
    input_path='../submit_data/annotation_data',
    raw_data_path='../submit_data/event_pairs.jsonl',
)
# Show one disagreeing pair, then the size of each group.
print(s[0])
(len(f), len(s))



(675, 167)

In [11]:
# Persist both groups: agreed pairs first, then the pairs to verify.
for records, destination in (
    (f, '../submit_data/to_be_mcq.jsonl'),
    (s, '../submit_data/to_be_verify.jsonl'),
):
    write_jsonl(records, destination)