In [None]:
import sys
from pathlib import Path

# ノートブックのカレントディレクトリを確認して、project root を推定する
print("cwd:", Path.cwd())
proj_root = Path.cwd().parents[1]  # notebook/eda -> ../.. がリポジトリルート
print("project root guessed:", proj_root)

# 先頭に挿入して import の優先検索先にする
sys.path.insert(0, str(proj_root))

# 動作確認
import importlib

try:
    import src.paths as _p

    importlib.reload(_p)
    print("src is importable:", _p)
except Exception as e:
    print("import failed:", e)

cwd: /Users/toshiyukimannen/ghq/github.com/Rtosshy/visual_story_understanding/notebook/eda
project root guessed: /Users/toshiyukimannen/ghq/github.com/Rtosshy/visual_story_understanding
src is importable: <module 'src.paths' from '/Users/toshiyukimannen/ghq/github.com/Rtosshy/visual_story_understanding/src/paths.py'>


In [None]:
import json
import random
import re

import pandas as pd
from num2words import num2words
from src.paths import VIST_IMAGE_ROOT, VIST_JSON_ROOT

In [None]:
dii_test_json_path = VIST_JSON_ROOT / "dii" / "test.description-in-isolation.json"
sis_test_json_path = VIST_JSON_ROOT / "sis" / "test.story-in-sequence.json"
test_image_path = VIST_IMAGE_ROOT / "test"

In [3]:
def load_json(path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

In [4]:
dii_test = load_json(dii_test_json_path)
sis_test = load_json(sis_test_json_path)

In [5]:
print(dii_test.keys())
print(sis_test.keys())

dict_keys(['images', 'info', 'albums', 'type', 'annotations'])
dict_keys(['images', 'info', 'albums', 'type', 'annotations'])


In [None]:
def convert_numbers_to_words(text: str) -> str:
    # 連続する数字をまとめて変換
    return re.sub(r"\d+", lambda m: num2words(int(m.group()), lang="en"), text)

In [7]:
dii_test_annotations_df = pd.DataFrame(dii_test["annotations"])
dii_test_annotations_df = dii_test_annotations_df[0].apply(pd.Series)
dii_test_annotations_df.head(10)

Unnamed: 0,original_text,album_id,photo_flickr_id,photo_order_in_story,worker_id,text,tier
0,The sign is describing when the services will ...,44277,1741642,0,X8GRFNA3PAZ87NZ,the sign is describing when the services will ...,descriptions-in-isolation
1,Sitting there waiting on someone to come over ...,44277,1741640,1,PB3OFZ78X1DCMNG,sitting there waiting on someone to come over ...,descriptions-in-isolation
2,"a case full of books in a house, books appear ...",44277,1741632,2,HLW494OD75LWCLB,"a case full of books in a house , books appear...",descriptions-in-isolation
3,"A older man with a black hat, mustache and gla...",44277,1741622,3,W8PVUI94S55I9MR,"a older man with a black hat , mustache and gl...",descriptions-in-isolation
4,A man in a top hat has a magic trick on the fl...,44277,1741587,4,NHSTZ8KYF85P97G,a man in a top hat has a magic trick on the fl...,descriptions-in-isolation
5,outer brick wall with red sign indicating name...,44277,1741642,0,Z8EVTCFDLQ7XIN7,outer brick wall with red sign indicating name...,descriptions-in-isolation
6,A red table with many cards and memorabilia di...,44277,1741640,1,X9VMA8VJB1W1P2O,a red table with many cards and memorabilia di...,descriptions-in-isolation
7,"A box on the ground full of books,",44277,1741632,2,FBK532ILCABYOS0,"a box on the ground full of books ,",descriptions-in-isolation
8,man talking to someone wearing glasses and a hat,44277,1741622,3,3EDPCPIXOFVNSHK,man talking to someone wearing glasses and a hat,descriptions-in-isolation
9,The man is kneeling down looking at some objec...,44277,1741587,4,L3OOAQVTECPJXM0,the man is kneeling down looking at some objec...,descriptions-in-isolation


In [8]:
sis_test_annotations_df = pd.DataFrame(sis_test["annotations"])
sis_test_annotations_df = sis_test_annotations_df[0].apply(pd.Series)
sis_test_annotations_df.head(10)

Unnamed: 0,original_text,album_id,photo_flickr_id,setting,worker_id,story_id,tier,worker_arranged_photo_order,text,storylet_id
0,The local parish holds a craft show each year.,44277,1741642,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,0,the local parish holds a craft show each year .,227650
1,Lots of folks come out and set up tables to se...,44277,1741640,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,1,lots of folks come out and set up tables to se...,227651
2,Some of these crafts are very unique and take ...,44277,1741632,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,2,some of these crafts are very unique and take ...,227652
3,Folks of all ages come out to peruse the craft...,44277,1741622,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,3,folks of all ages come out to peruse the craft...,227653
4,Some of the crafters even dress up in unique c...,44277,1741587,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,4,some of the crafters even dress up in unique c...,227654
5,I was so excited to be heading to the crafts f...,44277,1741625,first-2-pick-and-tell,Y6ZNXABP31FUVQ8,45531,story-in-sequence,0,i was so excited to be heading to the crafts f...,227655
6,When I arrived I saw a great booth with a vari...,44277,1741640,first-2-pick-and-tell,Y6ZNXABP31FUVQ8,45531,story-in-sequence,1,when i arrived i saw a great booth with a vari...,227656
7,I stopped at chatted at my friend Beth's booth...,44277,1741639,first-2-pick-and-tell,Y6ZNXABP31FUVQ8,45531,story-in-sequence,2,i stopped at chatted at my friend [female] 's ...,227657
8,There were even booths set up for all of the k...,44277,1741633,first-2-pick-and-tell,Y6ZNXABP31FUVQ8,45531,story-in-sequence,3,there were even booths set up for all of the k...,227658
9,"I found some awesome crafts at the fair, I'm r...",44277,1741630,first-2-pick-and-tell,Y6ZNXABP31FUVQ8,45531,story-in-sequence,4,"i found some awesome crafts at the fair , i 'm...",227659


In [None]:
# DataFrame の text に含まれる数字を文字列に変換
sis_test_annotations_df["text"] = sis_test_annotations_df["text"].apply(
    convert_numbers_to_words
)
dii_test_annotations_df["text"] = dii_test_annotations_df["text"].apply(
    convert_numbers_to_words
)

In [None]:
grouped = (
    sis_test_annotations_df.sort_values(["story_id", "worker_arranged_photo_order"])
    .groupby("story_id")
    .apply(lambda df: {row["photo_flickr_id"]: row["text"] for _, row in df.iterrows()})
    .to_dict()
)
grouped

  .apply(lambda df: {row['photo_flickr_id']: row['text'] for _, row in df.iterrows()})


{'45530': {'1741642': 'the local parish holds a craft show each year .',
  '1741640': 'lots of folks come out and set up tables to sell their crafts .',
  '1741632': 'some of these crafts are very unique and take a lot of talent to make .',
  '1741622': 'folks of all ages come out to peruse the crafts for sale .',
  '1741587': 'some of the crafters even dress up in unique costumes as part of their selling act .'},
 '45531': {'1741625': 'i was so excited to be heading to the crafts fair .',
  '1741640': 'when i arrived i saw a great booth with a variety of great crafts .',
  '1741639': "i stopped at chatted at my friend [female] 's booth for a bit .",
  '1741633': 'there were even booths set up for all of the kids .',
  '1741630': "i found some awesome crafts at the fair , i 'm really happy that i went ."},
 '45532': {'1741642': 'the church is old , but it has a nice history .',
  '1741640': 'they display this history during the afternoon .',
  '1741632': 'some books even talk about loc

In [11]:
story_id = "45530"
story_df = sis_test_annotations_df[sis_test_annotations_df["story_id"] == story_id]
story_df

Unnamed: 0,original_text,album_id,photo_flickr_id,setting,worker_id,story_id,tier,worker_arranged_photo_order,text,storylet_id
0,The local parish holds a craft show each year.,44277,1741642,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,0,the local parish holds a craft show each year .,227650
1,Lots of folks come out and set up tables to se...,44277,1741640,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,1,lots of folks come out and set up tables to se...,227651
2,Some of these crafts are very unique and take ...,44277,1741632,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,2,some of these crafts are very unique and take ...,227652
3,Folks of all ages come out to peruse the craft...,44277,1741622,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,3,folks of all ages come out to peruse the craft...,227653
4,Some of the crafters even dress up in unique c...,44277,1741587,first-2-pick-and-tell,FJROI8NWDRIPAM1,45530,story-in-sequence,4,some of the crafters even dress up in unique c...,227654


In [12]:
answer_dict = {}
modified_dict = {}
drop_pos = 2  # 除外したいworker_arranged_photo_order

for story_id, group in sis_test_annotations_df.groupby("story_id"):
    group = group.sort_values("worker_arranged_photo_order")
    # 答え（order==drop_posのもの）
    answer_row = group[group["worker_arranged_photo_order"] == drop_pos]
    if not answer_row.empty:
        ans_photo_id = answer_row.iloc[0]["photo_flickr_id"]
        ans_text = answer_row.iloc[0]["text"]
        answer_dict[story_id] = {ans_photo_id: ans_text}
    # modified_dictにはdrop_posのものを入れない
    photo_text_dict = {
        row["photo_flickr_id"]: row["text"]
        for _, row in group.iterrows()
        if row["worker_arranged_photo_order"] != drop_pos
    }
    modified_dict[story_id] = photo_text_dict


In [13]:
# 答えの例
print("answer_dictの例:", list(answer_dict.items())[0])
# 空にしたデータの例
print("modified_dictの例:", list(modified_dict.items())[0])

answer_dictの例: ('45530', {'1741632': 'some of these crafts are very unique and take a lot of talent to make .'})
modified_dictの例: ('45530', {'1741642': 'the local parish holds a craft show each year .', '1741640': 'lots of folks come out and set up tables to sell their crafts .', '1741622': 'folks of all ages come out to peruse the crafts for sale .', '1741587': 'some of the crafters even dress up in unique costumes as part of their selling act .'})


In [14]:
answers = list(answer_dict.items())
questions = list(modified_dict.items())

In [15]:
print(answers[:2])
print(questions[:2])

[('45530', {'1741632': 'some of these crafts are very unique and take a lot of talent to make .'}), ('45531', {'1741639': "i stopped at chatted at my friend [female] 's booth for a bit ."})]
[('45530', {'1741642': 'the local parish holds a craft show each year .', '1741640': 'lots of folks come out and set up tables to sell their crafts .', '1741622': 'folks of all ages come out to peruse the crafts for sale .', '1741587': 'some of the crafters even dress up in unique costumes as part of their selling act .'}), ('45531', {'1741625': 'i was so excited to be heading to the crafts fair .', '1741640': 'when i arrived i saw a great booth with a variety of great crafts .', '1741633': 'there were even booths set up for all of the kids .', '1741630': "i found some awesome crafts at the fair , i 'm really happy that i went ."})]


In [16]:
print(len(answers))
print(len(questions))

5055
5055


In [None]:
options_dict = {}
answer_index_dict = {}

n_options = 4  # 選択肢数
seed = 42
random.seed(seed)  # 一度だけseedを設定

# 全storyのphoto_flickr_id-textペアを集める（重複除外）
all_photo_text = set()
for d in modified_dict.values():
    all_photo_text.update(d.items())
all_photo_text = list(all_photo_text)

for story_id in answer_dict.keys():
    correct_pair = list(answer_dict[story_id].items())[0]  # (photo_flickr_id, text)
    # 不正解候補（正解と同じphoto_flickr_id/textは除外）
    negatives = [
        item
        for item in all_photo_text
        if item[0] != correct_pair[0] and item[1] != correct_pair[1]
    ]
    sampled_neg = random.sample(negatives, min(n_options - 1, len(negatives)))
    # optionsリスト作成
    options = [correct_pair] + sampled_neg
    random.shuffle(options)
    # photo_flickr_id: text の辞書に変換
    options_dict[story_id] = {k: v for k, v in options}
    # 正解の位置をstory_id: indexで記録
    correct_index = [i for i, (k, _) in enumerate(options) if k == correct_pair[0]][0]
    answer_index_dict[story_id] = correct_index

# 例の出力
sample_story_id = list(options_dict.keys())[0]
print("options例:", options_dict[sample_story_id])
print("正解インデックス例:", answer_index_dict[sample_story_id])

options例: {'45185352': 'champagne , a limo . next stop , honeymoon .', '1183920': 'here i am getting ready .', '1741632': 'some of these crafts are very unique and take a lot of talent to make .', '1584818205': 'this is a prime example of how hard working ants really are .'}
正解インデックス例: 2


In [18]:
options = list(options_dict.items())
answer_indices = list(answer_index_dict.items())

In [19]:
print(len(options))
print(len(answer_indices))

5055
5055


In [20]:
print(options[:2])
print(answer_indices[:10])

[('45530', {'45185352': 'champagne , a limo . next stop , honeymoon .', '1183920': 'here i am getting ready .', '1741632': 'some of these crafts are very unique and take a lot of talent to make .', '1584818205': 'this is a prime example of how hard working ants really are .'}), ('45531', {'176018513': 'we got to sit really close to the players .', '4495759906': 'the makeup artist did their job .', '1741639': "i stopped at chatted at my friend [female] 's booth for a bit .", '5110133803': 'plain [female] , the barkeep , enjoyed the show , and would keep smiling , until they later burnt her at the stake , mistaking her for a witch .'})]
[('45530', 2), ('45531', 2), ('45532', 3), ('45533', 0), ('45534', 1), ('45535', 3), ('45536', 2), ('45537', 3), ('45538', 3), ('45539', 3)]


In [None]:
final_json = {}

for story_id in answer_dict.keys():
    final_json[story_id] = {
        "question": modified_dict[story_id],  # ポジション2が欠けた画像テキスト列の辞書
        "answer": answer_dict[story_id],  # 正解の画像IDとテキストの辞書
        "option": options_dict[story_id],  # 選択肢となる画像テキスト対の辞書
        "answer_idx": answer_index_dict[story_id],  # 正解の選択肢インデックス
        "drop_pos": drop_pos,  # 除外したポジション
    }

# 例の出力（最初の1件だけ表示）
sample_story_id = list(final_json.keys())[0]

print(
    json.dumps(
        {sample_story_id: final_json[sample_story_id]}, ensure_ascii=False, indent=2
    )
)

{
  "45530": {
    "question": {
      "1741642": "the local parish holds a craft show each year .",
      "1741640": "lots of folks come out and set up tables to sell their crafts .",
      "1741622": "folks of all ages come out to peruse the crafts for sale .",
      "1741587": "some of the crafters even dress up in unique costumes as part of their selling act ."
    },
    "answer": {
      "1741632": "some of these crafts are very unique and take a lot of talent to make ."
    },
    "option": {
      "45185352": "champagne , a limo . next stop , honeymoon .",
      "1183920": "here i am getting ready .",
      "1741632": "some of these crafts are very unique and take a lot of talent to make .",
      "1584818205": "this is a prime example of how hard working ants really are ."
    },
    "answer_idx": 2,
    "drop_pos": 2
  }
}


In [None]:
# JSONL で保存
output_path = "seq2opt.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for story_id, rec in final_json.items():
        record = {"story_id": story_id, **rec}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"保存しました: {output_path}")