In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (Colab-only: requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import pandas as pd
from collections import defaultdict

In [3]:
# Input locations on the mounted Drive: the test set (JSON Lines, one object
# per line) and the task-1 predictions CSV.
test_path = "/content/drive/MyDrive/Colab Notebooks/Combination model/test.jsonl"
pred_path = "/content/drive/MyDrive/Colab Notebooks/Combination model/task1_prediction.csv"

# Decode as UTF-8 explicitly instead of relying on the platform default (this
# notebook writes its JSONL output as UTF-8), and skip blank lines so an empty
# or trailing line does not make json.loads raise.
with open(test_path, "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]
pred_df = pd.read_csv(pred_path)

In [4]:
# Merge predicted labels into test data.
# Lookup table: prediction id -> predicted spoiler type.
id2tag = dict(zip(pred_df["id"], pred_df["spoilerType"]))

# Replace each sample's "tags" with a one-element list holding its predicted
# label. A sample falls back to "unknown" when its id has no row in the
# predictions, or when the prediction cell is empty (pandas reads that as NaN,
# which would otherwise leak into the tag and into later file names).
n_unknown = 0
for item in test_data:
    predicted = id2tag.get(item.get("id"))
    if predicted is None or pd.isna(predicted):
        predicted = "unknown"
        n_unknown += 1
    item["tags"] = [predicted]

# Surface the fallback instead of letting it pass silently.
if n_unknown:
    print(f"⚠️ {n_unknown} of {len(test_data)} samples have no usable prediction; tagged 'unknown'")

# Export the tagged samples as a CSV; utf-8-sig adds a BOM so spreadsheet
# tools detect the encoding.
df = pd.DataFrame(test_data)
csv_path = "/content/drive/MyDrive/Colab Notebooks/Combination model/test_with_tags.csv"
df.to_csv(csv_path, index=False, encoding="utf-8-sig")

In [5]:
# Bucket the samples by tag, then write every bucket to its own JSONL file.
tag_groups = defaultdict(list)
for item in test_data:
    # The first entry of "tags" is the grouping key, e.g. 'phrase', 'passage', 'multi'.
    tag_groups[item["tags"][0]].append(item)

base_path = "/content/drive/MyDrive/Colab Notebooks/Combination model/"

for tag in tag_groups:
    items = tag_groups[tag]
    save_path = f"{base_path}test_{tag}.jsonl"
    with open(save_path, "w", encoding="utf-8") as f:
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in items)
    print(f"✅ Saved {len(items)} samples to {save_path}")

✅ Saved 165 samples to /content/drive/MyDrive/Colab Notebooks/Combination model/test_phrase.jsonl
✅ Saved 192 samples to /content/drive/MyDrive/Colab Notebooks/Combination model/test_passage.jsonl
✅ Saved 43 samples to /content/drive/MyDrive/Colab Notebooks/Combination model/test_multi.jsonl
