In [1]:
import json
import re

def preprocess_line(line: str) -> str:
    """Convert one tab-separated row into a single English-like sentence.

    The line is stripped of surrounding whitespace and split on tabs. A row
    is accepted only when that yields exactly four fields, which are read in
    order as subject, relation, object and date.

    Clean-up applied before formatting:
      * subject / object: underscores become spaces;
      * relation: underscores become spaces, every parenthesised segment
        (shortest match) is removed, the result is stripped and lower-cased.

    Args:
        line: One raw line of input text, with or without a trailing newline.

    Returns:
        ``"<subject> <relation> with <object> on <date>."`` with runs of
        spaces collapsed to a single space, or ``""`` when the row does not
        have exactly four fields.
    """
    fields = line.strip().split("\t")

    if len(fields) == 4:
        subj, relation, obj, date = (
            fields[0].replace("_", " "),
            # Underscores first, then drop "(...)" segments and trim the edges.
            re.sub(r"\(.*?\)", "", fields[1].replace("_", " ")).strip(),
            fields[2].replace("_", " "),
            fields[3],
        )
        sentence = f"{subj} {relation.lower()} with {obj} on {date}."
        # Removing a parenthesised segment can leave doubled spaces behind;
        # squeeze every run of spaces down to one.
        return re.sub(" +", " ", sentence)

    # Malformed row: signal "discard" to the caller with an empty string.
    return ""


In [None]:
# Tab-separated source rows and the JSON file the converted records go to.
input_file = "/home/share/data/dataset/MultiTQ/train.txt"
output_file = "/home/share/data/dataset/MultiTQ/train.json"

# Run every input line through preprocess_line and wrap each non-empty
# sentence in a record; an empty string marks a row that was rejected.
with open(input_file, "r", encoding="utf-8") as f:
    results = [
        {
            "title": "Temporal Event of Political, Diplomatic, and Social Interactions",
            "context": processed
        }
        for processed in map(preprocess_line, f)
        if processed
    ]

# Write all records as one indented JSON array; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# Report how many records were written and where.
print(f"保存完成，共 {len(results)} 条数据 -> {output_file}")