In [None]:
import glob, os
import pandas as pd
import pickle
import seaborn as sns
import re

In [None]:
# Long-format accumulator: one entry per individual surprisal value, stored as
# parallel column lists that are later turned into a DataFrame.
data = {
    "term_type": [],
    "term_group": [],
    "term": [],
    "surprisal": [],
    "surprisal_type": [],
}


def append_surprisals(data, directory, surprisal_type):
    """Append every surprisal stored under ``directory`` to the column lists in ``data``.

    Each file directly inside ``directory`` is unpickled into a pair
    ``(term_surprisals, term_types)``. ``term_surprisals`` is indexed by term and
    yields the surprisal values recorded for that term; ``term_types`` holds one
    label per term, at the position the term has when iterating
    ``term_surprisals``. The term group is the file name with ``".pickle"``
    removed.

    Args:
        data: dict of column lists ("term_type", "term_group", "term",
            "surprisal", "surprisal_type"); extended in place.
        directory: folder whose files are loaded.
        surprisal_type: label written to the "surprisal_type" column for every
            value read from this folder.

    Returns:
        None. ``data`` is mutated in place.
    """
    for path in glob.glob(directory + "/*"):
        term_group = os.path.basename(path).replace(".pickle", "")
        # SECURITY: pickle.load can execute arbitrary code while loading --
        # only point this at files from a trusted source.
        with open(path, "rb") as handle:
            term_surprisals, term_types = pickle.load(handle)

        for term_idx, term in enumerate(term_surprisals):
            values = list(term_surprisals[term])
            if not values:
                # A term without values contributes no rows.
                continue
            # Looked up once per term rather than once per value.
            term_type = term_types[term_idx]
            count = len(values)
            data["term_type"].extend([term_type] * count)
            data["term_group"].extend([term_group] * count)
            data["term"].extend([term] * count)
            data["surprisal"].extend(values)
            data["surprisal_type"].extend([surprisal_type] * count)


# The three sources share one file layout and differ only in folder and label.
for directory, surprisal_type in (
    ("semantic_surprisals", "semantic"),
    ("llm_surprisals", "llm"),
    ("ngram_surprisals", "ngram"),
):
    append_surprisals(data, directory, surprisal_type)

In [None]:
# SECURITY: pickle.load can execute arbitrary code while loading -- only use a
# trusted file here.
with open("selected_sentences.pickle", "rb") as f:
    sentences = pickle.load(f)

# Each entry is a (legal_terms, usual_terms) pair; both lists of a pair are
# searched together as one group.
term_lists = [
    (["兩造",], ["雙方"]),
    (["上揭", "上開", "前揭", "前開", "首揭"], ["前述", "上述"]),
    (["云云",], ["等陳述", "等語", "等等"]),
    (["可考", "可佐", "可按", "可稽", "可證", "足按", "足徵", "足稽", "足憑", "足證"], ["可以佐證", "可供證明", "可以證明", "足以佐證", "足以證明"]),
    (["迭",], ["接連", "多次"]),
    (["拘束",], ["限制",]),
    (["失所附麗",], ["失所依附",]),
    (["尚非無稽", "尚非無憑", "尚非無據", "尚非虛妄", "尚非臆造"], ["應可採信", "應屬事實", "並非全無依據", "並不是完全沒有依據"]),
    (["所載",], ["所記載",]),
    (["考諸", "徵諸", "觀諸", "稽之"], ["參考", "依照", "依據"]),
    (["質言之",], ["簡言之",]),
    (["相歧",], ["矛盾",]),
    (["即非法所不許", "依法即無不合",], ["符合法律規定",]),
    (["礙難採認",], ["難以認定", "難以採信", "不可採"]),
    (["矧",], ["況且",]),
    (["翻異"], ["推翻"]),
    (["乃",], ["於是",]),
    (["自白不諱", "供認不諱", "坦承不諱"], ["坦白承認"]),
    (["似無可採", "似屬無憑", "即無可採", "即屬無據", "尚無可採", "尚難憑採", "要非可信", "要屬虛言", "容非可採"], ["難以採信", "不可採信", "尚不足採信", "尚不足採證"]),
    (["顯有",], ["顯然有", "顯然屬於", "顯然是"])
]


# Every matched term, one entry per occurrence, in scan order. Duplicates are
# kept on purpose: the list is counted per term afterwards.
# NOTE(review): "難以採信" is listed in two groups, so each of its occurrences is
# appended once per group -- confirm that this double count is intended.
term_list = []
for legal_terms, usual_terms in term_lists:
    terms = legal_terms + usual_terms
    print("|".join(terms))

    # Alternation over the group's terms, tried in list order. Terms are escaped
    # so they always match literally, and the pattern is compiled once per group.
    term_pattern = re.compile("|".join(re.escape(term) for term in terms))

    # NOTE(review): the first sentence is never searched. This presumably
    # mirrors a pipeline that needs the preceding sentence as context -- confirm
    # before changing the slice.
    for sentence in sentences[1:]:
        # The pattern has no capture groups, so findall yields the matched text.
        term_list.extend(term_pattern.findall(sentence))

In [None]:
from sklearn.preprocessing import StandardScaler

# Build the long-format frame once; every per-type frame below is sliced from it.
surprisals_all = pd.DataFrame(data)


def subset_by_surprisal_type(frame, surprisal_type):
    """Return the rows of ``frame`` whose "surprisal_type" equals ``surprisal_type``.

    The result is a new frame with a fresh 0..n-1 index; ``frame`` itself is
    left untouched.
    """
    return frame[frame["surprisal_type"] == surprisal_type].reset_index(drop=True)


df_semantic = subset_by_surprisal_type(surprisals_all, "semantic")
df_llm = subset_by_surprisal_type(surprisals_all, "llm")
df_ngram = subset_by_surprisal_type(surprisals_all, "ngram")

# `df` is the semantic subset as well. It is kept as an independent copy so the
# name stays defined and editing it cannot alter `df_semantic`.
df = df_semantic.copy()

In [None]:
# Occurrences per matched term, computed once up front instead of re-scanning
# `term_list` for every row.
term_counts = pd.Series(term_list, dtype="object").value_counts()

# Stack the three per-type frames, turn +/-inf into NaN, and drop every row that
# then contains a missing value. The index is not reset after the drop.
df_combined = (
    pd.concat([df_semantic, df_llm, df_ngram], ignore_index=True)
    .replace([float("inf"), float("-inf")], float("nan"))
    .dropna()
    .assign(
        # Terms that never matched are absent from `term_counts`; they get 0.
        frequency=lambda frame: frame["term"].map(term_counts).fillna(0).astype("int64"),
        # Collapse every label other than "usual" into "legal".
        term_type=lambda frame: frame["term_type"].where(frame["term_type"] == "usual", "legal"),
    )
)
df_combined.to_csv("results.csv", index=False)