In [None]:
import glob, os
import pandas as pd
import pickle
import seaborn as sns
import re, tqdm

In [None]:
# Directory holding the pickles of each surprisal estimator -> label stored in
# the "surprisal_type" column.
SURPRISAL_SOURCES = (
    ("semantic_surprisals", "semantic"),
    ("llm_surprisals", "llm"),
    ("ngram_surprisals", "ngram"),
)


def load_surprisal_rows(directory, surprisal_type, rows):
    """Append one row per surprisal value stored under ``directory`` to ``rows``.

    Every file in ``directory`` is unpickled into a ``(term_surprisals,
    term_types)`` pair. ``term_surprisals`` maps a term to an iterable of
    surprisal values and ``term_types`` is indexed by the position of the term
    in ``term_surprisals``' iteration order.

    Args:
        directory: Folder whose files are all loaded (pattern ``directory/*``).
        surprisal_type: Label written to the ``"surprisal_type"`` column.
        rows: Dict of parallel lists with the keys ``"term_type"``,
            ``"term_group"``, ``"term"``, ``"surprisal"`` and
            ``"surprisal_type"``; it is extended in place.

    Returns:
        The same ``rows`` dict, for convenience.
    """
    # glob returns paths in filesystem order; sorting keeps the row order (and
    # therefore the exported CSV) reproducible across machines.
    for path in sorted(glob.glob(f"{directory}/*")):
        term_group = os.path.basename(path).replace(".pickle", "")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # this at files produced by this project's own pipeline.
        with open(path, "rb") as file:
            term_surprisals, term_types = pickle.load(file)

        for term_idx, term in enumerate(term_surprisals):
            surprisals = list(term_surprisals[term])
            if not surprisals:
                # No observations: skip before touching term_types, exactly as
                # the row-by-row loop never looked up a type for such terms.
                continue
            n_rows = len(surprisals)
            rows["term_type"].extend([term_types[term_idx]] * n_rows)
            rows["term_group"].extend([term_group] * n_rows)
            rows["term"].extend([term] * n_rows)
            rows["surprisal"].extend(surprisals)
            rows["surprisal_type"].extend([surprisal_type] * n_rows)
    return rows


# Long-format accumulator; re-created on every run so the cell is idempotent.
data = {
    "term_type": [],
    "term_group": [],
    "term": [],
    "surprisal": [],
    "surprisal_type": [],
}
for surprisal_dir, surprisal_label in SURPRISAL_SOURCES:
    load_surprisal_rows(surprisal_dir, surprisal_label, data)

In [None]:
# Long-format table built from the parallel lists in `data`:
# one row per collected surprisal value.
df = pd.DataFrame.from_dict(data)

In [None]:
import glob, re

def split_sentences(content):
    """Split raw article text into lower-cased, non-empty sentence fragments.

    Lines that are blank or that start with ``"==== "`` are skipped. Every
    other line is stripped, has its tabs turned into spaces and is split on a
    terminator character followed by whitespace.

    NOTE(review): besides ``. ! ? ;`` the character class also contains a
    space and both braces, so a run of two or more whitespace characters (and
    ``{`` / ``}`` followed by whitespace) splits a line as well. The pattern is
    kept unchanged; confirm that this is intended.

    Args:
        content: Full text of one article file.

    Returns:
        List of lower-cased fragments in document order, empty ones removed.
    """
    fragments = []
    for line in content.split("\n"):
        # The marker test runs on the unstripped line, so an indented
        # "==== ..." line is NOT treated as a marker.
        if not line.strip() or line.startswith("==== "):
            continue
        fragments += re.split(r"[{. }!?;]\s+", line.strip().replace("\t", " "))
    return [fragment.lower() for fragment in fragments if fragment]


sentences = []
# sorted(): glob yields paths in filesystem order, which would make the
# sentence order (and which sentence comes first) differ between machines.
for path in sorted(glob.glob("PMC000xxxxxx/*.txt")):
    with open(path, "r", encoding="latin-1") as file:
        sentences.extend(split_sentences(file.read()))

# One entry per concept: [first_side_groups, second_side_groups].
# Each side is a list of groups, and each group is a list of surface forms
# (for example singular and plural) of one term.
term_lists = [
    [[["bulla", "bullae"]], [["blister", "blisters"]]],
    [[["candida albicans", "candidiasis"]], [["thrush"]]],
    [[["carbohydrate", "carbohydrates"]], [["carb", "carbs"]]],
    [[["chemotherapy", "chemotherapies"]], [["chemo", "chemoes"]]],
    [[["chronic pain", "chronic pains"]], [["persistent pain", "persistent pains"]]],
    [[["comedo", "comedos"]], [["whitehead", "whiteheads"]]],
    [[["dermis"], ["epidermis"]], [["skin"]]],
    [[["dyspepsia"]], [["indigestion"]]],
    [[["erythrocyte", "erythrocytes"]], [["red blood cell", "red blood cells"]]],
    [[["febrile"]], [["feverish"]]],
    [[["haemorrhage"]], [["heavy bleeding"]]],
    [[["herpes zoster"]], [["chickenpox"]]],
    [[["hypertension"]], [["high blood pressure", "high blood pressures"]]],
    [[["hypotension"]], [["low blood pressure", "low blood pressures"]]],
    [[["influenza"]], [["flu"]]],
    [[["inhaler"]], [["puffer"]]],
    [[["intestine"]], [["guts"]]],
    [[["lethargy"]], [["tiredness"]]],
    [[["leukocyte", "leukocytes"]], [["white blood cell", "white blood cells"]]],
    [[["myocardial infarction"]], [["heart attack"]]],
    [[["pneumonia"]], [["lung infection"]]],
    [[["renal failure"]], [["kidney failure"]]],
    [[["thrombocytopenia"]], [["low platelet count", "low platelet counts"]]],
    [[["liposuction"]], [["lipo"]]],
    [[["melanoma"]], [["skin cancer"]]],
]


# Count corpus mentions of every term group. `term_list` receives the group's
# first surface form once per regex match, so `term_list.count(label)` is the
# number of mentions of that group.
#
# The scan deliberately starts at the second sentence: the earlier version
# paired each sentence with its predecessor and only searched the later one,
# so the very first sentence was never searched. That is preserved here.
# NOTE(review): presumably this mirrors a pipeline that needs a preceding
# context sentence; confirm.
searched_sentences = sentences[1:]  # sliced once instead of twice per pattern

term_list = []
for first_side_groups, second_side_groups in tqdm.tqdm(term_lists):
    for term in first_side_groups + second_side_groups:
        if not isinstance(term, list):
            raise ValueError("Term must be a list")

        # Surface forms are interpolated as-is, so they are treated as regex
        # fragments; \b anchors keep e.g. "flu" from matching inside "fluid".
        pattern = re.compile(fr'\b(?:{"|".join(term)})\b')

        # The pattern has no capturing group, so findall returns exactly one
        # item per non-overlapping match.
        n_mentions = sum(len(pattern.findall(sentence)) for sentence in searched_sentences)
        term_list.extend([term[0]] * n_mentions)

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the long-format rows into one frame per surprisal estimator.
# Each frame is re-indexed from 0 so later row-wise work sees a clean index.
surprisals_long = pd.DataFrame(data)
df_semantic, df_llm, df_ngram = (
    surprisals_long[surprisals_long["surprisal_type"] == estimator].reset_index(drop=True)
    for estimator in ("semantic", "llm", "ngram")
)

In [None]:
def standardize_surprisal_by_group(frame):
    """Z-score ``frame["surprisal"]`` in place, separately for each ``term_group``.

    One ``StandardScaler`` is fitted per term group and applied to that group's
    rows in a single vectorised call, instead of one ``transform`` call per row.

    Infinite surprisals are turned into NaN before fitting: ``StandardScaler``
    raises on infinities but ignores NaN while fitting and passes it through
    when transforming, so such rows come out as NaN rather than aborting.

    Args:
        frame: DataFrame with ``"term_group"`` and ``"surprisal"`` columns; its
            ``"surprisal"`` column is overwritten with the standardised values.

    Returns:
        Dict mapping each term group to the scaler fitted on its rows.
    """
    values = (
        frame["surprisal"]
        .astype(float)
        .replace([float("inf"), float("-inf")], float("nan"))
        .to_numpy(copy=True)
    )
    scalers = {}
    # .indices maps each group to the integer positions of its rows, so this
    # works regardless of the frame's index labels.
    for term_group, positions in frame.groupby("term_group").indices.items():
        scaler = StandardScaler()
        values[positions] = scaler.fit_transform(values[positions].reshape(-1, 1)).ravel()
        scalers[term_group] = scaler
    frame["surprisal"] = values
    return scalers


scalers_semantic = standardize_surprisal_by_group(df_semantic)
scalers_llm = standardize_surprisal_by_group(df_llm)
scalers_ngram = standardize_surprisal_by_group(df_ngram)

In [None]:
# `term_list` holds one label per corpus mention, so value_counts() gives the
# mention count of every label in a single pass. Calling term_list.count(x)
# for each row would rescan the whole list once per row.
term_frequency = pd.Series(term_list, dtype="object").value_counts()

df_combined = (
    pd.concat([df_semantic, df_llm, df_ngram], ignore_index=True)
    # Treat infinities as missing, then drop every row that contains a NaN.
    .replace([float("inf"), float("-inf")], float("nan"))
    .dropna()
    .assign(
        # Terms that never occur in term_list get a frequency of 0.
        frequency=lambda d: d["term"].map(term_frequency).fillna(0).astype(int),
        # Collapse every term type other than "usual" into "bio".
        term_type=lambda d: d["term_type"].where(d["term_type"] == "usual", "bio"),
    )
)
df_combined.to_csv("results.csv", index=False)