In [2]:
import pandas as pd
import spacy
import sklearn_crfsuite
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF, metrics
from ast import literal_eval
from tqdm import tqdm
from collections import defaultdict


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/homebrew/Caskroom/miniconda/base/envs/finetune/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/homebrew/Caskroom/miniconda/base/envs/finetune/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/homebrew/Caskroom/miniconda/base/envs/finetune/lib/python3.12/site-packages/i

In [7]:
DATA_PATH = "../nbme-score-clinical-patient-notes/"
MODEL_DIR = "crf_models_unified" # Directory to save the unified model

os.makedirs(MODEL_DIR, exist_ok=True)

try:
    features_df = pd.read_csv(f"{DATA_PATH}features.csv")
    notes_df = pd.read_csv(f"{DATA_PATH}patient_notes.csv")
    train_df_raw = pd.read_csv(f"{DATA_PATH}train.csv")
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    print(f"Please ensure the data files are located in '{DATA_PATH}' relative to the script.")
    exit()

df = train_df_raw.merge(features_df, on=["feature_num", "case_num"], how="left")
df = df.merge(notes_df, on=["pn_num", "case_num"], how="left")

df = df[~df['location'].isna() & (df['location'] != '[]')]

print(f"Anzahl der verbleibenden Patient Notes (pn_num): {df['pn_num'].count()}")
df.head()

Anzahl der verbleibenden Patient Notes (pn_num): 9901


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [9]:
nlp = spacy.load("en_core_web_sm")
grouped = df.groupby("pn_num")
data = []

def split_locations(loc_str):
    if not isinstance(loc_str, str) or loc_str == "[]":
        return []
    locs = literal_eval(loc_str)  # z. B. ['85 99', '126 131;143 151']
    all_spans = []
    for loc in locs:
        sub_spans = loc.split(";")  # → ['126 131', '143 151']
        all_spans.extend(sub_spans)
    return all_spans

for pn_num, group in tqdm(grouped, desc="Processing notes"):
    text = group.iloc[0]["pn_history"]
    doc = nlp(text)
    tokens = [token.text for token in doc]
    tags = ["O"] * len(tokens)

    token_starts = [token.idx for token in doc]
    token_ends = [token.idx + len(token) for token in doc]

    for _, row in group.iterrows():
        spans = split_locations(row["location"])
        for span in spans:
            try:
                start, end = map(int, span.split())
            except:
                continue
            for i, (tok_start, tok_end) in enumerate(zip(token_starts, token_ends)):
                if tok_start >= start and tok_end <= end:
                    if tags[i] == "O":
                        tags[i] = f"B-{row['feature_num']}"
                    else:
                        tags[i] = f"I-{row['feature_num']}"

    data.append({"tokens": list(zip(tokens, tags)), "text": text})

Processing notes: 100%|██████████| 1000/1000 [00:26<00:00, 37.04it/s]


In [10]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        "bias": 1.0,
        "word.lower()": word.lower(),
        "word[-3:]": word[-3:],
        "word[-2:]": word[-2:],
        "word.isupper()": word.isupper(),
        "word.istitle()": word.istitle(),
        "word.isdigit()": word.isdigit(),
    }
    if i > 0:
        word1 = sent[i - 1][0]
        features.update({
            "-1:word.lower()": word1.lower(),
            "-1:word.istitle()": word1.istitle(),
            "-1:word.isupper()": word1.isupper(),
        })
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        features.update({
            "+1:word.lower()": word1.lower(),
            "+1:word.istitle()": word1.istitle(),
            "+1:word.isupper()": word1.isupper(),
        })
    else:
        features["EOS"] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

In [11]:
from sklearn.model_selection import train_test_split

# vorbereiten
X_all = [sent2features(d["tokens"]) for d in data]
y_all = [sent2labels(d["tokens"]) for d in data]
texts_all = [d["text"] for d in data]
tokens_all = [d["tokens"] for d in data]

# Split
X_train, X_test, y_train, y_test, texts_train, texts_test, tokens_train, tokens_test = train_test_split(
    X_all, y_all, texts_all, tokens_all, test_size=0.1, random_state=42
)

In [12]:
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)
crf.fit(X_train, y_train)
joblib.dump(crf, "crf_baseline_model.joblib")


loading training data to CRFsuite: 100%|██████████| 900/900 [00:00<00:00, 1125.79it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 103041
Seconds required: 0.416

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=21.69 loss=680274.08 active=102341 feature_norm=1.00
Iter 2   time=64.92 loss=642317.91 active=102309 feature_norm=12.20
Iter 3   time=10.84 loss=485821.50 active=98379 feature_norm=9.24
Iter 4   time=152.32 loss=351249.98 active=96937 feature_norm=4.41
Iter 5   time=43.60 loss=281986.82 active=99455 feature_norm=6.97
Iter 6   time=54.70 loss=208771.48 active=100851 feature_norm=7.21
Iter 7   time=97.64 loss=206979.88 active=102317 feature_norm=7.34
Iter 8   time=86.87 loss=206439.09 active=102840 feature_norm=7.36
Iter 9   time=86.95 loss=205770.54 active=102114 feature_norm=7

['crf_baseline_model.joblib']

In [13]:
y_pred = crf.predict(X_test)

print("F1 Score (weighted):", metrics.flat_f1_score(y_test, y_pred, average="weighted"))
print(metrics.flat_classification_report(y_test, y_pred))

F1 Score (weighted): 0.8680956495966081
              precision    recall  f1-score   support

         B-0       0.89      0.47      0.62        36
         B-1       0.96      1.00      0.98        48
        B-10       0.87      0.65      0.74        60
       B-100       0.00      0.00      0.00         2
       B-101       0.00      0.00      0.00         2
       B-102       1.00      0.16      0.27        19
       B-103       0.43      0.57      0.49        23
       B-104       0.43      0.50      0.46        12
       B-105       0.75      0.83      0.79        18
       B-106       1.00      0.50      0.67        26
       B-107       0.89      0.76      0.82        21
       B-108       0.00      0.00      0.00        15
       B-109       0.20      0.08      0.11        13
        B-11       0.55      0.65      0.60        26
       B-110       0.62      0.62      0.62        21
       B-111       0.90      0.85      0.88        33
       B-112       0.50      0.50      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
def extract_predicted_locations(labels, doc):
    spans = []
    start = None
    for i, tag in enumerate(labels):
        if tag.startswith("B-"):
            if start is not None:
                end = doc[i - 1].idx + len(doc[i - 1])
                spans.append(f"{start} {end}")
            start = doc[i].idx
        elif tag == "O" and start is not None:
            end = doc[i - 1].idx + len(doc[i - 1])
            spans.append(f"{start} {end}")
            start = None
    if start is not None:
        end = doc[-1].idx + len(doc[-1])
        spans.append(f"{start} {end}")
    return spans

def parse_true_spans(label_seq, doc):
    spans = []
    start = None
    for i, tag in enumerate(label_seq):
        if tag.startswith("B-"):
            start = doc[i].idx
        elif tag == "O" and start is not None:
            end = doc[i - 1].idx + len(doc[i - 1])
            spans.append(f"{start} {end}")
            start = None
    if start is not None:
        end = doc[-1].idx + len(doc[-1])
        spans.append(f"{start} {end}")
    return spans

def jaccard_score(gt_spans, pred_spans):
    gt_chars = set()
    pred_chars = set()
    for span in gt_spans:
        s, e = map(int, span.split())
        gt_chars.update(range(s, e))
    for span in pred_spans:
        s, e = map(int, span.split())
        pred_chars.update(range(s, e))
    if not gt_chars and not pred_chars:
        return 1.0
    if not gt_chars or not pred_chars:
        return 0.0
    return len(gt_chars & pred_chars) / len(gt_chars | pred_chars)

In [16]:
scores = []

for i in tqdm(range(len(X_test)), desc="Evaluating Jaccard"):
    text = texts_test[i]
    sent = tokens_test[i]
    doc = nlp(text)

    pred = y_pred[i]
    true = y_test[i]

    pred_spans = extract_predicted_locations(pred, doc)
    true_spans = parse_true_spans(true, doc)

    score = jaccard_score(true_spans, pred_spans)
    scores.append(score)

print(f"Durchschnittlicher Jaccard Score: {sum(scores)/len(scores):.4f}")

Evaluating Jaccard: 100%|██████████| 100/100 [00:02<00:00, 38.63it/s]

Durchschnittlicher Jaccard Score: 0.2979



