In [3]:
import pandas as pd
import re
import ast

In [5]:
# Folder holding the input CSV files, relative to the notebook's working directory.
DATA_PATH = "../nbme-score-clinical-patient-notes/"

features_df = pd.read_csv(f"{DATA_PATH}features.csv")
notes_df = pd.read_csv(f"{DATA_PATH}patient_notes.csv")
train_df_raw = pd.read_csv(f"{DATA_PATH}train.csv")

# Attach the feature columns and then the patient-note columns to every
# training row. Left joins keep all rows of train.csv even when a key has
# no counterpart in the other table.
df = (
    train_df_raw
    .merge(features_df, how='left', on=['case_num', 'feature_num'])
    .merge(notes_df, how='left', on=['case_num', 'pn_num'])
)

In [26]:
def simple_token_match(row):
    """Locate every occurrence of each feature-text token inside the note.

    The feature text is lower-cased and split into ``\\w+`` tokens; each
    distinct token is then searched for, case-insensitively, in the
    lower-cased note text.

    Parameters
    ----------
    row : mapping
        Must provide ``"feature_text"`` and ``"pn_history"`` (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    list of str
        One ``"start end"`` string per match (end exclusive), grouped by
        token in first-appearance order and ordered by position within each
        token. Empty when either text is missing or nothing matches.

    Notes
    -----
    Tokens are matched as plain substrings, so a short token such as
    ``"no"`` also hits inside longer words such as ``"know"``.
    """
    feature_text = row["feature_text"]
    note_text = row["pn_history"]

    # The left merges can leave a text column as NaN (a float); there is
    # nothing to match in that case, so return no spans instead of crashing
    # on `.lower()`.
    if not isinstance(feature_text, str) or not isinstance(note_text, str):
        return []

    pn_lower = note_text.lower()

    # dict.fromkeys drops repeated tokens while keeping first-seen order, so
    # a word that appears twice in the feature text does not emit every one
    # of its spans twice.
    tokens = dict.fromkeys(re.findall(r'\w+', feature_text.lower()))

    matched_spans = []
    for token in tokens:
        for match in re.finditer(re.escape(token), pn_lower):
            matched_spans.append(f"{match.start()} {match.end()}")

    return matched_spans

In [27]:
def phrase_match(row):
    """Find the whole feature text, as one literal phrase, inside the note.

    Both texts are lower-cased before searching, and the phrase is escaped
    so that regex metacharacters in it are matched literally.

    Parameters
    ----------
    row : mapping
        Must provide ``"feature_text"`` and ``"pn_history"``.

    Returns
    -------
    list of str
        One ``"start end"`` string (end exclusive) per non-overlapping
        occurrence, in order of position; empty when the phrase is absent.
    """
    needle = re.compile(re.escape(row["feature_text"].lower()))
    haystack = row["pn_history"].lower()
    return ["{} {}".format(*hit.span()) for hit in needle.finditer(haystack)]

In [31]:
def parse_spans(span_list):
    """Expand a list of span strings into the set of indices they cover.

    Parameters
    ----------
    span_list : list of str, or str
        Either a list whose items look like ``"126 131"`` or
        ``"126 131;143 151"`` (``;`` separates several ``start end`` pairs),
        or the string form of such a list (e.g. ``"['126 131']"``), which is
        decoded with ``ast.literal_eval``.

    Returns
    -------
    set of int
        Every index ``i`` with ``start <= i < end`` for each pair. Input that
        cannot be decoded, is not a list, or contains malformed items
        contributes nothing; the function never raises for bad data.
    """
    if isinstance(span_list, str):
        try:
            span_list = ast.literal_eval(span_list)
        # Only the errors literal_eval raises for malformed input; a bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return set()

    if not isinstance(span_list, list):
        return set()

    result = set()
    for span in span_list:
        # Skip non-string items (e.g. None / NaN) the same way malformed
        # pairs are skipped, instead of failing on `.split`.
        if not isinstance(span, str):
            continue
        # Example: "126 131;143 151"
        for part in span.split(";"):
            try:
                start, end = map(int, part.strip().split())
            except ValueError:
                # Wrong number of fields or a non-integer field.
                continue
            result.update(range(start, end))
    return result

In [32]:
def jaccard_score(pred_spans, true_spans):
    """Jaccard overlap between predicted and reference spans.

    Both arguments are converted to index sets with ``parse_spans`` and the
    score is ``|pred ∩ true| / |pred ∪ true|``.

    Parameters
    ----------
    pred_spans, true_spans
        Anything ``parse_spans`` accepts.

    Returns
    -------
    float
        A value in ``[0, 1]``. When both sides parse to an empty set the
        score is defined as ``1.0``, i.e. "nothing predicted, nothing
        expected" counts as a perfect match.
    """
    pred_tokens = parse_spans(pred_spans)
    true_tokens = parse_spans(true_spans)

    if not pred_tokens and not true_tokens:
        return 1.0

    # At least one set is non-empty past the guard above, so the union is
    # never empty and no separate zero-division check is needed.
    return len(pred_tokens & true_tokens) / len(pred_tokens | true_tokens)

In [33]:
# Run both baselines over every row; each cell holds a list of "start end" strings.
df["location_pred_token"] = df.apply(simple_token_match, axis=1)
df["location_pred_phrase"] = df.apply(phrase_match, axis=1)

# Score each prediction column against `location`, row by row. Each two-column
# slice is unpacked as (predicted spans, reference spans).
df["jaccard_token"] = df[["location_pred_token", "location"]].apply(
    lambda pair: jaccard_score(*pair), axis=1
)
df["jaccard_phrase"] = df[["location_pred_phrase", "location"]].apply(
    lambda pair: jaccard_score(*pair), axis=1
)

# Report the mean score of each baseline.
print(f"📊 Mean Jaccard Score (Soft Token Matching):   {df['jaccard_token'].mean():.4f}")
print(f"📊 Mean Jaccard Score (Phrase Matching):       {df['jaccard_phrase'].mean():.4f}")

📊 Mean Jaccard Score (Soft Token Matching):   0.2945
📊 Mean Jaccard Score (Phrase Matching):       0.3572
