# Difference analysis between AI-generated and final letter

Analysis to see how many changes were made to the AI-draft discharge letter in comparison to the final discharge letter sent to the next treating physician.

In [None]:
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
from nltk.util import ngrams
from rich import print as rprint
from tqdm.notebook import tqdm

# Enable tqdm progress bars on pandas objects (`progress_apply`).
tqdm.pandas()
# Read variables from a .env file into the process environment.
load_dotenv()
# Database credentials taken from the environment (None when unset).
DB_USER = os.environ.get("DB_USER")
DB_PASSWD = os.environ.get("DB_PASSWORD")


In [None]:
# Load the application database exports and the pilot data files.
data_folder = Path(
    "/mapr/administratielast/administratielast_datamanager/ontslagdocumentatie/"
).joinpath("Pilot Evaluatie IC NICU")

# Date prefix shared by the application database export files.
date = "2025-03-05"
request = pd.read_csv(data_folder / f"{date}-apirequest.csv")
encounter = pd.read_csv(data_folder / f"{date}-apiencounter.csv")
generateddoc = pd.read_csv(data_folder / f"{date}-apigenerateddoc.csv")
feedback = pd.read_csv(data_folder / f"{date}-apifeedback.csv")
data = pd.read_parquet(data_folder / "data_export_pilot.parquet")
final_discharge = pd.read_csv(data_folder / "pilot_final_discharge_letters.csv")

# Preview the first rows of every loaded table.
for loaded_table in (
    request,
    encounter,
    generateddoc,
    feedback,
    data,
    final_discharge,
):
    display(loaded_table.head())

In [None]:
# Bounds of the pilot window as ISO-formatted date strings. The cells below
# compare them against the `generation_date` and `timestamp` columns.
# NOTE(review): if those columns are strings that include a time of day, the
# `<= PILOT_END_DATE` comparison excludes rows stamped later on 2024-12-10
# itself — confirm whether the end date is meant to be inclusive.
PILOT_START_DATE = "2024-10-15"
PILOT_END_DATE = "2024-12-10"

In [None]:
# Numbers surrounding the pilot

# Select the documents generated inside the pilot window once and reuse that
# selection for every statistic below.
pilot_generated_docs = generateddoc[
    (generateddoc["generation_date"] >= PILOT_START_DATE)
    & (generateddoc["generation_date"] <= PILOT_END_DATE)
]

num_encounters = pilot_generated_docs["encounter_id"].nunique()
print(f"Number of encounters: {num_encounters}")

num_generated_docs = pilot_generated_docs.shape[0]
print(f"Number of generated documents: {num_generated_docs}")

num_successful_generated_docs = pilot_generated_docs[
    pilot_generated_docs["success"] == "Success"
].shape[0]
print(f"Number of successful generated documents: {num_successful_generated_docs}")

# Every value of `success` other than "Success" is treated as a failure reason.
failure_reasons = pilot_generated_docs.loc[
    pilot_generated_docs["success"] != "Success", "success"
].value_counts()
print(f"Reasons for not successful generation: {failure_reasons}")

# `.get` keeps this cell working when no generation failed with a LengthError.
num_length_errors = failure_reasons.get("LengthError", 0)
perc_enc_too_long = (
    num_length_errors / num_generated_docs if num_generated_docs else float("nan")
)
print(
    "Percentage of letters that were not generated because file was too long: "
    f"{perc_enc_too_long * 100} (most at NICU)"
)

length_error_encounters = pilot_generated_docs.loc[
    pilot_generated_docs["success"] == "LengthError", "encounter_id"
]
print(f"Number of encounters with too long files: {length_error_encounters.nunique()}")

# Share of encounters with at least one LengthError generation. The original
# printed `num_encounters * 100` here, which is a count and not a percentage.
perc_encounters_too_long = (
    length_error_encounters.nunique() / num_encounters
    if num_encounters
    else float("nan")
)
print(
    "Percentage of encounters with patient file ending up too long: "
    f"{perc_encounters_too_long * 100}"
)

is_retrieve_request = request["endpoint"] == "/retrieve_discharge_doc"

# Split logging numbers of the form "<digits>_<digits>" of retrieve requests
# into two nullable-integer columns. Rows of other endpoints, and rows whose
# logging number does not match, end up as <NA> in both columns.
request[["encounter_id", "retrieved_generated_doc_id"]] = (
    request.loc[is_retrieve_request, "logging_number"]
    .str.extract(r"^(\d+)_([\d]+)$")
    .astype("Int64")
)

in_pilot_window = (request["timestamp"] >= PILOT_START_DATE) & (
    request["timestamp"] <= PILOT_END_DATE
)
# Count only requests to the retrieve endpoint; the original counted the
# requests of every endpoint under the label "retrieve requests".
pilot_retrieve_requests = request.loc[in_pilot_window & is_retrieve_request]

num_retrieve_requests = pilot_retrieve_requests.shape[0]
print(f"Number of retrieve requests: {num_retrieve_requests}")

# `count` and `nunique` skip <NA>, so only matching logging numbers are counted.
num_retrieved_docs = pilot_retrieve_requests["retrieved_generated_doc_id"].count()
num_unique_retrieved_docs = pilot_retrieve_requests[
    "retrieved_generated_doc_id"
].nunique()
num_encounters_with_retrieved_docs = pilot_retrieve_requests["encounter_id"].nunique()

print(f"Number of times document successfully retrieved: {num_retrieved_docs}")
print(f"Number of unique documents successfully retrieved: {num_unique_retrieved_docs}")
print(
    "number of encounters with retrieved documents: "
    f"{num_encounters_with_retrieved_docs}"
)

## Combine generated docs with Metavision docs

In [None]:
# Most recent Metavision discharge letter ("Ontslagbrief") per encounter.
metavision_discharge_docs = (
    data.loc[data["description"] == "Ontslagbrief"]
    .sort_values("date", ascending=True)
    .drop_duplicates(subset="enc_id", keep="last")
)
# Metavision letters use \r\n for newlines or double newlines and sometimes add ...
# at the end of a sentence.
# regex=False is required for correctness: pandas < 2.0 treats the pattern as a
# regular expression by default, in which case "..." matches ANY three
# characters and the replacement would wipe the letter.
metavision_discharge_docs["content"] = (
    metavision_discharge_docs["content"]
    .str.replace("\r", "", regex=False)
    .str.replace("\n\n\n", "\n\n", regex=False)
    .str.replace("...", "", regex=False)
)

# Last document generated per encounter within the pilot window, joined with
# the encounter table.
generated_discharge_docs = (
    generateddoc.loc[
        (generateddoc["generation_date"] >= PILOT_START_DATE)
        & (generateddoc["generation_date"] <= PILOT_END_DATE)
    ]
    .sort_values("generation_date", ascending=True)
    .drop_duplicates(subset="encounter_id", keep="last")
    .merge(encounter, left_on="encounter_id", right_on="id")
)

# Pair every Metavision letter with the generated document of the same encounter.
merged_discharge_docs = metavision_discharge_docs.merge(
    generated_discharge_docs, left_on="enc_id", right_on="encounter_hix_id"
)
merged_discharge_docs

In [None]:
# Keep only the discharge letters whose generated document was requested
# through the retrieve endpoint during the pilot.
is_pilot_retrieval = (
    (request["endpoint"] == "/retrieve_discharge_doc")
    & (request["logging_number"] != "0")
    & (request["timestamp"] >= PILOT_START_DATE)
    & (request["timestamp"] <= PILOT_END_DATE)
)
retrieve_requests = request.loc[is_pilot_retrieval].copy()

# The digits before the underscore of the logging number are matched against
# `encounter["id"]` below.
retrieve_requests["enc_id"] = retrieve_requests["logging_number"].str.extract(
    r"(\d+)_\d+"
)
retrieve_requests = retrieve_requests[retrieve_requests["enc_id"].notnull()]
retrieve_requests["enc_id"] = retrieve_requests["enc_id"].astype(int)

requests_with_encounter = retrieve_requests.merge(
    encounter, left_on="enc_id", right_on="id"
)
retrieved_encs = requests_with_encounter["encounter_hix_id"].unique()

was_retrieved = merged_discharge_docs["enc_id"].isin(retrieved_encs)
retrieved_discharge_docs = merged_discharge_docs[was_retrieved]

percentage_retrieved = (
    retrieved_discharge_docs.shape[0] / merged_discharge_docs.shape[0]
)
print(
    f"Percentage of discharge letters that were retrieved: {percentage_retrieved:.2%}"
)

merged_discharge_docs = retrieved_discharge_docs.copy()
merged_discharge_docs

## Check differences between generated and Metavision docs

In [None]:
def longest_common_substring(generated_letter: str, original_letter: str) -> str:
    """Find the longest common substring of two strings, ignoring case.

    Uses dynamic programming over the lengths of common suffixes. Only the
    previous and the current row of the table are kept, so memory grows with
    the length of ``original_letter`` instead of with the product of both
    lengths.

    Parameters
    ----------
    generated_letter : str
        First text; the returned substring is sliced from its lowercased form.
    original_letter : str
        Second text.

    Returns
    -------
    str
        The lowercased longest common substring, or an empty string when the
        texts share no character. Among equally long candidates the one that
        ends first in ``generated_letter`` is returned.
    """
    generated_letter = generated_letter.lower()
    original_letter = original_letter.lower()
    n = len(original_letter)

    # previous_row[j] is the length of the common suffix of
    # generated_letter[: i - 1] and original_letter[:j].
    previous_row = [0] * (n + 1)
    max_length = 0
    end_index = 0  # End index of the longest substring in generated_letter

    for i, generated_char in enumerate(generated_letter, start=1):
        current_row = [0] * (n + 1)
        for j, original_char in enumerate(original_letter, start=1):
            if generated_char == original_char:
                run_length = previous_row[j - 1] + 1
                current_row[j] = run_length
                # Strict comparison keeps the first longest run that is found.
                if run_length > max_length:
                    max_length = run_length
                    end_index = i
        previous_row = current_row

    return generated_letter[end_index - max_length : end_index]


longest_common_substring("dit is een \ntest zin", "is maar een \nTester")

In [None]:
def _mark_lcs(letter: str, lcs: str) -> str:
    """Return ``letter`` as rich markup with every occurrence of ``lcs`` in green."""
    # Function-scope import: only this helper needs it.
    from rich.markup import escape

    if not lcs:
        # An empty pattern would match between every single character.
        return escape(letter)

    lcs_pattern = re.compile(re.escape(lcs), re.IGNORECASE)
    pieces = []
    cursor = 0
    for match in lcs_pattern.finditer(letter):
        # Escape the letter text so brackets in it are not read as rich tags.
        pieces.append(escape(letter[cursor : match.start()]))
        # Re-insert the matched text itself: this keeps the letter's casing and
        # avoids using the text as a regex replacement template, which fails on
        # backslashes.
        pieces.append(f"[italic green]{escape(match.group(0))}[/italic green]")
        cursor = match.end()
    pieces.append(escape(letter[cursor:]))
    return "".join(pieces)


def highlight_lcs(
    df: pd.DataFrame, enc_id: int, col1: str = "content", col2: str = "discharge_letter"
) -> None:
    """Print both letters of an encounter with their longest common substring highlighted.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``enc_id`` column and the two text columns.
    enc_id : int
        Encounter id to look up; the first matching row is used.
    col1 : str
        Column holding the letter printed as "Original letter".
    col2 : str
        Column holding the letter printed as "Generated letter".

    Raises
    ------
    IndexError
        If no row in ``df`` has the given ``enc_id``.
    """
    enc_row = df.loc[df["enc_id"] == enc_id]

    original_letter = enc_row[col1].to_numpy()[0]
    generated_letter = enc_row[col2].to_numpy()[0]

    lcs = longest_common_substring(generated_letter, original_letter)

    # Use rich to highlight the longest common substring in both letters
    rprint("[bold yellow]Original letter[/bold yellow]")
    rprint(_mark_lcs(original_letter, lcs))
    rprint("[bold yellow]Generated letter[/bold yellow]")
    rprint(_mark_lcs(generated_letter, lcs))


highlight_lcs(merged_discharge_docs, 8791)


In [None]:
def lcs_distance(generated_letter: str, original_letter: str) -> float:
    """Calculate the longest common substring distance between two strings.

    Score of 0 means identical texts, while a score of 1 means no common substrings.

    Parameters
    ----------
    generated_letter : str
        First text.
    original_letter : str
        Second text.

    Returns
    -------
    float
        ``(L - len(lcs)) / L`` where ``L`` is the length of the longer text;
        ``0.0`` when both texts are empty.
    """
    longest_text = max(len(generated_letter), len(original_letter))
    if longest_text == 0:
        # Two empty texts are identical; this also avoids dividing by zero.
        return 0.0
    lcs = longest_common_substring(generated_letter, original_letter)
    return (longest_text - len(lcs)) / longest_text


# LCS distance between the `discharge_letter` and `content` texts of every row;
# `progress_apply` shows a tqdm progress bar while it runs.
merged_discharge_docs["lcs_distance"] = merged_discharge_docs.progress_apply(
    lambda row: lcs_distance(row["discharge_letter"], row["content"]),
    axis=1,
)  # type: ignore

In [None]:
def jaccard_distance(generated_letter: str, original_letter: str, n: int) -> float:
    """Return the Jaccard distance between the word n-gram sets of two texts.

    Both texts are lowercased and split on whitespace before the n-grams are
    built. 0 means the n-gram sets are identical (also returned when neither
    text yields an n-gram); 1 means they share no n-gram.
    """
    generated_ngrams, original_ngrams = (
        set(ngrams(letter.lower().split(), n))
        for letter in (generated_letter, original_letter)
    )

    all_ngrams = generated_ngrams | original_ngrams
    if not all_ngrams:
        return 0
    shared_ngrams = generated_ngrams & original_ngrams
    return 1 - len(shared_ngrams) / len(all_ngrams)


# Jaccard distance on word 1-, 2- and 3-grams, stored as ngram_1 .. ngram_3.
for ngram_size in (1, 2, 3):
    merged_discharge_docs[f"ngram_{ngram_size}"] = merged_discharge_docs.apply(
        lambda row: jaccard_distance(
            row["discharge_letter"], row["content"], ngram_size
        ),
        axis=1,
    )
merged_discharge_docs


### Display best matching rows

In [None]:
# The ten rows with the lowest 3-gram Jaccard distance (closest matches).
merged_discharge_docs.sort_values("ngram_3", ascending=True).head(10)

In [None]:
# Histogram of the 3-gram Jaccard distances in merged_discharge_docs.
fig, ax = plt.subplots(figsize=(10, 6))
merged_discharge_docs["ngram_3"].plot.hist(bins=20, ax=ax)
ax.set(
    title="Jaccard distance between generated and original letters",
    xlabel="Jaccard distance",
)
fig.show()

In [None]:
# The ten rows with the lowest LCS distance (longest shared passage relative to length).
merged_discharge_docs.sort_values("lcs_distance", ascending=True).head(10)

In [None]:
# Histogram of the LCS distances in merged_discharge_docs.
fig, ax = plt.subplots(figsize=(10, 6))
merged_discharge_docs["lcs_distance"].plot.hist(bins=20, ax=ax)
ax.set(
    title="Longest common substring distance between generated and original letters",
    xlabel="LCS distance",
)
fig.show()

In [None]:
# Count the letters per 3-gram distance interval.
# include_lowest=True closes the first interval on the left; without it a
# distance of exactly 0 (identical n-gram sets) falls outside (0, 0.1] and is
# silently left out of the counts.
ngram_3_groups = pd.cut(
    merged_discharge_docs["ngram_3"],
    bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1],
    include_lowest=True,
)
ngram_3_groups.value_counts().sort_index()


In [None]:
# Count the letters per LCS distance interval.
# include_lowest=True closes the first interval on the left; without it a
# distance of exactly 0 (identical texts) falls outside (0, 0.1] and is
# silently left out of the counts.
lcs_groups = pd.cut(
    merged_discharge_docs["lcs_distance"],
    bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    include_lowest=True,
)
lcs_groups.value_counts().sort_index()

## Inspect best matching letters

In [None]:
def find_closest_example(
    df: pd.DataFrame,
    threshold_score: float,
    score_col: str = "ngram_3",
    col1: str = "content",
    col2: str = "discharge_letter",
) -> None:
    """Show the letter pair whose score is the lowest one above a threshold.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the discharge letters, an ``enc_id`` column
        and the score column.
    threshold_score : float
        Only rows with a score strictly greater than this value are considered.
    score_col : str
        Name of the column holding the score.
    col1 : str
        Column passed to ``highlight_lcs`` as the original letter.
    col2 : str
        Column passed to ``highlight_lcs`` as the generated letter.
    """
    candidates = df.loc[df[score_col] > threshold_score, ["enc_id", score_col]]
    if candidates.empty:
        # Nothing scores above the threshold; report it instead of raising an
        # IndexError on the lookups below.
        rprint(
            f"[bold]No example with {score_col} score above "
            f"{threshold_score}[/bold]"
        )
        return

    sorted_df = candidates.sort_values(score_col, ascending=True)
    # Read both values per column so enc_id keeps its own dtype.
    enc_id = sorted_df["enc_id"].iat[0]
    actual_score = sorted_df[score_col].iat[0]
    rprint(f"[bold]Encounter ID: {enc_id}, {score_col} score: {actual_score}[/bold]")
    highlight_lcs(df, enc_id, col1, col2)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.
find_closest_example(merged_discharge_docs, 0)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.4.
find_closest_example(merged_discharge_docs, 0.4)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.5.
find_closest_example(merged_discharge_docs, 0.5)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.6.
find_closest_example(merged_discharge_docs, 0.6)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.7.
find_closest_example(merged_discharge_docs, 0.7)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.8.
find_closest_example(merged_discharge_docs, 0.8)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.9.
find_closest_example(merged_discharge_docs, 0.9)

In [None]:
# Letter pair with the lowest ngram_3 score strictly above 0.95.
find_closest_example(merged_discharge_docs, 0.95)

## Check differences between the Metavision letter and the final discharge letter

In [None]:
# Most recent final letter per encounter, reduced to the columns needed below.
latest_final_letters = final_discharge.sort_values(
    "date", ascending=False
).drop_duplicates(subset="enc_id", keep="first")
final_discharge_filtered = latest_final_letters.rename(
    columns={"content": "final_doc"}
)[["enc_id", "final_doc"]]

# Somehow some of the final letters contain floats and NaNs: drop the rows
# without a final letter and cast the remaining ones to str.
final_merged_docs = (
    merged_discharge_docs.merge(final_discharge_filtered, on="enc_id", how="left")
    .dropna(subset=["final_doc"])
    .assign(final_doc=lambda docs: docs["final_doc"].astype(str))
)
final_merged_docs

In [None]:
def ngram_dist_final(metavision_letter: str, final_letter: str, n: int) -> float:
    """Return the share of Metavision word n-grams that are absent from the final letter.

    Unlike a Jaccard distance, the overlap is divided by the number of distinct
    n-grams of the Metavision letter only. Both texts are lowercased and split
    on whitespace first. Returns 1 when the Metavision letter yields no n-gram.
    """
    metavision_ngrams, final_ngrams = (
        set(ngrams(letter.lower().split(), n))
        for letter in (metavision_letter, final_letter)
    )

    if not metavision_ngrams:
        return 1
    kept_ngrams = metavision_ngrams & final_ngrams
    return 1 - len(kept_ngrams) / len(metavision_ngrams)


# Share of 3-grams from `content` that are missing from `final_doc`, per row;
# `progress_apply` shows a tqdm progress bar while it runs.
final_merged_docs["ngram_3_final"] = final_merged_docs.progress_apply(
    lambda row: ngram_dist_final(row["content"], row["final_doc"], 3),
    axis=1,
)  # type: ignore

In [None]:
# Histogram of the ngram_3_final scores in final_merged_docs.
fig, ax = plt.subplots(figsize=(10, 6))
final_merged_docs["ngram_3_final"].plot.hist(bins=20, ax=ax)
ax.set(
    title="Percentage verschil in 3-grams tussen metavision en laatste brief",
    xlabel="Percentage verschil",
)
fig.show()

In [None]:
# Count the letters per ngram_3_final interval.
# include_lowest=True closes the first interval on the left; without it a
# score of exactly 0 (every Metavision 3-gram is present in the final letter)
# falls outside (0, 0.1] and is silently left out of the counts.
final_ngram_bins = pd.cut(
    final_merged_docs["ngram_3_final"],
    bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    include_lowest=True,
)
final_ngram_bins.value_counts().sort_index()

In [None]:
# Letter pair with the lowest ngram_3_final score strictly above 0.4, shown as
# `content` versus `final_doc`.
find_closest_example(final_merged_docs, 0.4, "ngram_3_final", "content", "final_doc")