# Summarization Hallucination

### Select 40 Random Cases
Sample cases from a single test split (split 0), stratified by outcome (the `dead` column).

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the split cases table. This CSV only exists after a baseline survival
# experiment has been run to generate it.
df = pd.read_csv(filepath_or_buffer="../results/split_cases.csv")

In [None]:
# Keep only the rows of split 0, ordered by "split_order", with a fresh 0..n-1 index.
split_df = df.loc[df["split"] == 0].sort_values(by="split_order").reset_index(drop=True)

# Draw 40 cases stratified on the "dead" column. train_test_split returns
# (train, test); the 40-row "test" part is the sample we keep.
sampled = train_test_split(
    split_df,
    test_size=40,
    stratify=split_df["dead"],
    shuffle=True,
    random_state=24,
)[1]

In [None]:
# Load the report table and the summarized-report table (in that order).
text, summ = map(
    pd.read_csv,
    ("../data/TCGA_Reports.csv", "../data/summarized_reports.csv"),
)

In [None]:
# Add a "case_id" column to both tables: the part of "patient_filename"
# that precedes the first ".".
for frame in (text, summ):
    frame["case_id"] = frame["patient_filename"].str.split(".").str[0]

In [None]:
# Pull the sampled cases out of each table, in the order they appear in `sampled`.
sampled_ids = sampled["case_id"]
text_selected = text.set_index("case_id").loc[sampled_ids]
summ_selected = summ.set_index("case_id").loc[sampled_ids]

In [None]:
# Put each sampled report next to its summary: the summary table's "text"
# column is carried over under the name "summ".
summ_column = summ_selected[["text"]].rename(columns={"text": "summ"})
to_correct = pd.concat([text_selected, summ_column], axis="columns").reset_index()

# Write the side-by-side table out for manual correction.
to_correct.to_csv("../data/sampled.csv", index=False)

## Manually correct the sampled summaries
Use the [comparison tool](../tools)!

## Merge corrected summaries

To run the survival pipeline with the corrected summaries, it is simpler to merge the corrected summaries with the other summaries, run experiments, and only analyze results over the corrected summaries (done in our [result analysis notebook](../results/analyze-results.ipynb)).

In [None]:
# Load the manually corrected sample, keyed by case_id.
corrected = pd.read_csv("../data/sampled_corrected.csv").set_index("case_id")

# Count the rows whose "corrected" value differs from the original "summ" value.
n_corrections = corrected["summ"].ne(corrected["corrected"]).sum()
print(f"{n_corrections} corrections")

In [None]:
# Setup for the survival experiment pipeline: overwrite the summaries of the
# corrected cases in the full summary table and save the merged table. The
# selected subset is analyzed separately.

# Only re-index when needed so this cell can be re-run: after the first run
# "case_id" is already the index, and a second set_index("case_id") would
# raise KeyError because the column no longer exists.
if summ.index.name != "case_id":
    summ = summ.set_index("case_id")

# The assignment aligns on the case_id index, so only the rows listed in
# `corrected` receive a new "text" value.
summ.loc[corrected.index, "text"] = corrected["corrected"]
summ.to_csv("../data/summarized_reports_corrected.csv")