In [1]:
!pip install -U datasets
!pip install fsspec
!pip install transformers datasets evaluate sentence-transformers faiss-cpu streamlit
!pip install -U transformers
!pip install rouge_score
!pip install -U huggingface_hub
!pip install -U transformers datasets evaluate rouge_score bert_score textstat

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting streamlit
  Downloading streamlit-1.48.0-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Replace with the exact path if it's inside a folder like 'MyDrive/foldername/discharge.csv'
file_path = '/content/drive/MyDrive/mimic-iv-bhc.csv'

# Read the CSV
df = pd.read_csv(file_path)

# # Show the first 10 rows
# print(df.head(10))

In [4]:
import pandas as pd
import re
import html

# Define preprocessing function
def preprocess_row(text: str) -> str:
    if not isinstance(text, str):
        return ""

    # 1. Decode HTML entities
    text = html.unescape(text)

    # 2. Replace smart quotes and dashes
    text = text.replace("’", "'").replace("‘", "'")\
               .replace("“", '"').replace("”", '"')\
               .replace("–", "-").replace("—", "-")

    # 3. Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # 4. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 5. Lowercase
    text = text.lower()

    return text

# Apply preprocessing
df["input"] = df["input"].apply(preprocess_row)
df["target"] = df["target"].apply(preprocess_row)

# Display cleaned rows
df[["input", "target"]].sample(3).to_dict(orient="records")

[{'input': '<sex> f <service> medicine <allergies> plaquenil / daypro / atenolol / keppra <attending> ___. <chief complaint> shortness of breath <major surgical or invasive procedure> none <history of present illness> ___ yof with chf, increased symptoms over 2 weeks, came in on nrb. no chest pain, cough, fevers. intially tachypnic around 30 on nrb. nitro drip, crackles, got 40mg iv lasix. ? rll pna got ctx/azith. got asa. bnp pending. 154/68, 83, satting 100% 3l, rr ___. foley in. chest xray looked like pulmonary edema. first set of cardiac enzymes negative <past medical history> 1. non-ischemic cardiomyopathy -tte ___: ef 50-55%, lvh, with now more mod-severe diastolic chf compared to prior echos -cardiac cath ___ with 40% lad stenosis -difficult to manage, is hospitalized for exacerbations often 2. moderate pulmonary htn -3+ tr on ___ echo 3. hypertension 4. paroxysmal atrial fibrillation on coumadin 5. rheumatoid arthritis on methotrexate 6. macrocytic anemia 7. osteoarthritis 8. g

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

df = df.dropna(subset=["input", "target"])

subset_df = df.sample(frac=0.4, random_state=42).reset_index(drop=True)

train_df, val_df = train_test_split(subset_df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


In [6]:
from huggingface_hub import whoami

try:
    user_info = whoami()
    print("✅ Logged in as:", user_info["name"])
except Exception as e:
    print("❌ Not logged in or token expired:", e)

✅ Logged in as: Vidit202


In [7]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from huggingface_hub import HfApi, login
import os

def run_training(model_name: str, hf_repo_id: str, run_name: str, train_dataset, val_dataset , isCheckpoint):
    HfApi().create_repo(
        repo_id=hf_repo_id,
        private=False,
        exist_ok=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def preprocess_function(examples):
        inputs = [str(x) for x in examples["input"]]
        targets = [str(x) for x in examples["target"]]
        model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")["input_ids"]
        model_inputs["labels"] = labels
        return model_inputs

    tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
    tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results_bart",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        fp16=True,
        eval_strategy="epoch",
        predict_with_generate=True,
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=2,
        logging_steps=200,
        report_to="wandb",
        run_name=run_name
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    if isCheckpoint :
      trainer.train(resume_from_checkpoint=True)
    else :
      trainer.train()


    model.save_pretrained(hf_repo_id, push_to_hub=True)
    tokenizer.save_pretrained(hf_repo_id, push_to_hub=True)

    print(f"✅ Model pushed publicly to: https://huggingface.co/{hf_repo_id}")


In [None]:
run_training(
    model_name="t5-small",
    hf_repo_id="Vidit202/t5-mimic-summary",
    run_name="t5-mimic-wandb",
    train_dataset=train_dataset,
    val_dataset=val_dataset
)

Map:   0%|          | 0/97211 [00:00<?, ? examples/s]

Map:   0%|          | 0/10802 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
[34m[1mwandb[0m: Currently logged in as: [33mviditgn-21102[0m ([33mviditgn-21102-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
2,2.0317,1.884147
3,1.9751,1.839981
4,1.9467,1.827724


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  .../t5-mimic-summary/model.safetensors:   0%|          |  552kB /  242MB            

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  Vidit202/t5-mimic-summary/spiece.model: 100%|##########|  792kB /  792kB            

✅ Model pushed publicly to: https://huggingface.co/Vidit202/t5-mimic-summary


In [None]:
run_training(
    model_name="facebook/bart-base",
    hf_repo_id="Vidit202/bart-mimic-summary",
    run_name="bart-mimic-wandb",
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    isCheckpoint=False
)

Map:   0%|          | 0/97211 [00:00<?, ? examples/s]

Map:   0%|          | 0/10802 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.9692,1.852442
2,1.7936,1.757909
3,1.7267,1.712669
4,1.6597,1.698678




Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...art-mimic-summary/model.safetensors:   0%|          |  131kB /  558MB            

✅ Model pushed publicly to: https://huggingface.co/Vidit202/bart-mimic-summary


In [8]:
run_training(
    model_name="google/pegasus-pubmed",
    hf_repo_id="Vidit202/pegasus-pubmed-summary",
    run_name="pegasus-wandb",
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    isCheckpoint=False
)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-pubmed and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/97211 [00:00<?, ? examples/s]

Map:   0%|          | 0/10802 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mviditgn-21102[0m ([33mviditgn-21102-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.1326,2.015377
2,1.9862,1.932717
3,1.958,1.911125




Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...us-pubmed-summary/model.safetensors:   0%|          | 4.39MB / 2.28GB            

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pegasus-pubmed-summary/spiece.model:  90%|########9 | 1.71MB / 1.91MB            

✅ Model pushed publicly to: https://huggingface.co/Vidit202/pegasus-pubmed-summary


In [9]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
from evaluate import load
import textstat

def evaluate_model_on_val(model_id: str, val_dataset: Dataset, batch_size=8, max_input_length=512, max_output_length=128):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda")
    model.eval()

    val_texts = [str(x) for x in val_dataset["input"]]
    references = [str(x) for x in val_dataset["target"]]

    dataloader = DataLoader(val_texts, batch_size=batch_size)

    all_preds = []

    for batch in tqdm(dataloader, desc="Generating summaries"):
        inputs = tokenizer(
            batch,
            max_length=max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_output_length,
                num_beams=4,
                early_stopping=True
            )

        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_preds.extend(decoded_preds)

    rouge = load("rouge")
    bleu = load("bleu")
    bertscore = load("bertscore")

    rouge_scores = rouge.compute(predictions=all_preds, references=references, use_stemmer=True)
    bleu_score = bleu.compute(predictions=[p.split() for p in all_preds],
                              references=[[r.split()] for r in references])
    bert_scores = bertscore.compute(predictions=all_preds, references=references, lang="en")
    fkgl = np.mean([textstat.flesch_kincaid_grade(s) for s in all_preds])

    return {
        "ROUGE": rouge_scores,
        "BLEU": bleu_score,
        "BERTScore": {
            "precision": np.mean(bert_scores["precision"]),
            "recall": np.mean(bert_scores["recall"]),
            "f1": np.mean(bert_scores["f1"])
        },
        "FKGL": fkgl
    }


In [None]:
from pprint import pprint
print("----------------------T5 Scores----------------------")
pprint(results_t5["ROUGE"])
pprint(results_t5["BLEU"])
pprint(results_t5["BERTScore"])
print("Flesch-Kincaid Grade Level:", results_t5["FKGL"])

In [None]:
results_bart = evaluate_model_on_val("Vidit202/bart-mimic-summary", val_dataset)

In [None]:
from pprint import pprint
print("----------------------BART Scores----------------------")
pprint(results_bart["ROUGE"])
pprint(results_bart["BLEU"])
pprint(results_bart["BERTScore"])
print("Flesch-Kincaid Grade Level:", results_bart["FKGL"])

{'rouge1': np.float64(0.21258452962055818),
 'rouge2': np.float64(0.09582471155607766),
 'rougeL': np.float64(0.15465909661124938),
 'rougeLsum': np.float64(0.15468141422672874)}
{'bleu': 0.004175743690581654,
 'brevity_penalty': 0.012634959888683395,
 'length_ratio': 0.1861750949633928,
 'precisions': [0.6286277183417398,
                0.35294409591582654,
                0.260351742959533,
                0.20652846952592033],
 'reference_length': 5144614,
 'translation_length': 957799}
{'f1': np.float64(0.833877809250538),
 'precision': np.float64(0.8674154216449496),
 'recall': np.float64(0.8035667774479073)}
Flesch-Kincaid Grade Level: 13.080587900688386

Example 1:
Input:
<sex> m <service> neurosurgery <allergies> no known allergies / adverse drug reactions <attending> ___. <chief complaint> back pain <major surgical or invasive procedure> spine stimulator <history of present illness> i had the pleasure of seeing mr. ___ ___ gentleman referred with percutaneous spinal cord stim

In [11]:
results_pegasus = evaluate_model_on_val("Vidit202/pegasus-pubmed-summary", val_dataset)

Generating summaries:   0%|          | 1/1351 [00:10<3:54:57, 10.44s/it]


KeyboardInterrupt: 

In [None]:
from pprint import pprint
print("----------------------Pegasus Scores----------------------")
pprint(results_pegasus["ROUGE"])
pprint(results_pegasus["BLEU"])
pprint(results_pegasus["BERTScore"])
print("Flesch-Kincaid Grade Level:", results_pegasus["FKGL"])

In [12]:
def generate_summaries(model_id: str, val_dataset: Dataset, batch_size=8, max_input_length=512, max_output_length=128):
    from torch.utils.data import DataLoader
    from tqdm import tqdm
    import torch

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda")
    model.eval()

    val_texts = [str(x) for x in val_dataset["input"]]
    references = [str(x) for x in val_dataset["target"]]

    dataloader = DataLoader(val_texts, batch_size=batch_size)
    predictions = []

    for batch in tqdm(dataloader, desc="Generating summaries"):
        inputs = tokenizer(batch, max_length=max_input_length, padding="max_length", truncation=True, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_output_length,
                num_beams=4,
                early_stopping=True
            )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(decoded)

    return predictions, references, val_texts


In [13]:
def evaluate_predictions(predictions, references, inputs):
    from evaluate import load
    import numpy as np
    import textstat

    # Metrics
    rouge = load("rouge")
    bleu = load("bleu")
    bertscore = load("bertscore")

    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bleu_score = bleu.compute(predictions=predictions, references=[[r] for r in references])
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")
    fkgl = np.mean([textstat.flesch_kincaid_grade(s) for s in predictions])

    return {
        "ROUGE": rouge_scores,
        "BLEU": bleu_score,
        "BERTScore": {
            "precision": np.mean(bert_scores["precision"]),
            "recall": np.mean(bert_scores["recall"]),
            "f1": np.mean(bert_scores["f1"])
        },
        "FKGL": fkgl,
        "Samples": list(zip(inputs[:3], predictions[:3], references[:3]))
    }


In [None]:
# Generate once
preds, refs, inputs = generate_summaries("Vidit202/t5-mimic-summary", val_dataset)

# Save to reuse later
import pickle
with open("t5_preds.pkl", "wb") as f:
    pickle.dump((preds, refs, inputs), f)


Generating summaries: 100%|██████████| 1351/1351 [43:15<00:00,  1.92s/it]


In [None]:
# Load and evaluate anytime
with open("t5_preds.pkl", "rb") as f:
    preds, refs, inputs = pickle.load(f)

results = evaluate_predictions(preds, refs, inputs)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_names = {
    "t5": "Vidit202/t5-mimic-summary",
    "bart": "Vidit202/bart-mimic-summary",
    "pegasus": "Vidit202/pegasus-pubmed-summary"
}

tokenizers = {k: AutoTokenizer.from_pretrained(v) for k, v in model_names.items()}
models = {k: AutoModelForSeq2SeqLM.from_pretrained(v).to(device) for k, v in model_names.items()}


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

In [16]:
def generate_summary(model, tokenizer, text, max_input_len=512, max_output_len=128):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_input_len).to(device)
    outputs = model.generate(**inputs, max_length=max_output_len)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [17]:
def combined_summarizer(text):
    summaries = []
    for name in ["t5", "bart", "pegasus"]:
        summary = generate_summary(models[name], tokenizers[name], text)
        summaries.append(summary)

    # Combine all three summaries
    merged_summary_input = " ".join(summaries)

    # Re-summarize using T5 for final output
    final_summary = generate_summary(models["t5"], tokenizers["t5"], merged_summary_input)

    return {
        "T5": summaries[0],
        "BART": summaries[1],
        "Pegasus": summaries[2],
        "Combined": final_summary
    }


In [20]:
val_df = val_dataset.to_pandas()

results = []

for idx, row in val_df.iterrows():
    input_text = row["input"]
    reference = row["target"]

    outputs = combined_summarizer(input_text)

    results.append({
        "input": input_text,
        "reference": reference,
        "t5": outputs["T5"],
        "bart": outputs["BART"],
        "pegasus": outputs["Pegasus"],
        "combined": outputs["Combined"]
    })

    if idx >= 10:
        break


In [23]:
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

In [24]:
def evaluate_all_metrics(preds, refs):
    scores = {}

    # ROUGE
    rouge_result = rouge.compute(predictions=preds, references=refs)
    scores.update(rouge_result)

    # BLEU
    bleu_result = bleu.compute(predictions=preds, references=[[r] for r in refs])
    scores['bleu'] = bleu_result['bleu']

    # BERTScore
    bert_result = bertscore.compute(predictions=preds, references=refs, lang="en")
    scores["bertscore_precision"] = sum(bert_result["precision"]) / len(bert_result["precision"])
    scores["bertscore_recall"] = sum(bert_result["recall"]) / len(bert_result["recall"])
    scores["bertscore_f1"] = sum(bert_result["f1"]) / len(bert_result["f1"])
    return scores


In [25]:
t5_preds = [r["t5"] for r in results]
bart_preds = [r["bart"] for r in results]
pegasus_preds = [r["pegasus"] for r in results]
combined_preds = [r["combined"] for r in results]
refs = [r["reference"] for r in results]

metrics = {
    "T5": evaluate_all_metrics(t5_preds, refs),
    "BART": evaluate_all_metrics(bart_preds, refs),
    "Pegasus": evaluate_all_metrics(pegasus_preds, refs),
    "Combined": evaluate_all_metrics(combined_preds, refs)
}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
import pandas as pd
import numpy as np

df_metrics = pd.DataFrame(metrics).T.round(4)
df_metrics.replace({np.nan: "N/A"}, inplace=True)
df_metrics


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu,bertscore_precision,bertscore_recall,bertscore_f1
T5,0.2432,0.1151,0.1841,0.182,0.0058,0.8758,0.8152,0.8441
BART,0.2889,0.1472,0.208,0.2068,0.0171,0.8771,0.8273,0.8513
Pegasus,0.2812,0.1285,0.207,0.2049,0.0176,0.8676,0.8231,0.8445
Combined,0.2197,0.1071,0.1657,0.1677,0.0046,0.8606,0.8073,0.8328


In [27]:
medical_terms = {
    "hypertension": "high blood pressure",
    "hypotension": "low blood pressure",
    "myocardial infarction": "heart attack",
    "cerebrovascular accident": "stroke",
    "dyspnea": "shortness of breath",
    "analgesic": "painkiller",
    "edema": "swelling",
    "febrile": "feverish",
    "gastritis": "stomach inflammation",
    "neoplasm": "tumor",
    "hyperlipidemia": "high cholesterol",
    "renal failure": "kidney failure",
    "hematemesis": "vomiting blood",
    "hematuria": "blood in urine",
    "hematochezia": "blood in stool",
    "tachycardia": "fast heartbeat",
    "bradycardia": "slow heartbeat",
    "arrhythmia": "irregular heartbeat",
    "diaphoresis": "sweating",
    "syncope": "fainting",
    "nausea": "feeling like vomiting",
    "vomitus": "vomit",
    "anemia": "low red blood cells",
    "leukocytosis": "high white blood cells",
    "thrombocytopenia": "low platelets",
    "polyuria": "frequent urination",
    "nocturia": "urinating at night",
    "dysuria": "painful urination",
    "incontinence": "loss of bladder control",
    "hepatomegaly": "enlarged liver",
    "splenomegaly": "enlarged spleen",
    "hepatitis": "liver inflammation",
    "cirrhosis": "liver scarring",
    "osteoporosis": "weak bones",
    "osteoarthritis": "joint wear and tear",
    "rheumatoid arthritis": "joint inflammation",
    "embolism": "blood clot",
    "thrombosis": "blood clot formation",
    "ischemia": "lack of blood flow",
    "necrosis": "tissue death",
    "cyanosis": "bluish skin",
    "jaundice": "yellow skin",
    "pruritus": "itching",
    "urticaria": "hives",
    "eczema": "skin inflammation",
    "dermatitis": "skin irritation",
    "alopecia": "hair loss",
    "melena": "black stool",
    "hemoptysis": "coughing up blood",
    "pleurisy": "lung lining inflammation",
    "pneumonia": "lung infection",
    "bronchitis": "airway inflammation",
    "asthma": "narrowed airways",
    "copd": "chronic lung disease",
    "emphysema": "damaged lung sacs",
    "hypoxia": "low oxygen",
    "apnea": "not breathing",
    "dyspnea on exertion": "shortness of breath with activity",
    "orthopnea": "breathing difficulty when lying down",
    "cyanotic": "blue-colored skin",
    "tachypnea": "fast breathing",
    "bradypnea": "slow breathing",
    "sepsis": "body-wide infection",
    "bacteremia": "bacteria in the blood",
    "pyelonephritis": "kidney infection",
    "cystitis": "bladder infection",
    "nephrolithiasis": "kidney stones",
    "cholelithiasis": "gallstones",
    "cholecystitis": "gallbladder inflammation",
    "pancreatitis": "pancreas inflammation",
    "appendicitis": "appendix inflammation",
    "diverticulitis": "colon pouch inflammation",
    "gastroenteritis": "stomach flu",
    "colitis": "colon inflammation",
    "constipation": "hard stool",
    "diarrhea": "loose stool",
    "abdominal distension": "bloated belly",
    "ascites": "fluid in belly",
    "anorexia": "loss of appetite",
    "cachexia": "wasting away",
    "obesity": "excess body weight",
    "malnutrition": "poor nutrition",
    "dehydration": "lack of fluids",
    "hypoglycemia": "low blood sugar",
    "hyperglycemia": "high blood sugar",
    "diabetes mellitus": "high blood sugar condition",
    "insulin resistance": "body not responding to insulin",
    "hypothyroidism": "low thyroid activity",
    "hyperthyroidism": "overactive thyroid",
    "goiter": "swollen thyroid",
    "dysphagia": "difficulty swallowing",
    "dysphasia": "difficulty speaking",
    "aphasia": "loss of ability to speak",
    "ataxia": "loss of coordination",
    "paresthesia": "tingling or numbness",
    "paralysis": "loss of movement",
    "spasticity": "stiff muscles",
    "seizure": "uncontrolled brain activity",
    "epilepsy": "recurring seizures",
    "headache": "head pain",
    "migraine": "intense headache",
    "encephalopathy": "brain dysfunction",
    "meningitis": "brain lining infection",
    "delirium": "confused thinking",
    "dementia": "memory loss",
    "psychosis": "loss of reality",
    "mania": "extreme mood elevation",
    "depression": "persistent sadness",
    "anxiety": "excessive worry",
    "hallucination": "seeing or hearing things",
    "delusion": "false belief",
    "bipolar disorder": "mood swings",
    "schizophrenia": "chronic mental illness",
    "ptsd": "trauma-related stress",
    "ocd": "repetitive thoughts or actions",
    "insomnia": "difficulty sleeping",
    "narcolepsy": "excessive daytime sleepiness",
    "apnea": "stopping breathing",
    "arrhythmia": "irregular heart rhythm",
    "valvular disease": "heart valve problem",
    "congestive heart failure": "heart pumping problem",
    "cardiomyopathy": "heart muscle disease",
    "pericarditis": "heart lining inflammation",
    "angina": "chest pain",
    "aortic aneurysm": "bulge in the aorta",
    "deep vein thrombosis": "blood clot in leg vein",
    "pulmonary embolism": "clot in lung artery",
    "shock": "dangerously low blood pressure",
    "anaphylaxis": "severe allergic reaction",
    "urticaria": "hives",
    "eczema": "skin inflammation",
    "psoriasis": "scaly skin",
    "seborrheic dermatitis": "flaky scalp",
    "keratosis": "skin bump",
    "melanoma": "skin cancer",
    "basal cell carcinoma": "skin cancer",
    "squamous cell carcinoma": "skin cancer",
    "biopsy": "tissue sample",
    "pathology": "study of disease",
    "benign": "not cancerous",
    "malignant": "cancerous",
    "metastasis": "spread of cancer",
    "oncology": "cancer care",
    "radiotherapy": "radiation treatment",
    "chemotherapy": "cancer drug treatment",
    "immunotherapy": "immune-based treatment",
    "lymphadenopathy": "swollen lymph nodes",
    "tonsillitis": "tonsil infection",
    "sinusitis": "sinus infection",
    "otitis media": "ear infection",
    "conjunctivitis": "pink eye",
    "pharyngitis": "sore throat",
    "laryngitis": "voice box inflammation",
    "bronchiolitis": "small airway inflammation",
    "pneumothorax": "collapsed lung",
    "pleural effusion": "fluid around lungs",
    "atelectasis": "lung collapse",
    "intubation": "inserting breathing tube",
    "extubation": "removing breathing tube",
    "tracheostomy": "neck breathing tube",
    "ventilator": "breathing machine",
    "resuscitation": "reviving from death",
    "cardiac arrest": "heart stops",
    "do not resuscitate": "no revival order",
    "code blue": "medical emergency",
    "advance directive": "care instructions in advance",
    "palliative care": "comfort care",
    "hospice": "end-of-life care",
    "autopsy": "after-death exam",
    "morbidity": "illness",
    "mortality": "death",
    "prognosis": "expected outcome",
    "diagnosis": "identified condition",
    "treatment": "care plan",
    "prescription": "medicine order",
    "dosage": "medicine amount",
    "contraindication": "reason not to use",
    "adverse reaction": "bad effect",
    "side effect": "unintended effect",
    "tolerance": "resistance to drug",
    "dependence": "reliance on drug",
    "withdrawal": "symptoms after stopping",
    "overdose": "too much medicine",
    "toxicity": "poison effect",
    "placebo": "fake treatment",
    "clinical trial": "research study",
    "vital signs": "body measurements",
    "temperature": "body heat",
    "pulse": "heartbeat rate",
    "respiration": "breathing rate",
    "blood pressure": "pressure in arteries"
}


In [28]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

retriever_model = SentenceTransformer("all-MiniLM-L6-v2")

jargon_terms = list(medical_terms.keys())
layman_terms = list(medical_terms.values())

jargon_embeddings = retriever_model.encode(jargon_terms, convert_to_numpy=True)
index = faiss.IndexFlatL2(jargon_embeddings.shape[1])
index.add(jargon_embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [30]:
def simplify_summary(summary, top_k=1):
    words = summary.split()
    simplified = []

    for word in words:
        cleaned = word.strip(",.?!:;()").lower()
        if cleaned in medical_terms:
            simplified.append(medical_terms[cleaned])
            continue

        embedding = retriever_model.encode([cleaned], convert_to_numpy=True)
        D, I = index.search(embedding, top_k)

        if D[0][0] < 0.7:  # optional distance threshold
            simplified.append(layman_terms[I[0][0]])
        else:
            simplified.append(word)

    return " ".join(simplified)


In [31]:
for result in results[:5]:
    print("Original Summary:\n", result["combined"])
    simplified = simplify_summary(result["combined"])
    print("\nSimplified Summary:\n", simplified)
    print("=" * 60)


Original Summary:
 mr. ___ was admitted to the neurosurgery service after undergoing spinal stimulator placement. he tolerated the procedure well and was extubated in the operating room and transferred to the floor in stable condition. he tolerated the procedure well and was extubated in the operating room and transferred to the floor in stable condition. he was transferred to the floor in stable condition. on ___, the patient remained neurologically and hemodynamically stable. on ___, the patient remained neurologically intact and hemodynamically stable. he tolerated

Simplified Summary:
 mr. ___ was admitted to the neurosurgery service after undergoing spinal stimulator placement. he tolerated the procedure well and was removing breathing tube in the operating room and transferred to the floor in stable condition. he tolerated the procedure well and was removing breathing tube in the operating room and transferred to the floor in stable condition. he was transferred to the floor in s

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("Vidit202/bart-mimic-summary")
tokenizer = AutoTokenizer.from_pretrained("Vidit202/bart-mimic-summary")
discharge_note = """Ms. ___ was admitted to the bariatric service with
abdominal pain after being transferred from an OSH with a CT
read of possible small bowel obstruction. Due to her ___ en y
gastric bypass, there was concern of an internal hernia and need
for operative intervention. On arrival, she had a nutritional
IV fluids given ("banana bag") which consisted of thiamine and
Vitamin B12. Stat CBC/chem10 and lactate revealed no etiology
of her abdominal pain. She had normal LFTs, lipase, lactate,
and white count. She was started on an IV BID PPI and IVF and
made NPO. She had a repeat CT abdomen with PO contrast to
better evaluate for a small bowel obstruction. There were no
abnormal findings on the CT scan. Her diet was advanced to
stage III which she tolerated well. Nutrition labs were drawn
which revealed iron deficiency. On questioning, she reported
not following up with a nutritionist and not being aware of
having her vitamin levels drawn by her PCP since her ___ en Y
gastric bypass. The importance of having close nutritional
follow up due to her altered anatomy was emphasized, including
following closely Vitamin B1, B12, iron, vitamin D, and folate.
Her primary care physician ___ was also telephoned and
a message was with left with his office to communicate these
recommendations. She had also been taking NSAIDs in the past
and was unaware of their danger with after a gastric bypass, and
the need to avoid NSAIDs was also reinforced.

On the day of discharge, she was tolerating a stage III
bariatric diet. Her pain was well controlled. She was voiding
freely. She was ambulating independently without assistance.
She will follow up with her PCP in one to two weeks."""

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [None]:
input_only = f"Discharge Summary:\n{discharge_note}\n\nPlease simplify this discharge summary."
inputs = tokenizer(input_only, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
summary_without_rag = tokenizer.decode(output[0], skip_special_tokens=True))



the patient was admitted to the bariatric surgery service on ___ for evaluation and treatment of possible small bowel obstruction. on admission, the patient was afebrile with stable vital signs; pain was well controlled oral pain medication on a prn basis. she was tolerating a stage IIIbariatric diet without nausea or vomiting. her vital signs were stable and she was voiding independently. her diet was advanced to a stage ii diet on ___. she was discharged home on ___ in stable condition with instructions to follow up with her primary care physician.
