# 🩺 Clinical Note Generator using FLAN‑T5

In [None]:
# Install required libraries (this will take a bit)
!pip install transformers datasets evaluate rouge-score --quiet


In [None]:
# Libraries used throughout the notebook.
import pandas as pd
import numpy as np
from datasets import Dataset

try:
    # `load_metric` was removed from `datasets` in 3.0 (metrics moved to the
    # `evaluate` package). Keep the name available on older releases, but do
    # not let its absence break the whole import cell on newer ones.
    from datasets import load_metric
except ImportError:
    load_metric = None

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate
import torch


In [None]:
# Load main + augmented MTS data (dialogue + summary)
import os
from pathlib import Path

# Root of the dataset checkout; every CSV found below it is a candidate input.
base_path = Path("/mnt/data/clinical-note-summarizer/MTS-Dialog-main")
# Sorted so the concatenation order does not depend on filesystem ordering.
csv_paths = sorted(base_path.rglob("*.csv"))

# Only files exposing both of these columns contribute rows.
# NOTE(review): files lacking either column are skipped silently; confirm the
# CSVs really name the reference column `summary` (upstream MTS-Dialog may use
# a different header such as `section_text`).
required_columns = ["dialogue", "summary"]

df_list = []
for path in csv_paths:
    try:
        frame = pd.read_csv(path)
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Best effort: one unreadable file should not abort the load, but the
        # skip must be visible rather than silently swallowed.
        print(f"Skipping {path}: {err}")
        continue
    if all(column in frame.columns for column in required_columns):
        df_list.append(frame[required_columns].dropna())

if not df_list:
    # pd.concat([]) would raise an opaque "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No CSV with columns {required_columns} found under {base_path}"
    )

df = pd.concat(df_list, ignore_index=True)
df = df.dropna().reset_index(drop=True)
df.head()


In [None]:
# Formatting for flan-t5 input
def preprocess_function(example):
    """Build the prompt/target pair for one dialogue-summary record.

    Args:
        example: Mapping with string values under "dialogue" and "summary".

    Returns:
        A dict with "input_text" (the dialogue prefixed with the
        "summarize: " instruction) and "target_text" (the summary unchanged).
    """
    prompt = "summarize: " + example["dialogue"]
    return dict(input_text=prompt, target_text=example["summary"])

# Wrap the DataFrame as a Hugging Face dataset and hold out 10% for evaluation.
dataset = Dataset.from_pandas(df)
# Fixed seed: without it the held-out rows change on every run, so evaluation
# numbers from different runs cannot be compared.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
# Adds the "input_text" / "target_text" columns to both splits.
dataset = dataset.map(preprocess_function)


In [None]:
# Checkpoint shared by the tokenizer and the model loaded later on.
model_name = "google/flan-t5-small"

# Token budgets: prompts are cut to `max_input` tokens, summaries to `max_target`.
max_input, max_target = 512, 128

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize a batch of prompts and reference summaries for seq2seq training.

    Intended for `Dataset.map(..., batched=True)`: `example["input_text"]` and
    `example["target_text"]` are lists of strings.

    Args:
        example: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts, padded/truncated to `max_input`,
        with an added "labels" entry holding the target token ids
        padded/truncated to `max_target`. Padding positions in "labels" are
        set to -100.
    """
    model_inputs = tokenizer(
        example["input_text"], max_length=max_input, padding="max_length", truncation=True
    )
    label_ids = tokenizer(
        example["target_text"], max_length=max_target, padding="max_length", truncation=True
    ).input_ids
    # -100 is the index the loss ignores. Leaving the real pad id in the labels
    # would train the model to predict padding and dilute the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
import inspect

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old spelling. Use whichever keyword this install accepts
# so the cell runs on both old and new versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # No checkpoints are written; the fine-tuned model only lives in memory.
    save_strategy="no",
    # Run evaluation at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

# Collates tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`.
# Use whichever keyword this install accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

# Fine-tune the model (takes some time on the full data).
trainer.train()


In [None]:
# Load the ROUGE metric from the `evaluate` package for scoring generated summaries.
rouge = evaluate.load("rouge")

def generate_summary(example):
    """Generate a summary for one dialogue with the fine-tuned model.

    Args:
        example: Mapping with the raw dialogue text under "dialogue".

    Returns:
        The decoded summary string (special tokens stripped), at most 128
        newly generated tokens long.
    """
    # After training the model may be in train mode; switch dropout off so
    # generation is not perturbed.
    model.eval()
    # Move the encoded prompt to the model's device. After training on a GPU
    # the model is no longer on the CPU and generate() would fail otherwise.
    inputs = tokenizer(
        "summarize: " + example["dialogue"], return_tensors="pt", truncation=True
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Score on held-out rows only, and draw the sample ONCE so that every
# prediction is compared with the reference summary of the same dialogue.
# Two independent random samples would pair predictions with unrelated
# references and make the ROUGE score meaningless.
eval_split = dataset["test"]
sample_size = min(20, len(eval_split))  # tolerate small held-out sets
eval_rows = list(eval_split.shuffle(seed=42).select(range(sample_size)))

preds = [generate_summary(row) for row in eval_rows]
refs = [row["summary"] for row in eval_rows]

results = rouge.compute(predictions=preds, references=refs)
print("ROUGE-1:", results["rouge1"])
print("ROUGE-L:", results["rougeL"])


In [None]:
# try your own dialogue below 👇
your_input = """Doctor: Hello, what brings you in today?
Patient: I’ve been coughing for a few days, and my chest hurts when I breathe deeply."""

# Encode the prompt and move it to the model's device; after GPU training the
# model is not on the CPU, and generate() needs inputs on the same device.
inputs = tokenizer("summarize: " + your_input, return_tensors="pt", truncation=True).to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print("Generated summary:", tokenizer.decode(output[0], skip_special_tokens=True))
