In [2]:

# STEP 1: Install dependencies

!pip install transformers datasets sentencepiece -q


# STEP 2: Import Libraries

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset


# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")


# STEP 3: Load Your Dataset


df = pd.read_csv("ehr_notes.csv")

# The preprocessing step reads example["text"] and example["summary"], so fail
# early with a clear message if either column is absent from the CSV.
required_columns = ["text", "summary"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"ehr_notes.csv is missing required column(s): {missing_columns}"
    )

# A row with an empty note or summary cannot be string-concatenated/tokenized
# later on, so drop incomplete rows up front (no-op for a clean file).
df = df.dropna(subset=required_columns).reset_index(drop=True)

# STEP 4: Split and Convert to HuggingFace Dataset

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
# preserve_index=False stops the shuffled pandas index from being carried into
# the Dataset as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)


# STEP 5: Load T5 Tokenizer & Model

# Checkpoint shared by the tokenizer and the model so the two always match.
model_name = "t5-small"  # you can try "t5-base" for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load the pretrained weights first, then move them onto the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)


# STEP 6: Preprocessing Function

# Token budgets used when tokenizing: source notes and target summaries are
# truncated/padded to these lengths in preprocess().
max_input_length: int = 512
max_target_length: int = 128

def preprocess(example):
    """Tokenize one note/summary pair into model inputs with loss-ready labels.

    Args:
        example: A mapping with string fields ``"text"`` (the source note) and
            ``"summary"`` (the reference summary).

    Returns:
        The tokenizer output for the ``"summarize: "``-prefixed note, padded
        and truncated to ``max_input_length``, with an added ``"labels"`` list
        of length ``max_target_length``. Padding positions in the labels are
        set to -100 so they do not contribute to the training loss.
    """
    source_text = "summarize: " + example["text"]
    model_inputs = tokenizer(
        source_text,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    target_encoding = tokenizer(
        example["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # Targets are padded to a fixed length; without masking, the model would be
    # trained to predict pad tokens. -100 is the label value the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        -100 if token_id == pad_id else token_id
        for token_id in target_encoding["input_ids"]
    ]
    return model_inputs

# Tokenize every example, one at a time, in the training split and then the
# validation split.
train_dataset, val_dataset = (
    split.map(preprocess, batched=False)
    for split in (train_dataset, val_dataset)
)

# Expose only the tensors the model consumes. set_format restricts which
# columns are returned (as torch tensors); it does not delete the others.
model_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=model_columns)

# STEP 7: TrainingArguments and Trainer
# Hyperparameters for the fine-tuning run, gathered in one place.
training_config = dict(
    output_dir="./t5_ehr_summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=100,
)
training_args = TrainingArguments(**training_config)

# NOTE(review): no evaluation schedule is configured above, so val_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


# STEP 8: Train the Model

trainer.train()




# STEP 9: Inference Helper

def summarize(text):
    """Generate a summary of ``text`` with the fine-tuned model.

    Args:
        text: The note to summarize, as a string. It is prefixed with
            ``"summarize: "`` and truncated to ``max_input_length`` tokens.

    Returns:
        The decoded summary string (special tokens removed), generated with
        4-beam search and capped at ``max_target_length`` tokens.
    """
    # The model may still be in training mode after fine-tuning; switch to
    # eval mode so dropout does not perturb the generated text.
    model.eval()

    # A single sequence needs no padding. Move the whole encoding to wherever
    # the model currently lives, in case training relocated it.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=encoded["input_ids"],
            # Pass the mask explicitly instead of relying on it being inferred.
            attention_mask=encoded["attention_mask"],
            max_length=max_target_length,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# STEP 10: Test Summarization

# Smoke-test the model on the first note in the loaded CSV.
sample_text = df.iloc[0]["text"]
print("Original EHR Text:\n", sample_text)
generated_summary = summarize(sample_text)
print("\nGenerated Summary:\n", generated_summary)


# STEP 11: Save Model

# Write the model weights and the tokenizer files into the same directory so
# they can be reloaded together.
save_dir = "./t5_ehr_summary_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Using device: cpu


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]



Step,Training Loss


Original EHR Text:
 Patient admitted with chest pain. Hx of HTN and DM. BP 150/95. Prescribed aspirin and beta-blocker.

Generated Summary:
 patient admitted with chest pain. aspirin and beta-blocker.


('./t5_ehr_summary_model\\tokenizer_config.json',
 './t5_ehr_summary_model\\special_tokens_map.json',
 './t5_ehr_summary_model\\spiece.model',
 './t5_ehr_summary_model\\added_tokens.json')