# trial1

In [None]:
# Install required libraries
!pip install transformers torch pandas pytesseract pillow scikit-learn

import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split # Changed import statement to correct location
import pytesseract
from PIL import Image

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [None]:
# Load and preprocess data
df = pd.read_csv('/content/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df.columns = ['text', 'summary']

# Clean text function
def clean_text(text):
    return text.strip().replace('\n', ' ').replace('\r', ' ').replace('  ', ' ')

df['text'] = df['text'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)

# Split data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Dataset preparation
def tokenize_data(examples):
    inputs = [doc for doc in examples['text']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Convert pandas DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df).map(tokenize_data, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_data, batched=True)

# Model initialization
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
trainer.train()

# Save model
model.save_pretrained('./medical_summarizer')
tokenizer.save_pretrained('./medical_summarizer')

# OCR function
def ocr_to_text(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return clean_text(text)

# Summarization function
def summarize(text, model, tokenizer):
    inputs = tokenizer.encode("summarize: " + text,
                             return_tensors='pt',
                             max_length=512,
                             truncation=True)

    outputs = model.generate(inputs,
                            max_length=150,
                            min_length=40,
                            length_penalty=2.0,
                            num_beams=4,
                            early_stopping=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test function
def test_summarizer(input_source):
    if isinstance(input_source, str) and input_source.endswith(('.png', '.jpg', '.jpeg')):
        text = ocr_to_text(input_source)
    else:
        text = input_source

    summary = summarize(text, model, tokenizer)

    print(f"Input Text:\n{text}\n")
    print(f"Generated Summary:\n{summary}")

# Example usage
test_report = df.iloc[0]['text']
test_summarizer(test_report)

# To test with an image
# test_summarizer('/content/report_image.png')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


NameError: name 'Dataset' is not defined

# pips

In [1]:
!pip install transformers==4.51.3 datasets evaluate easyocr
!pip install sentencepiece
!pip install -U scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->easyocr)
  Down

# trial2

In [None]:
import torch
import easyocr
import re
import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
reader = easyocr.Reader(['en'])

# Normalize medical abbreviations
medical_terms = {
    r'\bh/o\b': 'history of',
    r'\bc/o\b': 'complains of',
    r'\bw/\b': 'with',
    r'\bs/p\b': 'status post'
}

def clean_text(text):
    text = text.strip().lower()
    for abbr, full in medical_terms.items():
        text = re.sub(abbr, full, text)
    return ' '.join(text.split())

# Improved OCR function with paragraph mode
def ocr_to_text(image_path):
    try:
        result = reader.readtext(image_path, detail=0, paragraph=True)
        return clean_text('\n'.join(result))
    except Exception as e:
        print(f"OCR Error: {str(e)}")
        return ""



Progress: |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [None]:
def summarize(text, model, tokenizer):
    inputs = tokenizer.encode("summarize: " + text,
                              return_tensors='pt',
                              max_length=512,
                              truncation=True).to(model.device)

    outputs = model.generate(
        inputs,
        max_length=200,
        min_length=50,
        length_penalty=2.0,
        num_beams=6,
        early_stopping=True,
        temperature=0.9,
        repetition_penalty=1.5
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Example test function for a single image
def test_summarizer(image_path):
    text = ocr_to_text(image_path)
    print("üìù Extracted Text:\n", text)
    if text:
        summary = summarize(text, model, tokenizer)
        print("\nüîç Summary:\n", summary)

In [None]:
model_name = "t5-small"  # use t5-base or t5-large if resources allow
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Simulated medical data example
data = {
    "text": [
        "Patient c/o chest pain w/ SOB. H/O hypertension and diabetes. On examination: BP 160/90, HR 90.",
        "Pt h/o asthma, presents w/ cough and wheezing. No fever. Chest exam shows ronchi."
    ],
    "summary": [
        "Patient has chest pain and shortness of breath, with a history of hypertension and diabetes.",
        "Asthmatic patient with cough and wheezing. No fever, ronchi on chest exam."
    ]
}

df = pd.DataFrame(data)
df['text'] = df['text'].apply(clean_text)

train_texts, val_texts = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_texts)
val_dataset = Dataset.from_pandas(val_texts)

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=100,
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshristi-college-mishra[0m ([33mshristi-college-mishra-navrachana-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=5, training_loss=13.298249816894531, metrics={'train_runtime': 230.194, 'train_samples_per_second': 0.022, 'train_steps_per_second': 0.022, 'total_flos': 676709007360.0, 'train_loss': 13.298249816894531, 'epoch': 5.0})

In [None]:
def plot_training(log_path='./logs/events.out.tfevents'):
    try:
        logs = pd.read_json(log_path, lines=True)
        plt.plot(logs['loss'], label='Train Loss')
        plt.plot(logs.get('eval_loss', []), label='Eval Loss')
        plt.legend()
        plt.title("Training Losses")
        plt.show()
    except:
        print("Could not parse training logs")

# If using HuggingFace's logs.json:
# plot_training('./results/trainer_state.json')

In [None]:
# Download and test
!wget -O sample.jpg https://upload.wikimedia.org/wikipedia/commons/2/25/Example_medical_text.jpg
test_summarizer('/content/1.png')

--2025-05-14 10:03:10--  https://upload.wikimedia.org/wikipedia/commons/2/25/Example_medical_text.jpg
Resolving upload.wikimedia.org (upload.wikimedia.org)... 208.80.154.240, 2620:0:861:ed1a::2:b
Connecting to upload.wikimedia.org (upload.wikimedia.org)|208.80.154.240|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-05-14 10:03:10 ERROR 404: Not Found.

üìù Extracted Text:
 nabonal rclcrcukc lab: sedor-ib eod [ ratt frow dclhl ti0 085 e7ol ' aip3tha3240, ros.cm om ovcoowepsthlabs com 3 dn lal pathlals 4& 547 - radha dla ghostics leuei e7l road, jhunsl awas vikas colony yojna-2 nr police chowki; allahabad tet ka^ dtky cmadd hame m. meeraj 0jha collected receued reported 1/8/2017 8:21:00am 1/8/2017 8.37:03am 1/8/2017 6.52:jopm final lab ilo_ 242862201 age 27 years gender: rd by : dr: vk.paiidey male alc status report status test hame results units bio- ref: interval complete blood count (cbc) hemodlobin qcl 13,00 17.00 40.00 50.00 packed volume cv) rbc count




üîç Summary:
 nabonal rclcukc lab: sedor-ib eod [ ratt frow dclhl ti0 085 e7ol'aip3tha3240, ros.cm om ovcoowepsthlabs com 3 dn lal pathlals 4& 547 - radha dla ghostics leuei e7l


# trial3

In [None]:
# --- Install dependencies (for Colab only) ---
# !pip install transformers datasets sklearn easyocr pandas matplotlib

# --- Imports ---
import torch
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

# --- Step 1: Load CSV ---
csv_path = '/content/mtsamples.csv'  # Change to your CSV path
df = pd.read_csv(csv_path)

# --- Step 2: Clean and Normalize Medical Text ---
medical_terms = {
    r'\bh/o\b': 'history of',
    r'\bc/o\b': 'complains of',
    r'\bw/\b': 'with',
    r'\bs/p\b': 'status post',
    r'\bpt\b': 'patient',
    r'\bhtn\b': 'hypertension',
    r'\bdm\b': 'diabetes mellitus'
}

def clean_text(text):
    text = str(text).strip().lower()
    for abbr, full in medical_terms.items():
        text = re.sub(abbr, full, text)
    return ' '.join(text.split())

df = df.dropna(subset=['transcription', 'description'])  # Ensure no NaNs
df['text'] = df['transcription'].apply(clean_text)
df['summary'] = df['description'].apply(clean_text)
df = df[['text', 'summary']]  # Only keep necessary columns

# --- Step 3: Split and Prepare Dataset ---
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# --- Step 4: Load Model & Tokenizer ---
model_name = "t5-small"  # Use 't5-base' or 't5-large' if resources permit
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# --- Step 5: Preprocessing Function ---
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# --- Step 6: Training Arguments ---
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    fp16=torch.cuda.is_available()
)

# --- Step 7: Trainer Setup ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# --- Step 8: Train the Model ---
trainer.train()

# --- Step 9: Summarization Function ---
def summarize(text, model, tokenizer):
    inputs = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True).to(model.device)
    outputs = model.generate(
        inputs,
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=6,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Step 10: Test Summarizer ---
def test_on_sample(index=0):
    raw_text = val_df.iloc[index]['text']
    ground_truth = val_df.iloc[index]['summary']
    print(f"üìÑ Input Text:\n{raw_text[:500]}...\n")
    print(f"‚úÖ Ground Truth Summary:\n{ground_truth}\n")
    generated = summarize(raw_text, model, tokenizer)
    print(f"üîç Generated Summary:\n{generated}\n")

# --- Test a few samples ---
for i in range(3):
    print("="*60)
    test_on_sample(i)

# --- Optional: Plot Loss if available ---
def plot_training(log_dir='./logs'):
    try:
        from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
        import os, glob

        event_path = glob.glob(f"{log_dir}/events.out.tfevents.*")[0]
        ea = EventAccumulator(event_path)
        ea.Reload()
        loss = ea.Scalars('loss')
        steps = [x.step for x in loss]
        values = [x.value for x in loss]
        plt.plot(steps, values)
        plt.title("Training Loss")
        plt.xlabel("Steps")
        plt.ylabel("Loss")
        plt.show()
    except Exception as e:
        print("Could not plot training loss:", e)


Map:   0%|          | 0/3972 [00:00<?, ? examples/s]



Map:   0%|          | 0/994 [00:00<?, ? examples/s]

Step,Training Loss
10,12.0291
20,8.7539
30,5.4487
40,3.0475
50,1.7237
60,1.6501
70,1.1528
80,0.8426
90,0.7026
100,0.4711


KeyboardInterrupt: 

In [4]:
# --- Install dependencies (for Colab only) ---
# !pip install transformers datasets sklearn pandas matplotlib

# --- Imports ---
import torch
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

# --- Step 1: Load and Clean Dataset ---
csv_path = '/content/mtsamples.csv'  # Adjust path if needed
df = pd.read_csv(csv_path)
df = df.dropna(subset=['transcription', 'description'])

# Medical abbreviation expansion
medical_terms = {
    r'\bh/o\b': 'history of',
    r'\bc/o\b': 'complains of',
    r'\bw/\b': 'with',
    r'\bs/p\b': 'status post',
    r'\bpt\b': 'patient',
    r'\bhtn\b': 'hypertension',
    r'\bdm\b': 'diabetes mellitus'
}

def clean_text(text):
    text = str(text).strip().lower()
    for abbr, full in medical_terms.items():
        text = re.sub(abbr, full, text)
    return ' '.join(text.split())

# Apply cleaning
df['text'] = df['transcription'].apply(clean_text)
df['summary'] = df['description'].apply(clean_text)
df = df[['text', 'summary']]

# --- Step 2: Subset and Split ---
df = df.sample(n=2000, random_state=42)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# --- Step 3: Load Pretrained Model ---
model_name = "mrm8488/t5-small-finetuned-common_gen"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# --- Step 4: Preprocessing ---
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["summary"], max_length=100, truncation=True, padding="max_length")["input_ids"]
    labels = [[(token if token != tokenizer.pad_token_id else -100) for token in label] for label in labels]

    model_inputs["labels"] = labels
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# --- Step 5: Training Arguments ---
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    fp16=torch.cuda.is_available()
)

# --- Step 6: Trainer Setup ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# --- Step 7: Train ---
trainer.train()

# --- Step 8: Summarization Function ---
def summarize(text, model, tokenizer):
    model.eval()
    inputs = tokenizer("summarize: " + text, return_tensors='pt', max_length=512, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=100,
            min_length=20,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Step 9: Test on Sample ---
def test_on_sample(index=0):
    raw_text = val_df.iloc[index]['text']
    ground_truth = val_df.iloc[index]['summary']
    print(f"üìÑ Input Text:\n{raw_text[:500]}...\n")
    print(f"‚úÖ Ground Truth Summary:\n{ground_truth}\n")
    generated = summarize(raw_text, model, tokenizer)
    print(f"üîç Generated Summary:\n{generated}\n")

# --- Run Test ---
for i in range(3):
    print("=" * 60)
    test_on_sample(i)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Step,Training Loss
10,3.0201
20,2.5778
30,1.687
40,1.7777
50,1.7375
60,1.312
70,1.4562
80,1.5076
90,1.4818
100,1.411


üìÑ Input Text:
preoperative diagnosis:, brain tumor left temporal lobe.,postoperative diagnosis:, brain tumor left temporal lobe - glioblastoma multiforme.,operative procedure:,1. left temporal craniotomy.,2. removal of brain tumor.,operating microscope: , stealth.,procedure: , the patient was placed in the supine position, shoulder roll, and the head was turned to the right side. the entire left scalp was prepped and draped in the usual fashion after having being placed in 2-point skeletal fixation. next, we ...

‚úÖ Ground Truth Summary:
left temporal craniotomy and removal of brain tumor.

üîç Generated Summary:
left temporal craniotomy and removal of brain tumor. left supine position, shoulder roll, and the head was turned to the right side. the entire left scalp was prepped and draped in the usual fashion after having being placed in 2-point skeletal fixation.

üìÑ Input Text:
medications: , plavix, atenolol, lipitor, and folic acid.,clinical history: ,this is a 41-year-old ma

In [6]:
# --- Imports ---
import torch
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

# --- Step 1: Load and Clean Dataset ---
csv_path = '/content/mtsamples.csv'  # Adjust path if needed
df = pd.read_csv(csv_path)
df = df.dropna(subset=['transcription', 'description'])

# Medical abbreviation expansion
medical_terms = {
    r'\bh/o\b': 'history of',
    r'\bc/o\b': 'complains of',
    r'\bw/\b': 'with',
    r'\bs/p\b': 'status post',
    r'\bpt\b': 'patient',
    r'\bhtn\b': 'hypertension',
    r'\bdm\b': 'diabetes mellitus'
}

def clean_text(text):
    text = str(text).strip().lower()
    for abbr, full in medical_terms.items():
        text = re.sub(abbr, full, text)
    return ' '.join(text.split())

# Apply cleaning
df['text'] = df['transcription'].apply(clean_text)
df['summary'] = df['description'].apply(clean_text)
df = df[['text', 'summary']]

# --- Step 2: Subset and Split ---
df = df.sample(n=2000, random_state=42)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# --- Step 3: Load Pretrained Model ---
model_name = "mrm8488/t5-small-finetuned-common_gen"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# --- Step 4: Preprocessing ---
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["summary"], max_length=100, truncation=True, padding="max_length")["input_ids"]
    labels = [[(token if token != tokenizer.pad_token_id else -100) for token in label] for label in labels]

    model_inputs["labels"] = labels
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# --- Step 5: Training Arguments ---
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    fp16=torch.cuda.is_available()
)

# --- Step 6: Trainer Setup ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# --- Step 7: Train ---
trainer.train()

# --- Step 7.1: Save the fine-tuned model and tokenizer ---
save_directory = "./saved_t5_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# --- Step 8: Summarization Function ---
def summarize(text, model, tokenizer):
    model.eval()
    inputs = tokenizer("summarize: " + text, return_tensors='pt', max_length=512, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=100,
            min_length=20,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Step 9: Test on Sample ---
def test_on_sample(index=0):
    raw_text = val_df.iloc[index]['text']
    ground_truth = val_df.iloc[index]['summary']
    print(f"üìÑ Input Text:\n{raw_text[:500]}...\n")
    print(f"‚úÖ Ground Truth Summary:\n{ground_truth}\n")
    generated = summarize(raw_text, model, tokenizer)
    print(f"üîç Generated Summary:\n{generated}\n")

# --- Run Test ---
for i in range(3):
    print("=" * 60)
    test_on_sample(i)

# --- Step 10: Summarization with Saved Model ---
def summarize_with_saved_model():
    print("\nüîÅ Load saved model and summarize new input\n")
    from transformers import T5Tokenizer, T5ForConditionalGeneration

    saved_model_path = "./saved_t5_model"
    tokenizer = T5Tokenizer.from_pretrained(saved_model_path)
    model = T5ForConditionalGeneration.from_pretrained(saved_model_path)
    model = model.to(device)

    user_input = input("üì• Enter raw medical transcription text:\n> ")
    cleaned_input = clean_text(user_input)
    summary = summarize(cleaned_input, model, tokenizer)
    print(f"\nüìù Summary:\n{summary}\n")

# --- Optional: Uncomment below to test user input summarization ---
summarize_with_saved_model()

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Step,Training Loss
10,3.0201
20,2.5778
30,1.687
40,1.7777
50,1.7375
60,1.312
70,1.4562
80,1.5076
90,1.4818
100,1.411


Step,Training Loss
10,3.0201
20,2.5778
30,1.687
40,1.7777
50,1.7375
60,1.312
70,1.4562
80,1.5076
90,1.4818
100,1.411


üìÑ Input Text:
preoperative diagnosis:, brain tumor left temporal lobe.,postoperative diagnosis:, brain tumor left temporal lobe - glioblastoma multiforme.,operative procedure:,1. left temporal craniotomy.,2. removal of brain tumor.,operating microscope: , stealth.,procedure: , the patient was placed in the supine position, shoulder roll, and the head was turned to the right side. the entire left scalp was prepped and draped in the usual fashion after having being placed in 2-point skeletal fixation. next, we ...

‚úÖ Ground Truth Summary:
left temporal craniotomy and removal of brain tumor.

üîç Generated Summary:
left temporal craniotomy and removal of brain tumor. left supine position, shoulder roll, and the head was turned to the right side. the entire left scalp was prepped and draped in the usual fashion after having being placed in 2-point skeletal fixation.

üìÑ Input Text:
medications: , plavix, atenolol, lipitor, and folic acid.,clinical history: ,this is a 41-year-old ma