<a href="https://colab.research.google.com/github/SezarTheGreat/Advanced-Models/blob/main/Custom_Medical_Summarization_ModelV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ensure you have the required libraries installed.
#!pip install transformers[torch] torch nltk sentence-transformers pandas scikit-learn evaluate rouge_score

In [None]:
# comprehensive_medical_summarizer.py

# Step 1: Installation
# Ensure you have the required libraries installed.
# !pip install transformers[torch] torch nltk sentence-transformers pandas scikit-learn evaluate rouge_score accelerate

import torch
import nltk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from torch.utils.data import Dataset
import pandas as pd
import json
import evaluate # Using the new evaluate library from Hugging Face
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import re
import os

# Download NLTK's sentence tokenizer data if not already present.
# Recent NLTK releases load the 'punkt_tab' resource inside sent_tokenize,
# while older releases load 'punkt'; checking both keeps sent_tokenize
# usable regardless of which NLTK version the runtime ships.
for _punkt_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_punkt_resource}')
    except LookupError:
        nltk.download(_punkt_resource)

# Step 2: Data Loading and Preparation
def load_and_prepare_data(file_path):
    """Read a medical Q&A JSON file and turn it into a training table.

    Only records whose ``labelled_summaries`` contain an
    ``INFORMATION_SUMMARY`` entry are used. For each such record the question
    and all answers are joined into one text, the ``INFORMATION_SUMMARY`` is
    taken as the patient summary, and the first three sentences of
    ``raw_text`` serve as a synthetic clinician summary. Records where any of
    the three texts is blank are dropped.

    Args:
        file_path: Path to a JSON file containing a list of record dicts.

    Returns:
        A DataFrame with the columns ``qa_text``, ``patient_summary`` and
        ``clinician_summary``. It is empty when the file does not exist or
        when no record qualifies.
    """
    if not os.path.exists(file_path):
        print(f"Error: Data file not found at {file_path}")
        return pd.DataFrame()  # callers check `.empty` to detect this case

    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    rows = []
    for record in records:
        # Skip anything that has no information summary to learn from.
        if 'INFORMATION_SUMMARY' not in record.get('labelled_summaries', {}):
            continue

        question = record.get('question', '')
        answers = record.get('answers', [])
        qa_text = question + " " + " ".join(answers)

        patient_summary = record['labelled_summaries']['INFORMATION_SUMMARY']

        # Synthetic clinician summary: the leading sentences of the raw text.
        leading_sentences = nltk.sent_tokenize(record.get('raw_text', ''))[:3]
        clinician_summary = " ".join(leading_sentences)

        candidate = {
            'qa_text': qa_text,
            'patient_summary': patient_summary,
            'clinician_summary': clinician_summary,
        }
        if all(text.strip() for text in candidate.values()):
            rows.append(candidate)

    return pd.DataFrame(rows)


# Step 3: Create a Custom Dataset for Fine-Tuning
class MedicalQADataset(Dataset):
    """PyTorch Dataset for perspective-aware summarization.

    Every DataFrame row produces two examples: even indices pair the Q&A text
    with the patient summary (prefix ``"summarize for patient: "``), odd
    indices pair it with the clinician summary
    (prefix ``"summarize for clinician: "``).

    Args:
        dataframe: Table with ``qa_text``, ``patient_summary`` and
            ``clinician_summary`` columns.
        tokenizer: Hugging Face tokenizer used for both inputs and targets.
        max_length: Length the prefixed input is padded/truncated to.
        max_target_length: Length the target summary is padded/truncated to.
    """
    def __init__(self, dataframe, tokenizer, max_length=1024, max_target_length=256):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        # Two perspectives (patient, clinician) per source row.
        return len(self.data) * 2

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for ``idx``."""
        is_patient_summary = idx % 2 == 0
        row = self.data.iloc[idx // 2]

        if is_patient_summary:
            prefix = "summarize for patient: "
            summary_text = row['patient_summary']
        else:
            prefix = "summarize for clinician: "
            summary_text = row['clinician_summary']

        model_inputs = self.tokenizer(
            prefix + row['qa_text'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # `text_target` replaces the deprecated `as_target_tokenizer()` context
        # manager for tokenizing the decoder-side text.
        labels = self.tokenizer(
            text_target=summary_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"]

        # Replace padding token id in the labels with -100 so it's ignored in the loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100

        # The tokenizer returns a leading batch dimension of size 1; drop only that one.
        return {
            "input_ids": model_inputs["input_ids"].squeeze(0),
            "attention_mask": model_inputs["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

# Step 4: Fine-Tuning the BART Model
def fine_tune_model():
    """Fine-tune the BioBART checkpoint on the perspective-aware summaries.

    Reads ``train.json`` and ``valid.json``, trains for three epochs with the
    Hugging Face ``Trainer`` (evaluating once per epoch) and saves the model
    and tokenizer to the ``fine_tuned_medical_summarizer`` directory.

    Returns:
        ``(model, tokenizer)`` after training, or ``(None, None)`` when the
        training or validation data came back empty.
    """
    # Checkpoint pre-trained on biomedical text, used as the starting point.
    checkpoint = "GanjinZero/biobart-v2-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    print("Loading and preparing datasets...")
    train_df = load_and_prepare_data('train.json')
    valid_df = load_and_prepare_data('valid.json')
    if any(frame.empty for frame in (train_df, valid_df)):
        print("Training or validation data is missing. Aborting fine-tuning.")
        return None, None

    train_dataset = MedicalQADataset(train_df, tokenizer)
    eval_dataset = MedicalQADataset(valid_df, tokenizer)

    # Mixed precision is only switched on when a CUDA device is present.
    use_fp16 = torch.cuda.is_available()
    print(
        "GPU is available. Using FP16 for faster training."
        if use_fp16
        else "GPU not available. Training will run on CPU (this may be slow)."
    )

    trainer_settings = dict(
        output_dir="./medical_summarizer_results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        fp16=use_fp16,
        save_steps=500,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=50,
        eval_strategy="epoch",
    )
    training_args = TrainingArguments(**trainer_settings)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    )

    print("Starting model fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

    # Persist model first, then tokenizer, into the same directory.
    model_path = "fine_tuned_medical_summarizer"
    for artifact in (model, tokenizer):
        artifact.save_pretrained(model_path)
    print(f"Model saved to '{model_path}'")
    return model, tokenizer

# Step 5: Real-time Source Provenance and Credibility
class ProvenanceScorer:
    """Match each summary sentence to its closest source sentence.

    Sentences are embedded with a SentenceTransformer model and compared by
    cosine similarity; the best similarity per summary sentence is reported
    as that sentence's credibility score.
    """
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        print("Provenance Scorer initialized with SentenceTransformer model.")

    def score(self, summary_text, source_text, similarity_threshold=0.4):
        """Score a summary against the text it was generated from.

        Args:
            summary_text: The generated summary.
            source_text: The text the summary should be supported by.
            similarity_threshold: Minimum best-match similarity for a summary
                sentence to be flagged as supported.

        Returns:
            A pair ``(details, overall)``. ``details`` holds one dict per
            summary sentence (``summary_sentence``, ``source_sentence``,
            ``credibility_score``, ``is_supported``); ``overall`` is the mean
            best-match similarity rounded to four decimals. Returns
            ``([], 0.0)`` when either text yields no sentences.
        """
        summary_sentences = nltk.sent_tokenize(summary_text)
        source_sentences = nltk.sent_tokenize(source_text)
        if not (summary_sentences and source_sentences):
            return [], 0.0

        summary_vectors = self.model.encode(summary_sentences, convert_to_tensor=True)
        source_vectors = self.model.encode(source_sentences, convert_to_tensor=True)
        # One row per summary sentence, one column per source sentence.
        similarity = util.cos_sim(summary_vectors, source_vectors)

        details = []
        best_scores = []
        for sentence, row in zip(summary_sentences, similarity):
            top_score = row.max().item()
            top_index = row.argmax().item()
            best_scores.append(top_score)
            details.append({
                "summary_sentence": sentence,
                "source_sentence": source_sentences[top_index],
                "credibility_score": round(top_score, 4),
                "is_supported": top_score >= similarity_threshold,
            })

        overall_confidence = sum(best_scores) / len(summary_sentences)
        return details, round(overall_confidence, 4)

# Step 6: Personalized and Adaptive Inference Function
def generate_summaries(qa_text, model, tokenizer, personalization_options=None):
    """Produce a patient and a clinician summary for one Q&A text.

    Args:
        qa_text: The question-and-answers text to summarize.
        model: Seq2seq model exposing ``to`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        personalization_options: Optional dict. ``focus`` adds a topical focus
            to the patient prompt; ``education == 'high_school'`` asks for
            simpler wording. The clinician prompt is never modified.

    Returns:
        Dict with the keys ``"patient"`` and ``"clinician"`` mapping to the
        decoded summaries.
    """
    options = personalization_options if personalization_options is not None else {}
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    perspectives = (
        ("patient", "summarize for patient"),
        ("clinician", "summarize for clinician"),
    )

    summaries = {}
    for perspective, base_prompt in perspectives:
        prompt_pieces = [base_prompt]
        if perspective == 'patient':
            # Only the patient-facing prompt is personalized.
            focus = options.get('focus')
            education = options.get('education')
            if focus:
                prompt_pieces.append(f" with a focus on {focus}")
            if education == 'high_school':
                prompt_pieces.append(" using simple terms for someone with a high school education")

        input_text = "".join(prompt_pieces) + ": " + qa_text
        encoded = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).to(device)
        generated_ids = model.generate(encoded['input_ids'], num_beams=4, max_length=250, early_stopping=True)
        summaries[perspective] = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return summaries

# Step 7: Accuracy Calculation
def calculate_accuracy(model, tokenizer, test_df):
    """Generate summaries for every test row and print ROUGE per perspective.

    Args:
        model: Fine-tuned seq2seq model passed on to ``generate_summaries``.
        tokenizer: Tokenizer matching ``model``.
        test_df: DataFrame with ``qa_text``, ``patient_summary`` and
            ``clinician_summary`` columns.

    Returns:
        List with one ``{"patient": ..., "clinician": ...}`` dict per test
        row, in row order.
    """
    rouge = evaluate.load("rouge")

    all_predictions = []
    patient_references = []
    clinician_references = []

    print("Evaluating model performance on the test set...")
    for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        all_predictions.append(generate_summaries(row['qa_text'], model, tokenizer))
        patient_references.append(row['patient_summary'])
        clinician_references.append(row['clinician_summary'])

    # (header, key into each prediction dict, matching reference list)
    report_plan = (
        ("\n--- Accuracy (ROUGE F1-Score %) for Patient Summaries ---", 'patient', patient_references),
        ("\n--- Accuracy (ROUGE F1-Score %) for Clinician Summaries ---", 'clinician', clinician_references),
    )
    for header, perspective, references in report_plan:
        predictions = [prediction.get(perspective, '') for prediction in all_predictions]
        print(header)
        scores = rouge.compute(predictions=predictions, references=references)
        print({key: f"{value * 100:.2f}%" for key, value in scores.items()})

    return all_predictions


if __name__ == '__main__':
    # --- IMPORTANT ---
    # Set TRAIN_MODEL to True to run fine-tuning. Once a model has been
    # trained and saved, set it to False to load the saved model and run
    # inference without retraining.
    TRAIN_MODEL = True

    model_path = "fine_tuned_medical_summarizer"
    rule = "=" * 80

    def _report_provenance(scorer, summary, source):
        """Print the overall confidence and the per-sentence provenance of a summary."""
        provenance, confidence = scorer.score(summary, source)
        print(f"\nOverall Summary Confidence: {confidence:.2%}")
        for item in provenance:
            support_status = "✅ Supported" if item['is_supported'] else "⚠️ Not Fully Supported"
            print(f"  - Summary Sentence: '{item['summary_sentence']}'")
            print(f"    - Best Source Match: '{item['source_sentence']}'")
            print(f"    - Credibility: {item['credibility_score']:.2f} ({support_status})")

    # --- Part 1: Fine-tune the model (run this once) ---
    if TRAIN_MODEL:
        print("TRAIN_MODEL is set to True. Starting the fine-tuning process...")
        fine_tune_model()
    else:
        print("TRAIN_MODEL is set to False. Skipping training and attempting to load existing model.")

    # --- Part 2: Load Model and Run Inference ---
    print("\n--- Loading Fine-Tuned Model for Inference ---")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        print("Successfully loaded fine-tuned model.")
    except OSError:
        print(f"Could not find fine-tuned model at '{model_path}'.")
        if not TRAIN_MODEL:
            print("Please set TRAIN_MODEL = True and run the script to train and save the model first.")
        model = None

    if model:
        # --- Part 3: Calculate Accuracy and Get Predictions ---
        test_df = load_and_prepare_data('test.json')
        if test_df.empty:
            print("Test data is empty. Skipping accuracy calculation and demonstration.")
            all_generated_summaries = []
        else:
            all_generated_summaries = calculate_accuracy(model, tokenizer, test_df)

        # --- Part 4: Demonstrate Advanced Features using the first test sample ---
        if all_generated_summaries:
            scorer = ProvenanceScorer()

            demo_qa_text = test_df.iloc[0]['qa_text']
            demo_standard_summary = all_generated_summaries[0]['patient']

            print("\n" + rule)
            print("Demonstrating Advanced Features on First Item from Test Set")
            print(rule)
            print("\nOriginal Q&A Text for Demo:")
            print(demo_qa_text)
            print(rule)

            # DEMO 1: reuse the patient summary already generated during evaluation
            print("\n\n--- DEMO 1: Standard Patient Summary & Provenance ---")
            print(f"\nGenerated Summary:\n{demo_standard_summary}")
            _report_provenance(scorer, demo_standard_summary, demo_qa_text)

            # DEMO 2: regenerate with a treatment focus and simpler wording
            print("\n\n--- DEMO 2: Personalized Summary (Focus on Treatment) & Provenance ---")
            personalization = {'focus': 'treatment options', 'education': 'high_school'}
            personalized_summaries = generate_summaries(
                demo_qa_text, model, tokenizer, personalization_options=personalization
            )
            personalized_summary = personalized_summaries['patient']
            print(f"\nGenerated Personalized Summary:\n{personalized_summary}")
            _report_provenance(scorer, personalized_summary, demo_qa_text)
            print("\n" + rule)

TRAIN_MODEL is set to True. Starting the fine-tuning process...
Loading and preparing datasets...
GPU is available. Using FP16 for faster training.


  trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=data_collator)


Starting model fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjyotishmanjbbarman[0m ([33mjyotishmanjbbarman-gautam-buddha-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,1.3736,1.307405
2,1.198,1.274549
3,0.9084,1.293397




Fine-tuning complete.
Model saved to 'fine_tuned_medical_summarizer'

--- Loading Fine-Tuned Model for Inference ---
Successfully loaded fine-tuned model.


Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating model performance on the test set...


100%|██████████| 486/486 [15:12<00:00,  1.88s/it]



--- Accuracy (ROUGE F1-Score %) for Patient Summaries ---
{'rouge1': '39.39%', 'rouge2': '19.53%', 'rougeL': '30.51%', 'rougeLsum': '30.46%'}

--- Accuracy (ROUGE F1-Score %) for Clinician Summaries ---
{'rouge1': '75.05%', 'rouge2': '68.84%', 'rougeL': '73.82%', 'rougeLsum': '73.96%'}


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Provenance Scorer initialized with SentenceTransformer model.

Demonstrating Advanced Features on First Item from Test Set

Original Q&A Text for Demo:
what is orgasm? An orgasm, also known as a sexual climax, is a pleasurable physical, psychological or emotional response to prolonged sexual stimulation. It is often accompanied by a notable physiological reaction, such as ejaculation, blushing or spasm and may be followed by aftershocks.

Dictionaries still give the subsidiary meaning, "a similar point of intensity of emotional excitement," but as of 2005 this usage has become obscure. It can be startling to modern readers when encountered in older literature.


General
Both males and females can experience orgasm, but the exact response varies across gender. Generally speaking, orgasm is the third stage of four in the human sexual response cycle, which is the currently accepted model of the physiological process of sexual stimulation You asked two questions that you might think are th

In [None]:
# comprehensive_medical_summarizer.py

# Step 1: Installation
# Ensure you have the required libraries installed.
# !pip install transformers[torch] torch nltk sentence-transformers pandas scikit-learn evaluate rouge_score accelerate huggingface_hub

import torch
import nltk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from torch.utils.data import Dataset
import pandas as pd
import json
import evaluate
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import re
import os
from huggingface_hub import login, HfApi

# Download NLTK's sentence tokenizer data if not already present.
# sent_tokenize needs 'punkt' on older NLTK releases and 'punkt_tab' on
# recent ones, so both resources are checked and fetched when missing.
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_tokenizer_resource}')
    except LookupError:
        nltk.download(_tokenizer_resource)

# Step 2: Data Loading and Preparation
def load_and_prepare_data(file_path):
    """Load the medical Q&A JSON file and build summarization examples.

    A record contributes a row only if its ``labelled_summaries`` hold an
    ``INFORMATION_SUMMARY``. The row contains the question joined with all
    answers (``qa_text``), that summary (``patient_summary``) and the first
    three sentences of ``raw_text`` (``clinician_summary``). Rows with any
    blank field are discarded.

    Args:
        file_path: Location of the JSON file (a list of record dicts).

    Returns:
        DataFrame of the collected rows; empty if the file is missing or no
        record qualifies.
    """
    file_missing = not os.path.exists(file_path)
    if file_missing:
        print(f"Error: Data file not found at {file_path}")
        return pd.DataFrame()

    with open(file_path, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    collected = []
    for entry in entries:
        has_summary = 'INFORMATION_SUMMARY' in entry.get('labelled_summaries', {})
        if not has_summary:
            continue

        qa_text = entry.get('question', '') + " " + " ".join(entry.get('answers', []))
        patient_summary = entry['labelled_summaries']['INFORMATION_SUMMARY']
        sentences = nltk.sent_tokenize(entry.get('raw_text', ''))
        # The first three raw-text sentences stand in for a clinician summary.
        clinician_summary = " ".join(sentences[:3])

        if not qa_text.strip():
            continue
        if not patient_summary.strip():
            continue
        if not clinician_summary.strip():
            continue

        collected.append({
            'qa_text': qa_text,
            'patient_summary': patient_summary,
            'clinician_summary': clinician_summary,
        })

    return pd.DataFrame(collected)


# Step 3: Create a Custom Dataset for Fine-Tuning
class MedicalQADataset(Dataset):
    """PyTorch Dataset for perspective-aware summarization.

    Each row of the DataFrame is exposed twice: index ``2 * i`` is the
    patient example (prefix ``"summarize for patient: "``, target
    ``patient_summary``) and index ``2 * i + 1`` is the clinician example
    (prefix ``"summarize for clinician: "``, target ``clinician_summary``).

    Args:
        dataframe: Table with ``qa_text``, ``patient_summary`` and
            ``clinician_summary`` columns.
        tokenizer: Hugging Face tokenizer used for inputs and targets.
        max_length: Padded/truncated length of the prefixed input.
        max_target_length: Padded/truncated length of the target summary.
    """
    def __init__(self, dataframe, tokenizer, max_length=1024, max_target_length=256):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        # Patient and clinician variants double the number of examples.
        return len(self.data) * 2

    def __getitem__(self, idx):
        """Build the tokenized example (``input_ids``, ``attention_mask``, ``labels``) for ``idx``."""
        is_patient_summary = idx % 2 == 0
        original_idx = idx // 2
        row = self.data.iloc[original_idx]
        qa_text = row['qa_text']

        if is_patient_summary:
            prefix = "summarize for patient: "
            summary_text = row['patient_summary']
        else:
            prefix = "summarize for clinician: "
            summary_text = row['clinician_summary']

        input_text = prefix + qa_text
        model_inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Tokenize the target through `text_target`; the older
        # `as_target_tokenizer()` context manager is deprecated.
        target_encoding = self.tokenizer(
            text_target=summary_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding["input_ids"]

        # -100 marks padded label positions so the loss ignores them.
        labels[labels == self.tokenizer.pad_token_id] = -100

        # Remove only the size-1 batch dimension the tokenizer adds.
        return {
            "input_ids": model_inputs["input_ids"].squeeze(0),
            "attention_mask": model_inputs["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

# Step 4: Fine-Tuning the BART Model
def fine_tune_model(hub_model_id=None):
    """Fine-tune BioBART on the summarization data and optionally push it to the Hub.

    Reads ``train.json`` and ``valid.json``, trains for three epochs
    (evaluating and checkpointing once per epoch), and saves the model and
    tokenizer to the local ``fine_tuned_medical_summarizer`` directory.

    Args:
        hub_model_id: Hugging Face Hub repository id to upload to. When
            ``None`` (the default) nothing is pushed.

    Returns:
        ``(model, tokenizer)`` after training, or ``(None, None)`` when the
        training or validation data is empty.
    """
    model_name = "GanjinZero/biobart-v2-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    print("Loading and preparing datasets...")
    train_df = load_and_prepare_data('train.json')
    valid_df = load_and_prepare_data('valid.json')

    if train_df.empty or valid_df.empty:
        print("Training or validation data is missing. Aborting fine-tuning.")
        return None, None

    train_dataset = MedicalQADataset(train_df, tokenizer)
    eval_dataset = MedicalQADataset(valid_df, tokenizer)

    # Mixed precision is enabled only when a CUDA device is available.
    is_gpu_available = torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir="./medical_summarizer_results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        fp16=is_gpu_available,
        save_strategy="epoch",
        logging_dir='./logs',
        logging_steps=50,
        # `eval_strategy` is the current name of this argument; the older
        # `evaluation_strategy` spelling is the one the earlier cell of this
        # notebook had to be changed away from.
        eval_strategy="epoch",
        push_to_hub=hub_model_id is not None,  # Enable push to hub if an ID is provided
        hub_model_id=hub_model_id,
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    print("Starting model fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

    # Always keep a local copy, independent of any Hub upload.
    model_path = "fine_tuned_medical_summarizer"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print(f"Model saved locally to '{model_path}'")

    if hub_model_id:
        print(f"Uploading model and tokenizer to Hugging Face Hub: {hub_model_id}")
        trainer.push_to_hub()
        print("--- ✅ Model successfully uploaded! ---")

    return model, tokenizer

# ProvenanceScorer, generate_summaries, and calculate_accuracy: carried over from the previous cell with minor changes (no init message, shorter personalization prompt, raw ROUGE output).
class ProvenanceScorer:
    """Link each summary sentence to its most similar source sentence."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def score(self, summary_text, source_text, similarity_threshold=0.4):
        """Compare a summary with its source sentence by sentence.

        Args:
            summary_text: Generated summary to check.
            source_text: Text the summary was generated from.
            similarity_threshold: Best-match cosine similarity at or above
                which a summary sentence is marked as supported.

        Returns:
            ``(details, overall)`` where ``details`` is a list of dicts with
            ``summary_sentence``, ``source_sentence``, ``credibility_score``
            and ``is_supported``, and ``overall`` is the mean best-match
            similarity rounded to four decimals. ``([], 0.0)`` is returned
            when either text contains no sentences.
        """
        summary_sentences = nltk.sent_tokenize(summary_text)
        source_sentences = nltk.sent_tokenize(source_text)
        if not (summary_sentences and source_sentences):
            return [], 0.0

        summary_embeddings = self.model.encode(summary_sentences, convert_to_tensor=True)
        source_embeddings = self.model.encode(source_sentences, convert_to_tensor=True)
        # Rows follow summary sentences, columns follow source sentences.
        cosine_scores = util.cos_sim(summary_embeddings, source_embeddings)

        details = []
        best_scores = []
        for sentence, similarities in zip(summary_sentences, cosine_scores):
            best_score = similarities.max().item()
            best_index = similarities.argmax().item()
            best_scores.append(best_score)
            details.append({
                "summary_sentence": sentence,
                "source_sentence": source_sentences[best_index],
                "credibility_score": round(best_score, 4),
                "is_supported": best_score >= similarity_threshold,
            })

        mean_score = sum(best_scores) / len(summary_sentences)
        return details, round(mean_score, 4)
def generate_summaries(qa_text, model, tokenizer, personalization_options=None):
    """Generate the patient and clinician summaries for one Q&A text.

    Args:
        qa_text: Question-and-answers text to summarize.
        model: Seq2seq model exposing ``to`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        personalization_options: Optional dict; ``focus`` and
            ``education == 'high_school'`` extend the patient prompt only.

    Returns:
        Dict mapping ``"patient"`` and ``"clinician"`` to the decoded
        summaries.
    """
    options = {} if personalization_options is None else personalization_options
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    summaries = {}
    for perspective in ("patient", "clinician"):
        if perspective == 'patient':
            prompt = "summarize for patient"
            # Personalization only applies to the patient-facing prompt.
            focus = options.get('focus')
            education = options.get('education')
            if focus:
                prompt = prompt + f" with a focus on {focus}"
            if education == 'high_school':
                prompt = prompt + " using simple terms"
        else:
            prompt = "summarize for clinician"

        encoded = tokenizer(
            prompt + ": " + qa_text,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
        ).to(device)
        generated_ids = model.generate(
            encoded['input_ids'],
            num_beams=4,
            max_length=250,
            early_stopping=True,
        )
        summaries[perspective] = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return summaries
def calculate_accuracy(model, tokenizer, test_df):
    """Run the model over the test set and print ROUGE for both perspectives.

    Args:
        model: Fine-tuned seq2seq model handed to ``generate_summaries``.
        tokenizer: Tokenizer matching ``model``.
        test_df: DataFrame with ``qa_text``, ``patient_summary`` and
            ``clinician_summary`` columns.

    Returns:
        One ``{"patient": ..., "clinician": ...}`` dict per test row, in
        row order.
    """
    rouge = evaluate.load("rouge")

    all_predictions = []
    patient_references = []
    clinician_references = []

    print("Evaluating model performance on the test set...")
    progress = tqdm(test_df.iterrows(), total=test_df.shape[0])
    for _, row in progress:
        all_predictions.append(generate_summaries(row['qa_text'], model, tokenizer))
        patient_references.append(row['patient_summary'])
        clinician_references.append(row['clinician_summary'])

    # (header, key into each prediction dict, matching reference list)
    sections = (
        ("\n--- Accuracy (ROUGE F1) for Patient Summaries ---", 'patient', patient_references),
        ("\n--- Accuracy (ROUGE F1) for Clinician Summaries ---", 'clinician', clinician_references),
    )
    for header, perspective, references in sections:
        predictions = [prediction.get(perspective, '') for prediction in all_predictions]
        print(header)
        print(rouge.compute(predictions=predictions, references=references))

    return all_predictions


if __name__ == '__main__':
    # --- Configuration ---
    TRAIN_MODEL = False
    UPLOAD_TO_HUB = True  # SET THIS TO TRUE TO UPLOAD YOUR MODEL
    # IMPORTANT: REPLACE with your Hugging Face username and a model name.
    # Hub repository names may only contain letters, digits, '-', '_' and '.',
    # so the name must not contain '&'.
    HUB_MODEL_ID = "SezarTheGreat/medical-QA-summarizer"

    if UPLOAD_TO_HUB:
        print("--- Hugging Face Hub Login ---")
        print("You will be prompted to enter your write-access token.")
        login()

    model_to_use, tokenizer_to_use = None, None
    if TRAIN_MODEL:
        # fine_tune_model uploads by itself when it is given a hub id.
        model_to_use, tokenizer_to_use = fine_tune_model(
            hub_model_id=HUB_MODEL_ID if UPLOAD_TO_HUB else None
        )

    # --- Load Model and Run Inference ---
    if model_to_use is None:
        print("\nLoading model from local files for inference...")
        model_path = "fine_tuned_medical_summarizer"
        try:
            loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
            loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
        except OSError:
            print(f"Could not find fine-tuned model at '{model_path}'.")
            print("Please set TRAIN_MODEL = True and run the script to train a model first.")
        else:
            # Assign both together so a failed tokenizer load can never leave
            # a model paired with a missing tokenizer.
            model_to_use, tokenizer_to_use = loaded_model, loaded_tokenizer
            print("Successfully loaded fine-tuned model.")

            if UPLOAD_TO_HUB:
                # The training path did not upload anything in this run, so
                # push the locally saved model here; otherwise UPLOAD_TO_HUB
                # would only trigger the login prompt and nothing else.
                print(f"Uploading model and tokenizer to Hugging Face Hub: {HUB_MODEL_ID}")
                model_to_use.push_to_hub(HUB_MODEL_ID)
                tokenizer_to_use.push_to_hub(HUB_MODEL_ID)
                print("--- ✅ Model successfully uploaded! ---")

    if model_to_use is not None:
        test_df = load_and_prepare_data('test.json')
        if not test_df.empty:
            calculate_accuracy(model_to_use, tokenizer_to_use, test_df)
        else:
            print("Test data is empty. Skipping accuracy calculation.")

--- Hugging Face Hub Login ---
You will be prompted to enter your write-access token.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


Loading model from local files for inference...
Successfully loaded fine-tuned model.
Evaluating model performance on the test set...


 79%|███████▉  | 383/486 [11:57<02:21,  1.37s/it]