In [None]:
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch

from datasets import Dataset as HFDataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# All models used here are public and free from Hugging Face Hub - no API keys needed.
# First-time downloads may take a few minutes depending on connection speed.

In [None]:
import json

# Load the notebook document from disk as plain JSON
with open('work.ipynb', 'r', encoding='utf-8') as notebook_in:
    notebook = json.load(notebook_in)

# Drop the 'widgets' entry from the notebook-level metadata when it is present
notebook_metadata = notebook['metadata'] if 'metadata' in notebook else {}
if 'widgets' in notebook_metadata:
    notebook_metadata.pop('widgets')
    print("Removed widgets metadata")

# Overwrite the same file with the (possibly modified) document
with open('work.ipynb', 'w', encoding='utf-8') as notebook_out:
    json.dump(notebook, notebook_out, indent=1, ensure_ascii=False)

print("Notebook cleaned successfully!")

In [None]:

@dataclass
class MedicalTextDataset:
    """
    Bundle of the train/validation/test splits together with the label lookup tables.

    Keeping the three DataFrames and both directions of the label mapping in a
    single object lets one value be handed to the training functions instead of
    five separate arguments.
    """
    # Training split.
    train_df: pd.DataFrame
    # Validation split.
    val_df: pd.DataFrame
    # Test split.
    test_df: pd.DataFrame
    # Lookup from raw label to normalized class id.
    label_to_id: Dict[int, int]
    # Reverse lookup from normalized class id to raw label.
    id_to_label: Dict[int, int]


In [None]:
def load_dat_file(file_path: str, has_labels: bool = True) -> pd.DataFrame:
    """
    Read a UTF-8 ``.dat`` file into a DataFrame with ``text`` and ``raw_label`` columns.

    Blank lines are ignored. With ``has_labels=True`` every line must look like
    ``<integer label><TAB><text>``; only the first tab separates the two, so any
    further tabs stay inside the text. Lines without a tab, or whose label is not
    an integer, are reported with a warning and skipped. With ``has_labels=False``
    each whole (stripped) line is the text and ``raw_label`` is ``None``.
    """
    texts: List[str] = []
    labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if not record:
                continue

            # Unlabeled files: the entire line is the text.
            if not has_labels:
                texts.append(record)
                continue

            label_str, tab, text = record.partition("\t")
            if not tab:
                print(f"Warning: Skipping malformed line in {file_path}: {record[:120]}...")
                continue

            try:
                label_int = int(label_str)
            except ValueError:
                print(f"Warning: Skipping line with non-integer label in {file_path}: {record[:120]}...")
                continue

            labels.append(label_int)
            texts.append(text.strip())

    # A scalar None is broadcast over every row for unlabeled input.
    raw_label_column = labels if has_labels else None
    return pd.DataFrame({"text": texts, "raw_label": raw_label_column})


In [None]:
def _first_line_has_int_label(file_path: str) -> bool:
    """Return True when the first non-blank line of ``file_path`` looks like ``<int><TAB><text>``."""
    with open(file_path, "r", encoding="utf-8") as f:
        first_line = next((s for s in (line.strip() for line in f) if s), "")

    label_part, tab, _ = first_line.partition("\t")
    if not tab:
        return False
    try:
        int(label_part)
    except ValueError:
        return False
    return True


def load_medical_text_dataset(
    base_directory: str = ".",
    data_subdirectory: str = "data",
    validation_fraction: float = 0.2,
    random_seed: int = 42,
) -> MedicalTextDataset:
    """
    Load ``train.dat``/``test.dat``, normalize labels and build train/val/test splits.

    Steps:
      1. Read ``<base_directory>/<data_subdirectory>/train.dat`` (labeled) and
         ``test.dat`` (labels auto-detected from its first non-blank line).
      2. Map the sorted raw training labels to ids ``0..N-1``.
      3. Split the training data into train/validation with a stratified split.
      4. Write the three splits as CSV and the label mapping as JSON to
         ``<base_directory>/processed_data``.

    Args:
        base_directory: Root directory for input and output files.
        data_subdirectory: Subdirectory of ``base_directory`` holding the ``.dat`` files.
        validation_fraction: Share of the training data held out for validation,
            strictly between 0 and 1.
        random_seed: Seed passed to the train/validation split.

    Returns:
        A ``MedicalTextDataset`` with the three splits (columns ``text`` and
        ``label``) and both label mappings.

    Raises:
        FileNotFoundError: If ``train.dat`` or ``test.dat`` does not exist.
        ValueError: If ``validation_fraction`` is outside (0, 1) or the training
            file yields no valid examples.
    """
    if not 0 < validation_fraction < 1:
        raise ValueError(
            f"validation_fraction must be between 0 and 1 (exclusive), got {validation_fraction}"
        )

    train_file_path = os.path.join(base_directory, data_subdirectory, "train.dat")
    test_file_path = os.path.join(base_directory, data_subdirectory, "test.dat")

    # Explicit exceptions instead of `assert`: assertions are removed when Python
    # runs with -O, which would let a missing file slip through to a later error.
    for required_path, file_name in ((train_file_path, "train.dat"), (test_file_path, "test.dat")):
        if not os.path.exists(required_path):
            raise FileNotFoundError(f"Error: {file_name} not found at {required_path}")

    print(f"Loading training data from: {train_file_path}")
    train_df = load_dat_file(train_file_path, has_labels=True)
    print(f"  ✓ Loaded {len(train_df)} training examples")
    if train_df.empty:
        raise ValueError(f"No valid labeled examples found in {train_file_path}")

    print(f"Loading test data from: {test_file_path}")
    # Auto-detect if test file has labels by checking the first line
    test_has_labels = _first_line_has_int_label(test_file_path)
    test_df = load_dat_file(test_file_path, has_labels=test_has_labels)
    print(f"  ✓ Loaded {len(test_df)} test examples")
    if not test_has_labels:
        print(f"  ⚠ Note: Test file has no labels (unlabeled data for prediction)")

    # Build label mapping from training data only to ensure consistency
    unique_labels = sorted(int(raw) for raw in train_df["raw_label"].dropna().unique())
    print(f"\nUnique raw labels found in training data: {unique_labels}")
    print(f"Total number of classes: {len(unique_labels)}")

    # Normalize labels to 0..N-1 for model compatibility
    # Using Python int to avoid JSON serialization issues with numpy types
    label_to_id = {int(raw): int(idx) for idx, raw in enumerate(unique_labels)}
    id_to_label = {int(idx): int(raw) for raw, idx in label_to_id.items()}

    train_df["label"] = train_df["raw_label"].map(label_to_id)

    if test_df["raw_label"].notna().any():
        test_df["label"] = test_df["raw_label"].map(label_to_id)
        # Raw test labels missing from the training mapping become NaN; surface
        # that instead of letting it pass silently.
        unseen_count = int(test_df["label"].isna().sum())
        if unseen_count:
            print(
                f"  ⚠ Warning: {unseen_count} test examples have labels not seen in training "
                f"and could not be mapped"
            )
    else:
        test_df["label"] = None

    print("\nLabel mapping (raw_label -> normalized_id):")
    for raw, idx in label_to_id.items():
        print(f"  {raw} -> {idx}")

    # Stratified split ensures class distribution is maintained in both sets
    val_pct = round(validation_fraction * 100)
    print(f"\nSplitting training data into train/validation sets ({100 - val_pct}/{val_pct})...")
    train_split_df = train_df[["text", "label"]].reset_index(drop=True)

    train_final_df, val_df = train_test_split(
        train_split_df,
        test_size=validation_fraction,
        random_state=random_seed,
        stratify=train_split_df["label"],
    )

    train_final_df = train_final_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    print(f"  ✓ Training set: {len(train_final_df)} examples")
    print(f"  ✓ Validation set: {len(val_df)} examples")

    # "label" is always present on test_df at this point (set just above).
    final_test_df = test_df[["text", "label"]].reset_index(drop=True)

    # Save processed data for reproducibility and later use
    processed_data_dir = os.path.join(base_directory, "processed_data")
    os.makedirs(processed_data_dir, exist_ok=True)

    train_save_path = os.path.join(processed_data_dir, "train_processed.csv")
    val_save_path = os.path.join(processed_data_dir, "val_processed.csv")
    test_save_path = os.path.join(processed_data_dir, "test_processed.csv")
    label_mapping_path = os.path.join(processed_data_dir, "label_mapping.json")

    print(f"\nSaving processed datasets...")
    train_final_df.to_csv(train_save_path, index=False, encoding="utf-8")
    val_df.to_csv(val_save_path, index=False, encoding="utf-8")
    final_test_df.to_csv(test_save_path, index=False, encoding="utf-8")
    print(f"  ✓ Saved training data: {train_save_path} ({len(train_final_df)} examples)")
    print(f"  ✓ Saved validation data: {val_save_path} ({len(val_df)} examples)")
    print(f"  ✓ Saved test data: {test_save_path} ({len(final_test_df)} examples)")

    # JSON object keys must be strings, so both mappings are written with str keys.
    mapping_data = {
        "label_to_id": {str(k): int(v) for k, v in label_to_id.items()},
        "id_to_label": {str(k): int(v) for k, v in id_to_label.items()},
        "num_classes": int(len(label_to_id))
    }
    with open(label_mapping_path, "w", encoding="utf-8") as f:
        json.dump(mapping_data, f, indent=2)
    print(f"  ✓ Saved label mappings: {label_mapping_path}")

    return MedicalTextDataset(
        train_df=train_final_df,
        val_df=val_df,
        test_df=final_test_df,
        label_to_id=label_to_id,
        id_to_label=id_to_label,
    )


In [None]:
def create_tokenize_function(tokenizer, max_length: int):
    """
    Build a callable that tokenizes the ``"text"`` field of a batch.

    The returned function is meant for Hugging Face ``Dataset.map()``. Every
    sequence is truncated and padded to exactly ``max_length`` tokens so that
    all examples end up with the same length.
    """
    # Fixed once here; the closure reuses the same settings for every batch.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def _tokenize(batch):
        return tokenizer(batch["text"], **tokenizer_options)

    return _tokenize

In [None]:
def compute_metrics(eval_pred):
    """
    Metric callback for the Trainer: accuracy of the arg-max class over the logits.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class for each
    example is the index of the largest logit along the last axis.
    """
    model_logits, reference_labels = eval_pred
    predicted_classes = np.argmax(model_logits, axis=-1)
    return {"accuracy": accuracy_score(reference_labels, predicted_classes)}

In [None]:
def predict_on_test_data(
    model_path: str,
    test_df: pd.DataFrame,
    label_mapping: Dict[int, int],
    max_length: int = 256,
    original_model_name: str = None,
) -> pd.DataFrame:
    """
    Run a fine-tuned classifier over ``test_df["text"]`` and return the predictions.

    Model weights are looked for directly in ``model_path`` first
    (``pytorch_model.bin`` or ``model.safetensors``). If neither is there, the
    ``checkpoint-<step>`` subdirectory with the highest step number is used.
    The tokenizer is loaded from the same location, falling back to
    ``original_model_name`` when that fails.

    ``label_mapping`` is inverted here, so it is expected to map raw labels to
    model class ids. The result is a copy of ``test_df`` with two added columns:
    ``predicted_label_id`` (class id) and ``predicted_label`` (raw label, or
    ``None`` when the id is not in the mapping).

    Raises:
        ValueError: The tokenizer could not be loaded and no fallback name was given.
        OSError: No model weights were found in ``model_path`` or its checkpoints.
    """
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"Making predictions on test data using model: {model_path}")
    print(separator)

    print("Loading trained model...")

    # Locate the weights: base directory first, then checkpoint subdirectories.
    weight_file_names = ("pytorch_model.bin", "model.safetensors")
    resolved_model_path = model_path
    weights_located = any(
        os.path.exists(os.path.join(model_path, file_name)) for file_name in weight_file_names
    )

    if not weights_located:
        checkpoint_names = [
            entry
            for entry in os.listdir(model_path)
            if os.path.isdir(os.path.join(model_path, entry)) and entry.startswith("checkpoint-")
        ]
        if checkpoint_names:
            def _checkpoint_step(dir_name: str) -> int:
                # "checkpoint-1234" -> 1234; a non-numeric suffix ranks as step 0.
                step_text = dir_name.split("-")[1]
                return int(step_text) if step_text.isdigit() else 0

            latest_checkpoint = max(checkpoint_names, key=_checkpoint_step)
            resolved_model_path = os.path.join(model_path, latest_checkpoint)
            print(f"  ⚠ Model not in base directory, found in: {latest_checkpoint}")
            weights_located = True

    try:
        tokenizer = AutoTokenizer.from_pretrained(resolved_model_path)
        print("  ✓ Tokenizer loaded")
    except Exception as load_error:
        if not original_model_name:
            raise ValueError(f"Could not load tokenizer. Error: {load_error}")
        print(f"  ⚠ Tokenizer not found, loading from original: {original_model_name}")
        tokenizer = AutoTokenizer.from_pretrained(original_model_name)

    if not weights_located:
        raise OSError(f"Model files not found in {model_path} or any checkpoint subdirectories")

    model = AutoModelForSequenceClassification.from_pretrained(resolved_model_path)
    print("  ✓ Model loaded")

    # Only the text column is needed for inference.
    test_hf = HFDataset.from_pandas(test_df[["text"]])

    print(f"Tokenizing test data (max_length={max_length})...")
    test_hf = test_hf.map(create_tokenize_function(tokenizer, max_length), batched=True)
    test_hf = test_hf.remove_columns(["text"])
    test_hf.set_format("torch")
    print("  ✓ Tokenization complete")

    print("Making predictions...")
    prediction_output = Trainer(model=model).predict(test_hf)
    predicted_ids = np.argmax(prediction_output.predictions, axis=-1)

    # Map back to original label space
    class_id_to_raw = {class_id: raw_label for raw_label, class_id in label_mapping.items()}

    results_df = test_df.copy()
    results_df["predicted_label_id"] = predicted_ids
    results_df["predicted_label"] = [class_id_to_raw.get(int(class_id)) for class_id in predicted_ids]

    print(f"  ✓ Predictions complete: {len(results_df)} examples")

    return results_df


In [None]:
def train_and_evaluate_model(
    model_name: str,
    dataset: MedicalTextDataset,
    output_directory: str,
    num_epochs: int = 5,
    batch_size: int = 8,
    max_length: int = 256,
    early_stopping_patience: int = 3,
) -> Tuple[float, Dict]:
    """
    Fine-tune a pretrained transformer on the medical text dataset and evaluate it.

    The model is evaluated and checkpointed once per epoch. Training stops early
    when ``eval_accuracy`` has not improved for ``early_stopping_patience``
    evaluations, and the best checkpoint is reloaded at the end. That best model
    and the tokenizer are then written to ``output_directory`` itself, so
    ``predict_on_test_data`` can load them from the base directory rather than
    falling back to the most recent checkpoint.

    Args:
        model_name: Hugging Face Hub id (or local path) of the pretrained model.
        dataset: Splits and label mappings; ``train_df`` and ``val_df`` are used.
        output_directory: Where checkpoints, logs, the final model and the
            tokenizer are written.
        num_epochs: Maximum number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        max_length: Token length every example is truncated/padded to.
        early_stopping_patience: Evaluations without improvement before stopping.

    Returns:
        ``(accuracy, eval_results)`` where ``accuracy`` is the validation
        accuracy of the best model and ``eval_results`` is the full metrics
        dict returned by ``Trainer.evaluate()``.
    """
    print(f"\n{'=' * 80}")
    print(f"Training model: {model_name}")
    print("=" * 80)

    print(f"Loading tokenizer and model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(dataset.label_to_id)
    )
    print(f"  ✓ Model loaded with {len(dataset.label_to_id)} output classes")

    print("\nConverting DataFrames to Hugging Face Datasets...")
    train_hf = HFDataset.from_pandas(dataset.train_df)
    val_hf = HFDataset.from_pandas(dataset.val_df)
    print(f"  ✓ Training dataset: {len(train_hf)} examples")
    print(f"  ✓ Validation dataset: {len(val_hf)} examples")

    print(f"\nTokenizing datasets (max_length={max_length})...")
    tokenize_fn = create_tokenize_function(tokenizer, max_length)
    train_hf = train_hf.map(tokenize_fn, batched=True)
    val_hf = val_hf.map(tokenize_fn, batched=True)
    print("  ✓ Tokenization complete")

    # The raw text is no longer needed once it has been tokenized.
    train_hf = train_hf.remove_columns(["text"])
    val_hf = val_hf.remove_columns(["text"])
    train_hf.set_format("torch")
    val_hf.set_format("torch")

    print(f"\nConfiguring training arguments...")
    training_args = TrainingArguments(
        output_dir=output_directory,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        # Evaluate and checkpoint on the same schedule so the best checkpoint
        # can be reloaded when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        greater_is_better=True,
        logging_steps=50,
        logging_dir=os.path.join(output_directory, "logs"),
        report_to=[],
    )
    print("  ✓ Training arguments configured")

    print("\nInitializing Trainer (with early stopping)...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_hf,
        eval_dataset=val_hf,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
    )
    print("  ✓ Trainer initialized")

    print(f"\n{'=' * 80}")
    print("Starting training...")
    print("=" * 80)
    trainer.train()
    print("  ✓ Training complete")

    print(f"\n{'=' * 80}")
    print("Evaluating model...")
    print("=" * 80)
    eval_results = trainer.evaluate()
    accuracy = float(eval_results.get("eval_accuracy", 0.0))
    print(f"\nEvaluation results for {model_name}:")
    for key, value in eval_results.items():
        print(f"  {key}: {value:.4f}")

    print(f"\n{'=' * 80}")
    print("Generating detailed classification report on validation set...")
    print("=" * 80)
    preds_output = trainer.predict(val_hf)
    preds = np.argmax(preds_output.predictions, axis=-1)
    true_labels = preds_output.label_ids
    print("\nClassification report (per-class metrics):")
    print(classification_report(true_labels, preds, digits=4))

    # The trainer now holds the best checkpoint (load_best_model_at_end=True).
    # Write it to the base output directory; without this only the per-epoch
    # checkpoint-* subdirectories contain weights.
    print(f"\nSaving best model to {output_directory}...")
    trainer.save_model(output_directory)
    print("  ✓ Model saved")

    print(f"\nSaving tokenizer to {output_directory}...")
    tokenizer.save_pretrained(output_directory)
    print("  ✓ Tokenizer saved")

    return accuracy, eval_results

In [None]:
def main():
    """
    End-to-end pipeline: load the data, fine-tune each candidate model, compare them.

    Two general-purpose and two biomedical/clinical BERT variants are trained
    with the same settings, their validation accuracies are tabulated, and the
    best one is reported together with how far it is above the average.
    """
    rule = "=" * 80

    def _banner(title: str, leading_blank: bool = True) -> None:
        # Section header: rule, title, rule (optionally preceded by a blank line).
        print("\n" + rule if leading_blank else rule)
        print(title)
        print(rule)

    _banner("STEP 1: Loading Medical Text Classification Dataset", leading_blank=False)
    dataset = load_medical_text_dataset(
        base_directory=".",
        data_subdirectory="data"
    )
    print(f"\n✓ Dataset loaded successfully")
    print(f"  - Training examples: {len(dataset.train_df)}")
    print(f"  - Validation examples: {len(dataset.val_df)}")
    print(f"  - Test examples: {len(dataset.test_df)} (unlabeled, for prediction)")
    print(f"  - Number of classes: {len(dataset.label_to_id)}")

    _banner("STEP 2: Model Selection")
    model_names = [
        "bert-base-uncased",
        "distilbert-base-uncased",
        "dmis-lab/biobert-v1.1",
        "emilyalsentzer/Bio_ClinicalBERT",
    ]
    print(f"Models to evaluate: {len(model_names)}")
    for position, candidate in enumerate(model_names, start=1):
        print(f"  {position}. {candidate}")

    _banner("STEP 3: Training and Evaluation")
    results_summary: List[Tuple[str, float]] = []
    results_base_dir = "model_results"

    for model_name in model_names:
        # Each model gets its own output folder named after the last path segment of its id.
        model_output_dir = os.path.join(results_base_dir, model_name.split("/")[-1])
        os.makedirs(model_output_dir, exist_ok=True)

        accuracy, _ = train_and_evaluate_model(
            model_name=model_name,
            dataset=dataset,
            output_directory=model_output_dir,
            num_epochs=3,
            batch_size=16,
            max_length=256,
            early_stopping_patience=2,
        )

        results_summary.append((model_name, accuracy))
        print(f"\n✓ Completed: {model_name} - Accuracy: {accuracy:.4f}")

    _banner("STEP 4: Model Comparison Results")
    print(f"{'Model Name':<45} {'Accuracy':>12}")
    print("-" * 59)
    for trained_name, trained_acc in results_summary:
        print(f"{trained_name:<45} {trained_acc:>12.4f}")

    best_model, best_acc = max(results_summary, key=lambda entry: entry[1])
    best_model_dir = os.path.join(results_base_dir, best_model.split('/')[-1])

    # How far the winner sits above the mean accuracy of all candidates.
    avg_accuracy = sum(score for _, score in results_summary) / len(results_summary)
    improvement = best_acc - avg_accuracy
    if avg_accuracy > 0:
        improvement_pct = (improvement / avg_accuracy) * 100
    else:
        improvement_pct = 0

    _banner("BEST MODEL")
    print(f"Model: {best_model}")
    print(f"Validation Accuracy: {best_acc:.4f}")
    print(f"Model directory: {best_model_dir}")
    print(f"\nWhy this model is the best:")
    print(f"  • Highest validation accuracy: {best_acc:.4f}")
    print(f"  • {improvement_pct:.2f}% better than average ({avg_accuracy:.4f})")
    print(f"  • {improvement:.4f} absolute improvement over average")

    best_model_lower = best_model.lower()
    if "biobert" in best_model_lower or "clinical" in best_model_lower:
        rationale_lines = [
            f"  • This model was pre-trained on biomedical/clinical text,",
            f"    making it well-suited for medical text classification tasks",
        ]
    elif "distilbert" in best_model_lower:
        rationale_lines = [f"  • This is a smaller, faster model that achieved competitive performance"]
    else:
        rationale_lines = [f"  • This is a general-purpose BERT model that performed best on this task"]
    for rationale in rationale_lines:
        print(rationale)

    print(f"\nAll models have been saved to: {results_base_dir}")


if __name__ == "__main__":
    # Report which device training will use before starting the pipeline.
    if torch.cuda.is_available():
        print("=" * 80)
        print("CUDA is available. Training will run on GPU.")
        print("=" * 80)
    else:
        print("=" * 80)
        print("WARNING: CUDA is not available.")
        print("Training will run on CPU and may be slow.")
        print("=" * 80)

    main()

CUDA is available. Training will run on GPU.
STEP 1: Loading Medical Text Classification Dataset
Loading training data from: ./data/train.dat
  ✓ Loaded 14438 training examples
Loading test data from: ./data/test.dat
  ✓ Loaded 14442 test examples
  ⚠ Note: Test file has no labels (unlabeled data for prediction)

Unique raw labels found in training data: [1, 2, 3, 4, 5]
Total number of classes: 5

Label mapping (raw_label -> normalized_id):
  1 -> 0
  2 -> 1
  3 -> 2
  4 -> 3
  5 -> 4

Splitting training data into train/validation sets (80/20)...
  ✓ Training set: 11550 examples
  ✓ Validation set: 2888 examples

Saving processed datasets (maintaining order)...
  ✓ Saved training data: ./processed_data/train_processed.csv (11550 examples)
  ✓ Saved validation data: ./processed_data/val_processed.csv (2888 examples)
  ✓ Saved test data: ./processed_data/test_processed.csv (14442 examples)
  ✓ Saved label mappings: ./processed_data/label_mapping.json

✓ Dataset loaded successfully
  - Tr

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded with 5 output classes

Converting DataFrames to Hugging Face Datasets...
  ✓ Training dataset: 11550 examples
  ✓ Validation dataset: 2888 examples (for evaluation)

Tokenizing datasets (max_length=256)...


Map:   0%|          | 0/11550 [00:00<?, ? examples/s]

Map:   0%|          | 0/2888 [00:00<?, ? examples/s]

  ✓ Tokenization complete

Configuring training arguments...
  ✓ Training arguments configured

Initializing Trainer (with early stopping)...
  ✓ Trainer initialized

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8984,0.864698,0.635042
2,0.8294,0.850733,0.644391
3,0.7121,0.859484,0.64162


  ✓ Training complete

Evaluating model...



Evaluation results for bert-base-uncased:
  eval_loss: 0.8507
  eval_accuracy: 0.6444
  eval_runtime: 37.6585
  eval_samples_per_second: 76.6890
  eval_steps_per_second: 4.8060
  epoch: 3.0000

Generating detailed classification report on validation set...

Classification report (per-class metrics):
              precision    recall  f1-score   support

           0     0.7171    0.8610    0.7825       633
           1     0.5178    0.7291    0.6056       299
           2     0.6142    0.5169    0.5614       385
           3     0.6741    0.7967    0.7303       610
           4     0.6239    0.4298    0.5089       961

    accuracy                         0.6444      2888
   macro avg     0.6294    0.6667    0.6377      2888
weighted avg     0.6426    0.6444    0.6326      2888


Saving tokenizer to model_results/bert-base-uncased...
  ✓ Tokenizer saved

✓ Completed: bert-base-uncased - Accuracy: 0.6444

Training model: distilbert-base-uncased
Loading tokenizer and model: distilbert-b

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded with 5 output classes

Converting DataFrames to Hugging Face Datasets...
  ✓ Training dataset: 11550 examples
  ✓ Validation dataset: 2888 examples (for evaluation)

Tokenizing datasets (max_length=256)...


Map:   0%|          | 0/11550 [00:00<?, ? examples/s]

Map:   0%|          | 0/2888 [00:00<?, ? examples/s]

  ✓ Tokenization complete

Configuring training arguments...
  ✓ Training arguments configured

Initializing Trainer (with early stopping)...
  ✓ Trainer initialized

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9143,0.880819,0.633657
2,0.8574,0.857533,0.638158
3,0.7541,0.866073,0.630194


  ✓ Training complete

Evaluating model...



Evaluation results for distilbert-base-uncased:
  eval_loss: 0.8575
  eval_accuracy: 0.6382
  eval_runtime: 18.5379
  eval_samples_per_second: 155.7890
  eval_steps_per_second: 9.7640
  epoch: 3.0000

Generating detailed classification report on validation set...

Classification report (per-class metrics):
              precision    recall  f1-score   support

           0     0.7191    0.8452    0.7771       633
           1     0.5309    0.6890    0.5997       299
           2     0.5945    0.5065    0.5470       385
           3     0.6618    0.8148    0.7303       610
           4     0.6056    0.4266    0.5006       961

    accuracy                         0.6382      2888
   macro avg     0.6224    0.6564    0.6309      2888
weighted avg     0.6331    0.6382    0.6262      2888


Saving tokenizer to model_results/distilbert-base-uncased...
  ✓ Tokenizer saved

✓ Completed: distilbert-base-uncased - Accuracy: 0.6382

Training model: dmis-lab/biobert-v1.1
Loading tokenizer and mo

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded with 5 output classes

Converting DataFrames to Hugging Face Datasets...
  ✓ Training dataset: 11550 examples
  ✓ Validation dataset: 2888 examples (for evaluation)

Tokenizing datasets (max_length=256)...


Map:   0%|          | 0/11550 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Map:   0%|          | 0/2888 [00:00<?, ? examples/s]

  ✓ Tokenization complete

Configuring training arguments...
  ✓ Training arguments configured

Initializing Trainer (with early stopping)...
  ✓ Trainer initialized

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.816,0.805191,0.653393
2,0.7495,0.80461,0.642659
3,0.651,0.833556,0.636773


  ✓ Training complete

Evaluating model...



Evaluation results for dmis-lab/biobert-v1.1:
  eval_loss: 0.8052
  eval_accuracy: 0.6534
  eval_runtime: 37.7689
  eval_samples_per_second: 76.4650
  eval_steps_per_second: 4.7920
  epoch: 3.0000

Generating detailed classification report on validation set...

Classification report (per-class metrics):
              precision    recall  f1-score   support

           0     0.7237    0.8483    0.7811       633
           1     0.5580    0.7726    0.6480       299
           2     0.5700    0.6130    0.5907       385
           3     0.6719    0.8361    0.7451       610
           4     0.6673    0.3881    0.4908       961

    accuracy                         0.6534      2888
   macro avg     0.6382    0.6916    0.6511      2888
weighted avg     0.6563    0.6534    0.6377      2888


Saving tokenizer to model_results/biobert-v1.1...
  ✓ Tokenizer saved

✓ Completed: dmis-lab/biobert-v1.1 - Accuracy: 0.6534

Training model: emilyalsentzer/Bio_ClinicalBERT
Loading tokenizer and model: e

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded with 5 output classes

Converting DataFrames to Hugging Face Datasets...
  ✓ Training dataset: 11550 examples
  ✓ Validation dataset: 2888 examples (for evaluation)

Tokenizing datasets (max_length=256)...


Map:   0%|          | 0/11550 [00:00<?, ? examples/s]

Map:   0%|          | 0/2888 [00:00<?, ? examples/s]

  ✓ Tokenization complete

Configuring training arguments...
  ✓ Training arguments configured

Initializing Trainer (with early stopping)...
  ✓ Trainer initialized

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8817,0.821043,0.652701
2,0.7779,0.826772,0.640928
3,0.6732,0.840457,0.637119


  ✓ Training complete

Evaluating model...



Evaluation results for emilyalsentzer/Bio_ClinicalBERT:
  eval_loss: 0.8210
  eval_accuracy: 0.6527
  eval_runtime: 37.9181
  eval_samples_per_second: 76.1640
  eval_steps_per_second: 4.7730
  epoch: 3.0000

Generating detailed classification report on validation set...

Classification report (per-class metrics):
              precision    recall  f1-score   support

           0     0.7266    0.8357    0.7774       633
           1     0.5365    0.7625    0.6298       299
           2     0.5773    0.6597    0.6158       385
           3     0.6839    0.8230    0.7470       610
           4     0.6631    0.3871    0.4888       961

    accuracy                         0.6527      2888
   macro avg     0.6375    0.6936    0.6518      2888
weighted avg     0.6569    0.6527    0.6381      2888


Saving tokenizer to model_results/Bio_ClinicalBERT...
  ✓ Tokenizer saved

✓ Completed: emilyalsentzer/Bio_ClinicalBERT - Accuracy: 0.6527

STEP 4: Model Comparison Results
Model Name           