In [4]:
import pandas as pd
import json
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import matplotlib.pyplot as plt
import os
from datetime import datetime

if torch.cuda.is_available():
    print(f"GPU found, using: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("GPU not found")
    device = torch.device("cpu")

GPU found, using: NVIDIA GeForce RTX 3070


In [5]:
# MODEL_NAME = 'distilbert-base-uncased'
MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'
CSV = '../../Data/Specialty-Data/specialty_data.csv'
MAPPINGS = '../../Data/Specialty-Data/specialty_data_label_mappings.json'

current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ROOT_OUTPUT_DIR = f"./Saved-Models/{MODEL_NAME}/training_run_{current_time}"

TRAINING_OUTPUT_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'results')
MODEL_FINAL_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'final_model')
LOGGING_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'logs')

os.makedirs(TRAINING_OUTPUT_DIRECTORY, exist_ok=True)
os.makedirs(MODEL_FINAL_DIRECTORY, exist_ok=True)
os.makedirs(LOGGING_DIRECTORY, exist_ok=True)

print(f"All outputs will be saved to: {ROOT_OUTPUT_DIR}")

def tokenize_function(examples, tokenizer):
    return tokenizer(
        examples['transcription'], 
        padding="max_length", 
        truncation=True,
        max_length=512
    )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    f1 = f1_score(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    
    return {"accuracy": acc, "f1": f1}

All outputs will be saved to: ./Saved-Models/emilyalsentzer/Bio_ClinicalBERT/training_run_2025-11-08_17-11-59


In [6]:
try:
    df = pd.read_csv(CSV)
    with open(MAPPINGS, 'r') as f:
        specialty_and_id_map = json.load(f)
except:
    print(f"Data not found, make sure to run the specialty_data_preprocessing.ipynb file in its entirety to retrieve the data")

label_to_id = specialty_and_id_map['label_to_id']
id_to_label = {int(k): v for k, v in specialty_and_id_map['id_to_label'].items()}

total_specialties = len(label_to_id)

In [7]:
# 80% Train, 10% Validation, 10% Test
train_df, test_val_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df['label'], 
    random_state=0
)

val_df, test_df = train_test_split(
    test_val_df, 
    test_size=0.5, 
    stratify=test_val_df['label'], 
    random_state=0
)

ds = DatasetDict({
    'train': Dataset.from_pandas(train_df.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_df.reset_index(drop=True)),
    'test': Dataset.from_pandas(test_df.reset_index(drop=True))
})

ds

DatasetDict({
    train: Dataset({
        features: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords', 'label'],
        num_rows: 2304
    })
    validation: Dataset({
        features: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords', 'label'],
        num_rows: 288
    })
    test: Dataset({
        features: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords', 'label'],
        num_rows: 288
    })
})

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenized_ds = ds.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

tokenized_ds = tokenized_ds.remove_columns(['transcription', 'medical_specialty'])

tokenized_ds

Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['description', 'sample_name', 'keywords', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2304
    })
    validation: Dataset({
        features: ['description', 'sample_name', 'keywords', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 288
    })
    test: Dataset({
        features: ['description', 'sample_name', 'keywords', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 288
    })
})

In [None]:
# Include this in the model for Bio_ClinicalBERT to force the use of safetensors vs using insecure load
# May not be necessary on other models
# use_safetensors=True,

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    use_safetensors=True,
    num_labels=total_specialties,
    id2label=id_to_label,
    label2id=label_to_id
)

training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT_DIRECTORY,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGGING_DIRECTORY,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [10]:
print(f"Training Model")
trainer.train()
print(f"Training Complete")

print("-------------------------------------")

print(f"Evaluating on validation dataset")
test_results = trainer.evaluate(tokenized_ds["test"])

print(f"Validation results")
print(test_results)

with open(f"{TRAINING_OUTPUT_DIRECTORY}/test_results.json", 'w') as f:
    json.dump(test_results, f, indent=4)

print(f"Saving final model to {MODEL_FINAL_DIRECTORY}")
trainer.save_model(MODEL_FINAL_DIRECTORY)
print(f"Model saved")

Training Model


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.9002,1.596799,0.493056,0.353606
2,1.2827,1.097798,0.489583,0.458143
3,1.0372,1.088696,0.482639,0.432584


Training Complete
-------------------------------------
Evaluating on validation dataset


Validation results
{'eval_loss': 1.1599992513656616, 'eval_accuracy': 0.4791666666666667, 'eval_f1': 0.4499210145571289, 'eval_runtime': 3.7312, 'eval_samples_per_second': 77.187, 'eval_steps_per_second': 4.824, 'epoch': 3.0}
Saving final model to ./Saved-Models/emilyalsentzer/Bio_ClinicalBERT/training_run_2025-11-08_17-11-59\final_model
Model saved
