In [1]:
import pandas as pd
import json
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
from datetime import datetime

if torch.cuda.is_available():
    print(f"GPU found, using: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("GPU not found")
    device = torch.device("cpu")

GPU found, using: NVIDIA GeForce RTX 3070


In [None]:
# Take a look at a few different pretrained hugging face models and set up output directories
# MODEL_NAME = 'distilbert-base-uncased'
# MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'
# Small guy
MODEL_NAME = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
# Big fella
# Crashed GPU :(
# MODEL_NAME = 'microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract'
CSV = '../../Data/Specialty-Data/specialty_data.csv'
MAPPINGS = '../../Data/Specialty-Data/specialty_data_label_mappings.json'
WEIGHTS = '../../Data/Specialty-Data/specialty_data_class_weights.json'

current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ROOT_OUTPUT_DIR = f"./Saved-Models/BERT/{MODEL_NAME}/training_run_{current_time}"

TRAINING_OUTPUT_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'results')
MODEL_FINAL_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'final_model')
LOGGING_DIRECTORY = os.path.join(ROOT_OUTPUT_DIR, 'logs')

os.makedirs(TRAINING_OUTPUT_DIRECTORY, exist_ok=True)
os.makedirs(MODEL_FINAL_DIRECTORY, exist_ok=True)
os.makedirs(LOGGING_DIRECTORY, exist_ok=True)

print(f"All outputs will be saved to: {ROOT_OUTPUT_DIR}")

# These functions will be used for the BERT models
def tokenize_function(examples, tokenizer):
    return tokenizer(
        examples['transcription'], 
        padding="max_length", 
        truncation=True,
        max_length=512
    )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predictions)    
    f1 = f1_score(labels, predictions, average="weighted")
    
    return {"accuracy": acc, "f1": f1}

All outputs will be saved to: ./Saved-Models/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/training_run_2025-11-09_20-14-38


In [3]:
# Retrieve the data
try:
    raw_df = pd.read_csv(CSV)
    with open(MAPPINGS, 'r') as f:
        specialty_and_id_map = json.load(f)
    with open(WEIGHTS, 'r') as f:
        class_weights = json.load(f)
except:
    print(f"Data not found, make sure to run the specialty_data_preprocessing.ipynb file in its entirety to retrieve the data")

# Retrieve labels
label_to_id = specialty_and_id_map['label_to_id']
id_to_label = {int(k): v for k, v in specialty_and_id_map['id_to_label'].items()}

total_specialties = len(label_to_id)

# Format dataframe for model
df = raw_df[['transcription', 'medical_specialty', 'label']].dropna()

df

Unnamed: 0,transcription,medical_specialty,label
0,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,0
1,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,0
2,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,0
3,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,0
4,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,0
...,...,...,...
1258,"EXAM: , Left heart cath, selective coronary an...",Cardiovascular / Pulmonary,0
1259,"INDICATION:, Acute coronary syndrome.,CONSENT...",Cardiovascular / Pulmonary,0
1260,"ANGINA, is chest pain due to a lack of oxygen ...",Cardiovascular / Pulmonary,0
1261,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden...",Cardiovascular / Pulmonary,0


In [4]:
# 80% Train, 10% Validation, 10% Test
train_df, test_val_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df['label'], 
    random_state=0
)

val_df, test_df = train_test_split(
    test_val_df, 
    test_size=0.5, 
    stratify=test_val_df['label'], 
    random_state=0
)

ds = DatasetDict({
    'train': Dataset.from_pandas(train_df.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_df.reset_index(drop=True)),
    'test': Dataset.from_pandas(test_df.reset_index(drop=True))
})

ds

DatasetDict({
    train: Dataset({
        features: ['transcription', 'medical_specialty', 'label'],
        num_rows: 1010
    })
    validation: Dataset({
        features: ['transcription', 'medical_specialty', 'label'],
        num_rows: 126
    })
    test: Dataset({
        features: ['transcription', 'medical_specialty', 'label'],
        num_rows: 127
    })
})

In [5]:
# Create tokenizer based on transcriptions

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenized_ds = ds.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

tokenized_ds = tokenized_ds.remove_columns(['transcription', 'medical_specialty'])

tokenized_ds

KeyboardInterrupt: 

In [None]:
# Include the below input in the model for Bio_ClinicalBERT to force the use of safetensors vs using insecure load
# May not be necessary on other models
# use_safetensors=True,

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    use_safetensors=True,
    num_labels=total_specialties,
    id2label=id_to_label,
    label2id=label_to_id
)

# 1e-5 too slow for meaningful improvements and 3e-5 is too fast
# Epochs appear to level out after approximately 8, reached peak of 30 epochs
training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT_DIRECTORY,
    num_train_epochs=8,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOGGING_DIRECTORY,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

# Can use either default Trainer or create Trainer that punishes the model based on the weights of its mistakes
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Either use default Trainer or WeightedTrainer, same inputs
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# This is where the training is conducted, can retrieve the final model from the printed directory
# They're pretty heavy on storage coming out to 5 gigs a piece
print(f"Training Model")
trainer.train()
print(f"Training Complete")

print("-------------------------------------")

print(f"Evaluating on validation dataset")
test_results = trainer.evaluate(tokenized_ds["test"])

print(f"Validation results")
print(test_results)

with open(f"{TRAINING_OUTPUT_DIRECTORY}/test_results.json", 'w') as f:
    json.dump(test_results, f, indent=4)

print(f"Saving final model to {MODEL_FINAL_DIRECTORY}")
trainer.save_model(MODEL_FINAL_DIRECTORY)
print(f"Model saved")

Training Model


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6116,1.528371,0.301587,0.225263
2,1.5062,1.268786,0.563492,0.495144
3,1.3021,0.788774,0.801587,0.797971
4,0.7002,0.566705,0.809524,0.799708
5,0.5484,0.616515,0.714286,0.715712
6,0.4581,0.499985,0.769841,0.768132
7,0.4391,0.590941,0.698413,0.697816
8,0.3528,0.53494,0.761905,0.760358
9,0.361,0.561345,0.746032,0.749615
10,0.3661,0.606778,0.738095,0.725029


Training Complete
-------------------------------------
Evaluating on validation dataset


Validation results
{'eval_loss': 0.6727104783058167, 'eval_accuracy': 0.7637795275590551, 'eval_f1': 0.7610530565328271, 'eval_runtime': 11.4727, 'eval_samples_per_second': 11.07, 'eval_steps_per_second': 0.697, 'epoch': 30.0}
Saving final model to ./Saved-Models/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/training_run_2025-11-08_21-35-00\final_model
Model saved
