# Task 3: Pre-trained transformers

## Aim
In this task, the aim is to train different algorithm to be able to classify correctly our medical transcritped notes. Classifcations are labels directly extracted from argilla dataset, as shown in task 1 (e.g. surgery, orthopedics, ...)

In [2]:
import numpy as np
import sklearn
import matplotlib
import transformers
import pandas as pd
import tqdm
import torch
import spacy
import nltk
import evaluate


spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


## 1. Dataset import

We re-use code from task 1 to import our argilla dataset, where we will only keep the text and the labels.

In [3]:
import sys, site
print(sys.executable)
print("USER_SITE:", site.getusersitepackages())
print("sys.path[0:5]:", sys.path[:5])

/usr/bin/python3
USER_SITE: /root/.local/lib/python3.12/site-packages
sys.path[0:5]: ['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload']


In [4]:

pd.set_option('display.max_colwidth', 200)

df = pd.read_parquet("hf://datasets/argilla/medical-domain/data/train-00000-of-00001-67e4e7207342a623.parquet")

def extract_label(pred):
    if isinstance(pred, (list, np.ndarray)) and len(pred) > 0 and isinstance(pred[0], dict):
        return pred[0].get("label")
    return None

df['label'] = df['prediction'].apply(extract_label)
df['text_length'] = df['metrics'].apply(lambda x: x.get('text_length') if isinstance(x, dict) else None)

# drop empty columns
df = df.drop(columns=['inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'metadata', 'status', 'event_timestamp', 'metrics'], errors='ignore')

#print(df.head)
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,text,id,label,text_length
0,"PREOPERATIVE DIAGNOSIS:, Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:, Diverticulosis.,PROCEDURE:, Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope w...",00001265-03e2-47b2-b6cf-bed32dad2fa9,Gastroenterology,1085
1,"CLINICAL INDICATION: ,Normal stress test.,PROCEDURES PERFORMED:,1. Left heart cath.,2. Selective coronary angiography.,3. LV gram.,4. Right femoral arteriogram.,5. Mynx closure device.,PROCE...",0007edf0-1413-4b16-8212-3a13c2ab4e43,Surgery,1798
2,"FINDINGS:,Axial scans were performed from L1 to S2 and reformatted images were obtained in the sagittal and coronal planes.,Preliminary scout film demonstrates anterior end plate spondylosis at T1...",00097d1e-1357-4447-a39a-fe8f8b7c36ae,Radiology,1141
3,"PREOPERATIVE DIAGNOSIS: , Blood loss anemia.,POSTOPERATIVE DIAGNOSES:,1. Diverticulosis coli.,2. Internal hemorrhoids.,3. Poor prep.,PROCEDURE PERFORMED:, Colonoscopy with photos.,ANESTHESIA: ...",001622b6-0182-4fee-9881-ae15e81ce836,Surgery,1767
4,"REASON FOR VISIT: ,Elevated PSA with nocturia and occasional daytime frequency.,HISTORY: , A 68-year-old male with a history of frequency and some outlet obstructive issues along with irritative ...",0029245f-8b45-4796-ba09-7760612289c6,SOAP / Chart / Progress Notes,1519


## 2. Baseline ML algorithms

We will try the 3 propopsed algorithms ( linear regression, linear SVM and XGboost) and pick the best performing one.

In [5]:
###################################
#0. Split data set into train/test
#################################
# This code is inspired from : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split
X=df["text"]
y=df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y) # I split the text : 80% training, 20% test

# Comment from marc
############################
# maybe we could include: train_test_split(stratify=y), this makes sure that each train and test appear in roughly the same proportions
############################

############################
# 1. TF-IFD
############################

# Using sklearn TfidfVectorizer, we can directly pre-processed our text:
# - everything in lowercase
# - tokenize words
# - every feature of same length

# We finally return the inverse frequency of each token according to all documents.

## This code is adapted from https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents="unicode", # I want to strip all accents
                             lowercase=True,  # I want everything lowercase
                             stop_words="english", # I want to delete common stop words in english
                             min_df=5,  # I want words to be at least in 5 documents
                             max_df=0.8, # very frequent words are not useful to distinguish between documents
                             ngram_range=(1,2) # include also pairs of words
                             )

# Comment from marc
############################
# maybe we could include: TfidfVectorizer(ngram_range=(1,2)), include also pairs of words
############################

X_train = vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test) # I transform X_test according to X_train frequency per document over apperance in every documents

### 2.2 Linear SVM

In [62]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

SVM=LinearSVC(random_state=0, tol=1e-5,class_weight="balanced")
SVM.fit(X_train,y_train)

SVM.score(X_test,y_test) # Accuracy

f1_score_macro_SVM=f1_score(y_test, SVM.predict(X_test), average='macro') # Macro F_1 score -->"harmonic mean of the precision and recall" https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
print("F1 score macro SVM: ",f1_score_macro_SVM)

# Comment from marc
############################
# the two changes i suggested improve the f1 score from 0.164 to 0.215
############################

F1 score macro SVM:  0.21555928764956586


### 2.3 Logistic regression

In [63]:
from sklearn.linear_model import LogisticRegression

LR=LogisticRegression(random_state=0, tol=1e-5,class_weight="balanced") # we have 40 categories, but some are over-represented. Therefore, we balanced
                                                                 # weights according to their initial frequency in training set
LR.fit(X_train,y_train)

LR.score(X_test,y_test)

f1_score_macro_LR=f1_score(y_test, LR.predict(X_test), average='macro')
print("F1 score macro LR: ",f1_score_macro_LR)

# Comment from marc
############################
# the two changes i suggested improve the f1 score from 0.394 to 0.397
############################

F1 score macro LR:  0.39784195329246


### 2.4 XGBoost

Considering the high dimensionality of our data , XGboost takes too much time to run and SVM or LR are already strong baseline ML algorithm to compare our transformers to.

## 3. Encoder task

#### Model specification

We decided to use MedBERT Model. This is an encoder transformer, pre-trained for  NER. We will use it for classifcation task.

In [64]:
import sys, torch
print("python:", sys.executable)
print("torch version:", torch.__version__)
print("torch file:", torch.__file__)

python: /usr/bin/python3
torch version: 2.9.0+cu126
torch file: /usr/local/lib/python3.12/dist-packages/torch/__init__.py


In [65]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed

set_seed(42)

tokenizer = AutoTokenizer.from_pretrained("Charangan/MedBERT")
model = AutoModelForSequenceClassification.from_pretrained("Charangan/MedBERT",num_labels=40)

# This code is adapted from https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com

# I am freezing the encoder, but allowing to update weights of the classification head
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Charangan/MedBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### dataset formatting

In [66]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

dataset = dataset.select_columns(['text', 'label'])

# Labels are string, I need to change them as numbers.
labels=dataset.unique("label")

# I create a dictionnary that take label as key and return a value.
# I HAVE ASKED CHATGPT TO WRITE THE DICTIONNARY, AS IT JUST REPETITIVE AND LONG

label2id = {
    "Gastroenterology": 0,
    "Surgery": 1,
    "Radiology": 2,
    "SOAP / Chart / Progress Notes": 3,
    "Letters": 4,
    "Lab Medicine - Pathology": 5,
    "Consult - History and Phy.": 6,
    "Podiatry": 7,
    "General Medicine": 8,
    "Psychiatry / Psychology": 9,
    "Cardiovascular / Pulmonary": 10,
    "Urology": 11,
    "Ophthalmology": 12,
    "Physical Medicine - Rehab": 13,
    "Neurology": 14,
    "Autopsy": 15,
    "Orthopedic": 16,
    "Hematology - Oncology": 17,
    "Allergy / Immunology": 18,
    "Pediatrics - Neonatal": 19,
    "Dentistry": 20,
    "Neurosurgery": 21,
    "Pain Management": 22,
    "Nephrology": 23,
    "Emergency Room Reports": 24,
    "Obstetrics / Gynecology": 25,
    "Speech - Language": 26,
    "Diets and Nutritions": 27,
    "Endocrinology": 28,
    "IME-QME-Work Comp etc.": 29,
    "Cosmetic / Plastic Surgery": 30,
    "Discharge Summary": 31,
    "ENT - Otolaryngology": 32,
    "Chiropractic": 33,
    "Office Notes": 34,
    "Dermatology": 35,
    "Sleep Medicine": 36,
    "Rheumatology": 37,
    "Hospice - Palliative Care": 38,
    "Bariatrics": 39,
}

# function for matching key to values
# Map will gives me one row of my dataset, into a dictionnary form.
# So i want to :
# 1) extract label value from dictionnary
# 2) replace it using my dictionnary with a numerical value
def matching(example):
    label=example["label"].strip() # labels have a whitespace as first character, that i strip
    example["label"]=label2id[label]
    return example

dataset=dataset.map(matching)


Map:   0%|          | 0/4966 [00:00<?, ? examples/s]

In [67]:
from datasets import load_dataset
from datasets import ClassLabel
from datasets import DatasetDict

dataset = dataset.rename_column("label", "labels") # for trainer wrappers, i need to rename label as labels

# Comment from marc
############################
# again we could include the stratification, add this line: dataset = dataset.cast_column("labels", ClassLabel(num_classes=40)) and this: train_test_split(stratify_by_column="labels")
# including stratification imporves the f1 score before finetuning from 0.02 to 0.25
############################

dataset = dataset.cast_column("labels", ClassLabel(num_classes=40))

#final_df=dataset.train_test_split(test_size=0.2) # 80/20 split
#final_df=dataset.train_test_split(test_size=0.2, stratify_by_column="labels") # 80/20 split

# train/test=80/20
splits = dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="labels",
    seed=42
)

# split train into train/validation
train_val = splits["train"].train_test_split(
    test_size=0.125,   # 12.5% of 80% = 10% overall
    stratify_by_column="labels",
    seed=42
)

# single DatasetDict with 3 splits 70/10/20=train/val/test
final_df = DatasetDict({
    "train": train_val["train"],
    "validation": train_val["test"],
    "test": splits["test"]
})

### Now, we need to tokenize our data set. Adapted from: https://huggingface.co/docs/datasets/use_dataset

def tokenization(example):
    return tokenizer(example["text"], truncation=True, max_length=512) # i will truncate every exmaple that are longer than 512  token. This is
                                                                       # the max input size of our model

final_df_tokenized = final_df.map(tokenization, batched=True)


final_df_tokenized.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

Casting the dataset:   0%|          | 0/4966 [00:00<?, ? examples/s]

Map:   0%|          | 0/3475 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/994 [00:00<?, ? examples/s]

In [68]:
print(final_df)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3475
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 497
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 994
    })
})


#### Define testing metrics (accuracy, f1 macro)

In [69]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

#### training arguments

In [70]:
# This code is adapted from : https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com
from transformers import TrainingArguments
from transformers import set_seed

set_seed(42)

training_args = TrainingArguments(
    output_dir='.',          # output directory
    num_train_epochs=3,              # total # of training epochs --> small, as we only train the head
    per_device_train_batch_size=8,  # batch size per device during training --> small, as i run that on CPU only architecture
    per_device_eval_batch_size=16,   # batch size for evaluation --> small, as i run that on CPU only architecture
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    push_to_hub=False, #True
    data_seed=42,
    seed=42,
)


#### training loop

In [71]:
# This code is adapted from : https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com
from transformers import Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=final_df_tokenized["train"],         # training dataset
    eval_dataset= final_df_tokenized["validation"],          # evaluation dataset
    data_collator=data_collator, # allows dynamical padding --> every batch will have the same lenghts, which is max_length of this batch
    compute_metrics=compute_metrics # added to return f1
)

trainer.train()

Step,Training Loss
500,3.1343
1000,2.85


TrainOutput(global_step=1305, training_loss=2.9637512440882423, metrics={'train_runtime': 136.3884, 'train_samples_per_second': 76.436, 'train_steps_per_second': 9.568, 'total_flos': 2743868604211200.0, 'train_loss': 2.9637512440882423, 'epoch': 3.0})

#### Evalute accuracy

In [72]:
trainer.evaluate(eval_dataset=final_df_tokenized["test"])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.835954189300537,
 'eval_accuracy': 0.26156941649899396,
 'eval_f1': 0.01907473864877516,
 'eval_precision': 0.020719696969696964,
 'eval_recall': 0.036430318145380045,
 'eval_runtime': 8.182,
 'eval_samples_per_second': 121.486,
 'eval_steps_per_second': 7.7,
 'epoch': 3.0}

### Fine-tunning

We will fine-tune our model based on: https://huggingface.co/learn/llm-course/en/chapter3/3

In [73]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed

set_seed(42)

tokenizer = AutoTokenizer.from_pretrained("Charangan/MedBERT")
model = AutoModelForSequenceClassification.from_pretrained("Charangan/MedBERT",num_labels=40)

# This code is adapted from https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com

# I am defreezing entire encoder
for param in model.base_model.parameters():
    param.requires_grad = True # I allow the weights to be updated

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Charangan/MedBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# This code is adapted from : https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com
from transformers import TrainingArguments



training_args = TrainingArguments(
    output_dir='.',          # output directory
    num_train_epochs=3,              # total # of training epochs --> small, as we only train the head
    per_device_train_batch_size=8,  # batch size per device during training --> small, as i run that on CPU only architecture
    per_device_eval_batch_size=16,   # batch size for evaluation --> small, as i run that on CPU only architecture
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    fp16=True,                       # Enable mixed precision
    data_seed=42,
    seed=42,
)


In [75]:
# This code is adapted from : https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com
from transformers import Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=final_df_tokenized["train"],         # training dataset
    eval_dataset= final_df_tokenized["validation"],          # evaluation dataset
    data_collator=data_collator, # allows dynamical padding --> every batch will have the same lenghts, which is max_length of this batch
    compute_metrics=compute_metrics # added to return f1
)

trainer.train()

Step,Training Loss
500,2.4444
1000,1.7288


TrainOutput(global_step=1305, training_loss=1.954340301162895, metrics={'train_runtime': 505.3225, 'train_samples_per_second': 20.63, 'train_steps_per_second': 2.583, 'total_flos': 2743868604211200.0, 'train_loss': 1.954340301162895, 'epoch': 3.0})

In [76]:
trainer.evaluate(eval_dataset=final_df_tokenized["test"])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.7114449739456177,
 'eval_accuracy': 0.2857142857142857,
 'eval_f1': 0.1493887051561349,
 'eval_precision': 0.16005211357057378,
 'eval_recall': 0.15108835364174664,
 'eval_runtime': 8.1868,
 'eval_samples_per_second': 121.415,
 'eval_steps_per_second': 7.695,
 'epoch': 3.0}

## Adapting loss function

**STILL NEED TO BE FINISHED**

We will adapt our loss function to penalize harder when we mistake labels of low frequencies labels. With this, we want to counterbalance that our dataset is heavily biased towards surgery.

We adapted this code, from hugging face forum, to handle this task: https://discuss.huggingface.co/t/create-a-weighted-loss-function-to-handle-imbalance/138178/3?utm_source=chatgpt.com

In [None]:
# custom loss
#from torch import nn

#loss = nn.CrossEntropyLoss()
#def nll_loss(logits, labels):
    #return loss(logits, labels)

# subclass trainer
#class CustomTrainer(Trainer):
    #def compute_loss(self, model, inputs, return_outputs=False,**kwargs):
        #labels = inputs.pop("labels")
        #outputs = model(**inputs)
        #logits = outputs.logits
        #loss = nll_loss(logits, labels)

        #return (loss, outputs) if return_outputs else loss

IndentationError: unexpected indent (ipython-input-2817068053.py, line 6)

In [None]:
# This code is adapted from : https://huggingface.co/transformers/v4.2.2/training.html?utm_source=chatgpt.com
#from transformers import Trainer
#from transformers import DataCollatorWithPadding, set_seed

#set_seed(42)
#model = AutoModelForSequenceClassification.from_pretrained("Charangan/MedBERT", num_labels=40) # refresh model
#for p in model.base_model.parameters():
    #p.requires_grad = True  # or False, depending what you're testing

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#trainer = CustomTrainer(
    #model=model,                         # the instantiated 🤗 Transformers model to be trained
    #args=training_args,                  # training arguments, defined above
    #train_dataset=final_df_tokenized["train"],         # training dataset
    #eval_dataset= final_df_tokenized["validation"],          # evaluation dataset
    #data_collator=data_collator, # allows dynamical padding --> every batch will have the same lenghts, which is max_length of this batch
    #compute_metrics=compute_metrics # added to return f1

#)

#trainer.train()

In [None]:
#trainer.evaluate(eval_dataset=final_df_tokenized["test"])

# Evaluate prompting

## Zero-shot prompting

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

set_seed(42)

model_id = "Qwen/Qwen2.5-0.5B-Instruct"   # only decoder that worked for me

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [15]:
print(label2id)

{'Gastroenterology': 0, 'Surgery': 1, 'Radiology': 2, 'SOAP / Chart / Progress Notes': 3, 'Letters': 4, 'Lab Medicine - Pathology': 5, 'Consult - History and Phy.': 6, 'Podiatry': 7, 'General Medicine': 8, 'Psychiatry / Psychology': 9, 'Cardiovascular / Pulmonary': 10, 'Urology': 11, 'Ophthalmology': 12, 'Physical Medicine - Rehab': 13, 'Neurology': 14, 'Autopsy': 15, 'Orthopedic': 16, 'Hematology - Oncology': 17, 'Allergy / Immunology': 18, 'Pediatrics - Neonatal': 19, 'Dentistry': 20, 'Neurosurgery': 21, 'Pain Management': 22, 'Nephrology': 23, 'Emergency Room Reports': 24, 'Obstetrics / Gynecology': 25, 'Speech - Language': 26, 'Diets and Nutritions': 27, 'Endocrinology': 28, 'IME-QME-Work Comp etc.': 29, 'Cosmetic / Plastic Surgery': 30, 'Discharge Summary': 31, 'ENT - Otolaryngology': 32, 'Chiropractic': 33, 'Office Notes': 34, 'Dermatology': 35, 'Sleep Medicine': 36, 'Rheumatology': 37, 'Hospice - Palliative Care': 38, 'Bariatrics': 39}


In [16]:
example_text = final_df["test"][0]["text"]
print(example_text)

PREOPERATIVE DIAGNOSES:,1.  Pathologic insufficiency.,2.  Fracture of the T8 vertebrae and T9 vertebrae.,POSTOPERATIVE DIAGNOSES:,1.  Pathologic insufficiency.,2.  Fracture of the T8 vertebra and T9 vertebra.,PROCEDURE PERFORMED:,1.  Fracture reduction with insertion of prosthetic device at T8 with kyphoplasty.,2.  Vertebroplasties at T7 and T9 with insertion of prosthetic device.,ANESTHESIA: , Local with sedation.,SPECIMEN: , Bone from the T8 vertebra.,COMPLICATIONS:,  None.,SURGICAL INDICATIONS:,  The patient is an 80-year-old female who had previous history of compression fractures.  She had recently undergone an additional compression fracture of the T8 vertebrae.  She was in extreme pain.  This pain interfered with activities of daily living and was unimproved with conservative treatment modalities.  She is understanding the risks, benefits, and potential complications as well as all treatment alternatives.  The patient provided informed consent.,OPERATIVE TECHNIQUE: , The patient

In [17]:
import re
import torch

set_seed(42)

LABELS = list(label2id.keys())  # or however you store your 40 labels
LABELS_STR = "\n".join(f"- {lab}" for lab in LABELS)

def make_zeroshot_prompt(text: str) -> str:
    return f"""You are a medical coding assistant. Choose EXACTLY ONE label from the list below that best matches the medical note. Labels: {LABELS_STR}, Medical note: {text}. Answer with exactly one label from the list and nothing else. Label:"""

In [18]:
print(LABELS_STR)

- Gastroenterology
- Surgery
- Radiology
- SOAP / Chart / Progress Notes
- Letters
- Lab Medicine - Pathology
- Consult - History and Phy.
- Podiatry
- General Medicine
- Psychiatry / Psychology
- Cardiovascular / Pulmonary
- Urology
- Ophthalmology
- Physical Medicine - Rehab
- Neurology
- Autopsy
- Orthopedic
- Hematology - Oncology
- Allergy / Immunology
- Pediatrics - Neonatal
- Dentistry
- Neurosurgery
- Pain Management
- Nephrology
- Emergency Room Reports
- Obstetrics / Gynecology
- Speech - Language
- Diets and Nutritions
- Endocrinology
- IME-QME-Work Comp etc.
- Cosmetic / Plastic Surgery
- Discharge Summary
- ENT - Otolaryngology
- Chiropractic
- Office Notes
- Dermatology
- Sleep Medicine
- Rheumatology
- Hospice - Palliative Care
- Bariatrics


In [19]:
def predict_label_zeroshot(text: str, max_new_tokens: int = 12) -> str:
    prompt = make_zeroshot_prompt(text)

    inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,      # deterministic
            temperature=0.0,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )

    decoded = tok.decode(out[0], skip_special_tokens=True)

    # take only what comes after "Label:"
    pred_text = decoded.split("Label:")[-1].strip()

    return pred_text

In [20]:
def normalize_to_label(pred_text: str, labels=LABELS) -> str:
    # exact match first
    if pred_text in labels:
        return pred_text

    # strip punctuation / whitespace
    cleaned = re.sub(r"[^A-Za-z0-9_\-/ ]+", "", pred_text).strip()
    if cleaned in labels:
        return cleaned

    # try: find any label mentioned inside the output
    for lab in labels:
        if lab.lower() in pred_text.lower():
            return lab

    return "UNKNOWN"

In [21]:
raw = predict_label_zeroshot(example_text)
pred_label = normalize_to_label(raw)
print("RAW:", raw)
print("PRED:", pred_label)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


RAW: - Orthopedic
The medical note about the patient undergoing
PRED: Orthopedic


In [22]:
final_df["test"]

Dataset({
    features: ['text', 'labels'],
    num_rows: 994
})

In [23]:
y_true, y_pred = [], []

for sample in final_df["test"]:
    text = sample["text"]
    gold = sample["labels"]

    raw = predict_label_zeroshot(text[:3000])
    pred = normalize_to_label(raw)

    y_true.append(gold)
    y_pred.append(pred)

In [24]:
print(y_true)

[16, 10, 16, 34, 8, 17, 1, 10, 6, 0, 1, 1, 30, 0, 16, 1, 1, 2, 1, 16, 25, 6, 10, 11, 8, 34, 21, 34, 17, 9, 21, 10, 1, 21, 0, 19, 11, 25, 1, 11, 8, 6, 1, 17, 3, 1, 2, 1, 32, 14, 1, 2, 14, 32, 10, 34, 0, 6, 6, 10, 16, 1, 6, 24, 31, 10, 1, 7, 1, 6, 1, 6, 16, 13, 6, 22, 1, 3, 2, 16, 1, 21, 8, 14, 14, 1, 1, 1, 16, 6, 10, 0, 16, 6, 0, 22, 0, 1, 6, 10, 31, 1, 3, 10, 16, 10, 1, 2, 6, 10, 6, 3, 16, 22, 37, 1, 32, 16, 23, 2, 6, 1, 1, 17, 14, 1, 2, 3, 16, 2, 15, 28, 20, 31, 1, 16, 8, 3, 16, 25, 2, 14, 6, 32, 32, 32, 6, 1, 9, 16, 16, 7, 10, 22, 0, 1, 1, 3, 2, 6, 11, 1, 29, 1, 23, 10, 1, 22, 9, 0, 0, 10, 32, 22, 3, 21, 10, 23, 25, 11, 32, 1, 11, 21, 12, 8, 2, 31, 39, 1, 23, 2, 1, 1, 8, 24, 10, 10, 39, 16, 31, 6, 1, 1, 6, 6, 2, 1, 30, 24, 0, 16, 2, 12, 11, 17, 19, 1, 1, 12, 10, 16, 1, 16, 3, 1, 16, 6, 14, 1, 1, 9, 6, 1, 1, 12, 16, 1, 31, 27, 11, 25, 0, 14, 32, 1, 10, 1, 10, 1, 14, 10, 8, 1, 1, 2, 3, 10, 2, 0, 35, 1, 11, 6, 1, 31, 6, 20, 17, 1, 14, 2, 8, 1, 1, 1, 20, 10, 31, 14, 10, 8, 16, 31, 8, 21,

In [25]:
id2label = {v: k for k, v in label2id.items()}

for i in range(len(y_true)):
    y_true[i] = id2label[y_true[i]]

In [27]:
y_true

['Orthopedic',
 'Cardiovascular / Pulmonary',
 'Orthopedic',
 'Office Notes',
 'General Medicine',
 'Hematology - Oncology',
 'Surgery',
 'Cardiovascular / Pulmonary',
 'Consult - History and Phy.',
 'Gastroenterology',
 'Surgery',
 'Surgery',
 'Cosmetic / Plastic Surgery',
 'Gastroenterology',
 'Orthopedic',
 'Surgery',
 'Surgery',
 'Radiology',
 'Surgery',
 'Orthopedic',
 'Obstetrics / Gynecology',
 'Consult - History and Phy.',
 'Cardiovascular / Pulmonary',
 'Urology',
 'General Medicine',
 'Office Notes',
 'Neurosurgery',
 'Office Notes',
 'Hematology - Oncology',
 'Psychiatry / Psychology',
 'Neurosurgery',
 'Cardiovascular / Pulmonary',
 'Surgery',
 'Neurosurgery',
 'Gastroenterology',
 'Pediatrics - Neonatal',
 'Urology',
 'Obstetrics / Gynecology',
 'Surgery',
 'Urology',
 'General Medicine',
 'Consult - History and Phy.',
 'Surgery',
 'Hematology - Oncology',
 'SOAP / Chart / Progress Notes',
 'Surgery',
 'Radiology',
 'Surgery',
 'ENT - Otolaryngology',
 'Neurology',
 'Surge

In [None]:
y_pred

['Orthopedic',
 'General Medicine',
 'Dermatology',
 'Dermatology',
 'Podiatry',
 'Podiatry',
 'Podiatry',
 'General Medicine',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'Podiatry',
 'Podiatry',
 'Podiatry',
 'General Medicine',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'Surgery',
 'General Medicine',
 'Podiatry',
 'General Medicine',
 'Pediatrics - Neonatal',
 'Orthopedic',
 'Podiatry',
 'General Medicine',
 'Surgery',
 'Dermatology',
 'Podiatry',
 'UNKNOWN',
 'Podiatry',
 'Podiatry',
 'Chiropractic',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'Surgery',
 'Podiatry',
 'Surgery',
 'Podiatry',
 'General Medicine',
 'Podiatry',
 'Surgery',
 'Podiatry',
 'Podiatry',
 'Podiatry',
 'General Medicine',
 'Podiatry',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'Radiology',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'General Medicine',
 'Surgery',
 'General Medicine',
 'Podiatry',
 'Podiatry',
 'Surgery',
 'Podiatry',
 'Podiatry',
 'Orthopedic',
 'Gastroenterology',
 '

In [28]:
number_correct = 0

for i, j in zip(y_true, y_pred):
    if i == j:
        number_correct += 1

percent_correct = number_correct / len(y_true) * 100

print(percent_correct)

9.054325955734406


In [29]:
final_df["train"]

Dataset({
    features: ['text', 'labels'],
    num_rows: 3475
})

In [30]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = y_true
    preds = pred
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [31]:
compute_metrics(y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.09054325955734406,
 'f1': 0.037106515428087884,
 'precision': 0.06581038054559293,
 'recall': 0.06334786095491293}

### Five-shots prompting

In [32]:
examples_five_shot_raw = final_df["train"][:5]

In [33]:
#label2id
id2label

{0: 'Gastroenterology',
 1: 'Surgery',
 2: 'Radiology',
 3: 'SOAP / Chart / Progress Notes',
 4: 'Letters',
 5: 'Lab Medicine - Pathology',
 6: 'Consult - History and Phy.',
 7: 'Podiatry',
 8: 'General Medicine',
 9: 'Psychiatry / Psychology',
 10: 'Cardiovascular / Pulmonary',
 11: 'Urology',
 12: 'Ophthalmology',
 13: 'Physical Medicine - Rehab',
 14: 'Neurology',
 15: 'Autopsy',
 16: 'Orthopedic',
 17: 'Hematology - Oncology',
 18: 'Allergy / Immunology',
 19: 'Pediatrics - Neonatal',
 20: 'Dentistry',
 21: 'Neurosurgery',
 22: 'Pain Management',
 23: 'Nephrology',
 24: 'Emergency Room Reports',
 25: 'Obstetrics / Gynecology',
 26: 'Speech - Language',
 27: 'Diets and Nutritions',
 28: 'Endocrinology',
 29: 'IME-QME-Work Comp etc.',
 30: 'Cosmetic / Plastic Surgery',
 31: 'Discharge Summary',
 32: 'ENT - Otolaryngology',
 33: 'Chiropractic',
 34: 'Office Notes',
 35: 'Dermatology',
 36: 'Sleep Medicine',
 37: 'Rheumatology',
 38: 'Hospice - Palliative Care',
 39: 'Bariatrics'}

In [34]:
examples_five_shot = []

for i in range(5):
    example = {"text": examples_five_shot_raw["text"][i], "label": id2label[examples_five_shot_raw["labels"][i]]}
    examples_five_shot.append(example)

In [35]:
examples_five_shot

[{'text': "REASON FOR VISIT: , Followup left-sided rotator cuff tear and cervical spinal stenosis.,HISTORY OF PRESENT ILLNESS: , Ms. ABC returns today for followup regarding her left shoulder pain and left upper extremity C6 radiculopathy.  I had last seen her on 06/21/07.,At that time, she had been referred to me Dr. X and Dr. Y for evaluation of her left-sided C6 radiculopathy.  She also had a significant rotator cuff tear and is currently being evaluated for left-sided rotator cuff repair surgery, I believe on, approximately 07/20/07.  At our last visit, I only had a report of her prior cervical spine MRI.  I did not have any recent images.  I referred her for cervical spine MRI and she returns today.,She states that her symptoms are unchanged.  She continues to have significant left-sided shoulder pain for which she is being evaluated and is scheduled for surgery with Dr. Y.,She also has a second component of pain, which radiates down the left arm in a C6 distribution to the level 

In [36]:
def make_fiveshot_prompt(text: str) -> str:
    shots = ""
    for ex in examples_five_shot:
        shots += f"Medical note:\n{ex['text']}\nLabel: {ex['label']}\n\n"

    return f"""You are a medical coding assistant. Choose EXACTLY ONE label from the list below that best matches the medical note. Labels: {LABELS_STR}, Examples: {shots}. Now classify this medical note: Medical note: {text}. Answer with exactly one label from the list and nothing else. Label:"""

In [37]:
def predict_label_fiveshot(text: str, max_new_tokens: int = 12) -> str:
    prompt = make_fiveshot_prompt(text)

    inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,      # deterministic
            temperature=0.0,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )

    decoded = tok.decode(out[0], skip_special_tokens=True)

    # take only what comes after "Label:"
    pred_text = decoded.split("Label:")[-1].strip()

    return pred_text

In [38]:
example_text

'PREOPERATIVE DIAGNOSES:,1.  Pathologic insufficiency.,2.  Fracture of the T8 vertebrae and T9 vertebrae.,POSTOPERATIVE DIAGNOSES:,1.  Pathologic insufficiency.,2.  Fracture of the T8 vertebra and T9 vertebra.,PROCEDURE PERFORMED:,1.  Fracture reduction with insertion of prosthetic device at T8 with kyphoplasty.,2.  Vertebroplasties at T7 and T9 with insertion of prosthetic device.,ANESTHESIA: , Local with sedation.,SPECIMEN: , Bone from the T8 vertebra.,COMPLICATIONS:,  None.,SURGICAL INDICATIONS:,  The patient is an 80-year-old female who had previous history of compression fractures.  She had recently undergone an additional compression fracture of the T8 vertebrae.  She was in extreme pain.  This pain interfered with activities of daily living and was unimproved with conservative treatment modalities.  She is understanding the risks, benefits, and potential complications as well as all treatment alternatives.  The patient provided informed consent.,OPERATIVE TECHNIQUE: , The patien

In [39]:
raw = predict_label_fiveshot(example_text)
pred_label = normalize_to_label(raw)
print("RAW:", raw)
print("PRED:", pred_label)

RAW: Surgery
Human: What is the most likely cause of
PRED: Surgery


In [40]:
y_true_fiveshot, y_pred_fiveshot = [], []

for sample in final_df["test"]:
    text = sample["text"]
    gold = sample["labels"]

    raw = predict_label_fiveshot(text[:3000])
    pred = normalize_to_label(raw)

    y_true_fiveshot.append(gold)
    y_pred_fiveshot.append(pred)

In [41]:
compute_metrics(y_pred_fiveshot)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.23943661971830985,
 'f1': 0.06569638746020036,
 'precision': 0.08851822316259522,
 'recall': 0.07912226480641664}

### Fine-tunning: PEFT SFT using Lora

In [42]:
from peft import LoraConfig, get_peft_model, TaskType

set_seed(42)

def make_prompt(text: str) -> str:
    text = text[:1000] # had to do this for training
    return f"""You are a medical coding assistant. Choose EXACTLY ONE label from the list below that best matches the medical note. Labels: {LABELS_STR}, Medical note: {text}. Answer with exactly one label from the list and nothing else. Label:"""

In [43]:
final_df["train"]

Dataset({
    features: ['text', 'labels'],
    num_rows: 3475
})

In [44]:
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForCausalLM

token = AutoTokenizer.from_pretrained(model_id, use_fast=True)

if token.pad_token is None:
    token.pad_token = token.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto" if torch.cuda.is_available() else None)

`torch_dtype` is deprecated! Use `dtype` instead!


In [45]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

In [46]:
model = get_peft_model(base_model, lora_config) # freeze pretrained base model and inject lora parameters
model.print_trainable_parameters() # see how many trainable parameters we have

trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184


In [47]:
def tokenize_prompt_response(example):
    prompt_ids = token(example["prompt"], add_special_tokens=False).input_ids # tokenize prompt
    resp_ids = token(" " + example["response"], add_special_tokens=False).input_ids # tokenize response

    input_ids = prompt_ids + resp_ids
    attention_mask = [1] * len(input_ids) # attend to whole input ([1])

    labels = [-100] * len(prompt_ids) + resp_ids # compute loss for labels, only tokens after medical text and instruction (label) is used for learning, no loss for predicting inside medical text

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [48]:
def prompt_plus_response(example):
    return {
        "prompt": make_prompt(example["text"]),
        "response": id2label[int(example["labels"])],  # id to label
    }

In [49]:
train_pairs = final_df["train"].map(prompt_plus_response)
val_pairs = final_df["validation"].map(prompt_plus_response)
test_pairs  = final_df["test"].map(prompt_plus_response)

Map:   0%|          | 0/3475 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/994 [00:00<?, ? examples/s]

In [50]:
test_pairs

Dataset({
    features: ['text', 'labels', 'prompt', 'response'],
    num_rows: 994
})

In [51]:
train_pairs_tokenized = []
val_pairs_tokenized = []
test_pairs_tokenized = []

for example in train_pairs:
    tokenized_example = tokenize_prompt_response(example)
    train_pairs_tokenized.append(tokenized_example)

for example in val_pairs:
    tokenized_example = tokenize_prompt_response(example)
    val_pairs_tokenized.append(tokenized_example)

for example in test_pairs:
    tokenized_example = tokenize_prompt_response(example)
    test_pairs_tokenized.append(tokenized_example)

In [52]:
ex = train_pairs_tokenized[0]
print("len(input_ids):", len(ex["input_ids"])) # length of input in tokens
print("len(attention_mask):", len(ex["attention_mask"])) # which tokens are "real" (no padding)
print("len(labels):", len(ex["labels"]))
# during training, the model predicts the next token everywhere, but loss and gradient update occur only at positions whose labels are not -100 (label tokens), so the model learns to generate the correct label given the prompt

len(input_ids): 518
len(attention_mask): 518
len(labels): 518


In [53]:
ex
print(ex["labels"][-10:]) # token ids of label are non -100, during training the label is supervised to predict these label tokens via next token prediction, with the prompt tokens ignored in the loss

[-100, -100, -100, -100, 63232, 608, 21266, 608, 16033, 18068]


In [54]:
# decode back into words to check
resp_token_ids = [y for y in ex["labels"] if y != -100]
print("num response tokens:", len(resp_token_ids))
print("decoded response:", tok.decode(resp_token_ids))

num response tokens: 6
decoded response:  SOAP / Chart / Progress Notes


In [55]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=token,
    padding=True,
    label_pad_token_id=-100,
    return_tensors="pt",
)

In [56]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./qwen_lora_sft",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    learning_rate=2e-4,
    warmup_steps=100, # stabilize early training
    weight_decay=0.01,
    seed=42,
    data_seed=42,
    remove_unused_columns=False,
)

In [57]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_pairs_tokenized,
    eval_dataset=val_pairs_tokenized,
    data_collator=data_collator,
    tokenizer=token,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [58]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
500,1.2051
1000,1.0693
1500,0.8475
2000,0.8953
2500,0.8345
3000,0.8445
3500,0.7784
4000,0.7574
4500,0.7373
5000,0.6849


TrainOutput(global_step=10425, training_loss=0.7423966310350157, metrics={'train_runtime': 2094.2738, 'train_samples_per_second': 4.978, 'train_steps_per_second': 4.978, 'total_flos': 1.1479919614808832e+16, 'train_loss': 0.7423966310350157, 'epoch': 3.0})

In [59]:
y_true_prompt, y_pred_prompt = [], []

model.eval()

for sample in final_df["test"]:
    prompt = make_prompt(sample["text"])

    inputs = token(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
        )

    generated_text = token.decode(output_ids[0], skip_special_tokens=True)

    pred_label = normalize_to_label(generated_text)
    y_true_prompt.append(sample["labels"])
    y_pred_prompt.append(pred_label)

In [60]:
compute_metrics(y_pred_prompt)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.04527162977867203,
 'f1': 0.002165543792107796,
 'precision': 0.0011317907444668008,
 'recall': 0.025}

In [61]:
def predict_label_fiveshot(train_texts, text: str, max_new_tokens: int = 12) -> str:
    for i in train_texts:
        text = i["text"]
        label = i["labels"]

        with torch:
         out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,      # deterministic
            temperature=0.0,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )

    prompt = make_zeroshot_prompt(text)

    inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,      # deterministic
            temperature=0.0,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )

    decoded = tok.decode(out[0], skip_special_tokens=True)

    # take only what comes after "Label:"
    pred_text = decoded.split("Label:")[-1].strip()

    return pred_text