<a href="https://colab.research.google.com/github/QiaoLin22/MASTER-LLM-DL/blob/main/ModernBERT_Large_LLMRouter_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Finetuning ModernBERT Large for LLM Router Classification

In [5]:
%%capture
# Install Pytorch & other libraries
%pip install "torch==2.4.1" tensorboard
%pip install flash-attn "setuptools<71.0.0" scikit-learn
%pip install --upgrade torchvision

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.1.0" \
  "accelerate==1.2.1" \
  "hf-transfer==0.1.8"

# ModernBERT is not yet available in an official transformers release, so we need to install it from github
%pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

In [1]:
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_TOKEN'), add_to_git_credential=True)

In [2]:
from datasets import load_dataset

# Dataset id from huggingface.co/dataset
dataset_id = "DevQuasar/llm_router_dataset-synth"

# Load raw dataset
raw_dataset = load_dataset(dataset_id)

# Split into our Test & Train sets
train_dataset = raw_dataset['train']
test_dataset = raw_dataset['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 15306
Test dataset size: 4921


In [3]:
train_dataset = train_dataset.remove_columns(["id"])
train_dataset = train_dataset.rename_column("prompt", "text")
train_dataset = train_dataset.rename_column("label", "labels")

test_dataset = test_dataset.remove_columns(["id"])
test_dataset = test_dataset.rename_column("prompt", "text")
test_dataset = test_dataset.rename_column("label", "labels")

In [4]:
test_dataset[:3]

{'text': ['What were the key factors that influenced the development of Medieval Gothic architecture in Europe?',
  'How did the discovery of gravitational waves impact our understanding of cosmic inflation and the origins of the universe?',
  'Can you explain the concept of allosteric inhibition in enzyme kinetics, including its significance in pharmaceutical design?'],
 'labels': [1, 1, 1]}

In [5]:
from transformers import AutoTokenizer

# Model ID
model_id = "answerdotai/ModernBERT-large"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Notes that all inputs should be at max 1024 tokens.
# Text longer will be truncated, and text shorter will be padded with special tokens to maintain consistency.
tokenizer.model_max_length = 1024

# Tokenize helper function
# Take in a batch of text to tokenize, return back the tokenized text.
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt")

In [6]:
# Tokenize train dataset
tokenized_train_dataset = train_dataset.map(tokenize, batched=True,remove_columns=["text"])

# Tokenize test dataset
tokenized_test_dataset = test_dataset.map(tokenize, batched=True,remove_columns=["text"])

print(tokenized_train_dataset.features.keys())
print(tokenized_test_dataset.features.keys())

Map:   0%|          | 0/15306 [00:00<?, ? examples/s]

Map:   0%|          | 0/4921 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])
dict_keys(['labels', 'input_ids', 'attention_mask'])


In [7]:
label2id = {"small_llm": "0", "large_llm": "1"}
id2label = {"0": "small_llm", "1": "large_llm"}

In [8]:
from transformers import AutoModelForSequenceClassification
# %pip install "git+https://github.com/huggingface/transformers.git" --upgrade
# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-large"

# Prepare model labels - useful for inference
labels = tokenized_train_dataset.features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
).to('cuda')

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
from sklearn.metrics import f1_score

# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}

In [10]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Define our training configuration
training_args = TrainingArguments(
    output_dir= "ModernBERT-large-llm-router",  # Directory where model checkpoints will be saved
    per_device_train_batch_size=32,         # Number of samples processed at once during training
    per_device_eval_batch_size=16,          # Number of samples processed at once during evaluation
    learning_rate=5e-5,                     # How quickly the model updates its weights
    num_train_epochs=5,                     # Number of complete passes through the training data
    bf16=True,                              # Use bfloat16 for faster, memory-efficient training
    optim="adamw_torch_fused",              # Optimized version of AdamW optimizer for better performance

    # Configure how and when to log training progress
    logging_strategy="steps",
    logging_steps=100,                      # Log metrics every 100 training steps
    eval_strategy="epoch",                  # Evaluate model after each epoch
    save_strategy="epoch",                  # Save model after each epoch
    save_total_limit=2,                     # Only keep the 2 best model checkpoints
    load_best_model_at_end=True,            # Load the best model when training finishes
    metric_for_best_model="f1",             # Use F1 score to determine which model is best

    # HuggingFace Hub integration settings
    report_to="tensorboard",                # Log metrics to Tensorboard
    push_to_hub=True,                       # Upload model to HuggingFace Hub
    hub_strategy="every_save",              # Push to Hub whenever we save a checkpoint
    hub_token=HfFolder.get_token(),         # Authentication for HuggingFace Hub
)

# Create trainer with our model, data, and training configuration
trainer = Trainer(
    model=model,                            # Our BERT model with classification head
    args=training_args,                     # Training configuration we defined above
    train_dataset=tokenized_train_dataset,  # Our processed training data
    eval_dataset=tokenized_test_dataset,    # Our processed test data
    compute_metrics=compute_metrics,        # Our F1 score calculation function
)

In [11]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0292,0.027499,0.99051
2,0.0099,0.039781,0.990914
3,0.0011,0.070989,0.992327
4,0.0006,0.045149,0.992327
5,0.0006,0.047054,0.992327


TrainOutput(global_step=2395, training_loss=0.01571207176699519, metrics={'train_runtime': 522.7027, 'train_samples_per_second': 146.412, 'train_steps_per_second': 4.582, 'total_flos': 1.6186952304488448e+17, 'train_loss': 0.01571207176699519, 'epoch': 5.0})

In [12]:
# Save processor and create model card
tokenizer.save_pretrained("ModernBERT-large-llm-router")
trainer.create_model_card()
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/stardriver007/ModernBERT-large-llm-router/commit/f83824789c3ac3515f70e22b83d806e2e365d7c9', commit_message='End of training', commit_description='', oid='f83824789c3ac3515f70e22b83d806e2e365d7c9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/stardriver007/ModernBERT-large-llm-router', endpoint='https://huggingface.co', repo_type='model', repo_id='stardriver007/ModernBERT-large-llm-router'), pr_revision=None, pr_num=None)

In [13]:
# Note - Run above dependencies cell to install necessary packages

from transformers import pipeline

# load model from huggingface.co/models using our repository id
classifier = pipeline("text-classification", model="stardriver007/ModernBERT-large-llm-router", device=0)



config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Device set to use cuda:0


In [14]:
sample_1 = "What role does chromatin remodeling play in epigenetic regulation during embryonic development, particularly in cell fate determination and tissue specification?"
prediction_1 = classifier(sample_1)
print(prediction_1)



[{'label': 'large_llm', 'score': 1.0}]


In [15]:
sample_2 = "Why is the sky blue?"
prediction_2 = classifier(sample_2)
print(prediction_2)

[{'label': 'small_llm', 'score': 1.0}]
