<a href="https://colab.research.google.com/drive/your-notebook-name" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tune BioBERT on PubMed Abstracts

This notebook fine-tunes BioBERT (`allenai/biobert-v1.1-pubmed`) on PubMed abstracts using Masked Language Modeling (MLM) for biomedical NLP tasks, such as improving spellchecking in medical texts (e.g., correcting 'arbitysratsddion' to 'arteries'). The model is trained in Google Colab with GPU support and saved to Google Drive for local use in a spellchecker project.

**Setup**: Google Colab (GPU), Hugging Face `pubmed` dataset, PyTorch.
**Output**: Fine-tuned BioBERT model and tokenizer saved to `/content/drive/MyDrive/biobert_finetuned`.

In [1]:
!pip install transformers==4.28.0 datasets==2.10.0 torch==1.13.1 sentencepiece==0.1.97 numpy==1.24.2

# Verify installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 1.13.1+cu117
CUDA available: True
GPU: Tesla T4


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Set output directory
output_dir = '/content/drive/MyDrive/biobert_finetuned'
import os
os.makedirs(output_dir, exist_ok=True)

Mounted at /content/drive


In [5]:
from datasets import load_dataset

# Load PubMed dataset (10% subset for Colab compatibility)
dataset = load_dataset('pubmed_abstracts', split='train[:10%]')

# Inspect dataset
print(dataset)
print(dataset[0]['Abstract']['AbstractText'] if 'Abstract' in dataset[0] else 'No abstract')

FileNotFoundError: Couldn't find a dataset script at /content/pubmed_abstracts/pubmed_abstracts.py or any data file in the same directory. Couldn't find 'pubmed_abstracts' on the Hugging Face Hub either: FileNotFoundError: Dataset 'pubmed_abstracts' doesn't exist on the Hub. If the repo is private or gated, make sure to log in with `huggingface-cli login`.

In [None]:
from transformers import BertTokenizer, DataCollatorForLanguageModeling

# Load BioBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('allenai/biobert_v1.1_pubmed')

# Preprocess function
def preprocess_function(examples):
    texts = [item['AbstractText'] for item in examples['Abstract'] if item['AbstractText']]
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    return encodings

# Apply preprocessing
encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import BertForMaskedLM, Trainer, TrainingArguments

# Load BioBERT model
model = BertForMaskedLM.from_pretrained('allenai/biobert_v1.1_pubmed')

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced for Colab free tier
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{output_dir}/logs',
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    fp16=True if torch.cuda.is_available() else False,  # Mixed precision for GPU
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    data_collator=data_collator,
)

# Train
trainer.train()

# Save model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f'Model saved to {output_dir}')

In [None]:
from transformers import pipeline

# Test the fine-tuned model
fill_mask = pipeline('fill-mask', model=output_dir, tokenizer=output_dir)
test_sentence = 'Hypertension is a [MASK] condition.'
results = fill_mask(test_sentence)
for result in results:
    print(f"Token: {result['token_str']}, Score: {result['score']:.4f}")

In [None]:
# Zip the model for download
!zip -r /content/biobert_finetuned.zip /content/drive/MyDrive/biobert_finetuned

from google.colab import files
files.download('/content/biobert_finetuned.zip')
print('Download the zip file to your local machine')