# Domain‑Adaptive Pretraining (DAPT)
This notebook demonstrates how to continue Masked Language Model pretraining for DeBERTa‑v3‑large on an unlabeled news corpus.

> Reference: Gururangan et al. (2020), 'Don’t Stop Pretraining: Adapt Language Models to Domains and Tasks.'

In [None]:
# Install dependencies (first run only)
!pip install -q transformers datasets torch accelerate peft evaluate pandas numpy matplotlib

In [None]:
# (Optional) Mount Google Drive to save/load checkpoints
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import os
import math
import torch
from src.pretrain_lm import run_dapt
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset

# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value lives here.
# ---------------------------------------------------------------------------

# Base checkpoint to adapt, the unlabeled news corpus, and the checkpoint folder.
MODEL_NAME = 'microsoft/deberta-v3-large'
DATA_FILE = 'data/external/unlabeled.txt'
OUTPUT_DIR = 'outputs/dapt_checkpoints/'

# DAPT training schedule.
NUM_EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Masked-LM data settings: tokens per block and masking probability.
BLOCK_SIZE = 512
MLM_PROB = 0.15

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Confirm the corpus exists before starting a long training run.
assert os.path.exists(DATA_FILE), f"Data file not found: {DATA_FILE}"
# Create the checkpoint directory up front (no-op when it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Run Domain‑Adaptive Pretraining
# NOTE(review): BLOCK_SIZE and MLM_PROB are not forwarded to run_dapt here;
# confirm that its internal defaults match the values used for evaluation below.
run_dapt(
    model_name=MODEL_NAME,
    data_file=DATA_FILE,
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)

In [None]:
# 2. Evaluate Perplexity of Adapted Model
import pandas as pd
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
# Bring back the tokenizer and MLM weights stored under OUTPUT_DIR.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(OUTPUT_DIR)
model = model.to(DEVICE)
# Read the corpus from disk again so this cell works on a fresh, unmodified copy.
raw_ds = load_dataset(path='text', data_files={'train': DATA_FILE})['train']
# Tokenization & grouping function
def tokenize_and_group(examples, tok_fn=None, block_size=None):
    """Tokenize a batch of text lines and pack the tokens into fixed-size blocks.

    Intended for ``Dataset.map(..., batched=True)``: the number of returned
    rows (blocks) may differ from the number of input lines.

    Args:
        examples: Batch mapping with a ``'text'`` key holding a list of strings.
        tok_fn: Callable used to tokenize the lines. It is called as
            ``tok_fn(texts, return_special_tokens_mask=True)`` and must return
            a mapping of field name -> list of per-line lists that includes
            ``'input_ids'``. Defaults to the notebook-level ``tokenizer``.
        block_size: Number of tokens per block. Defaults to ``BLOCK_SIZE``.

    Returns:
        A dict with the same fields the tokenizer produced, where every value
        is a list of flat, ``block_size``-long lists. Trailing tokens that do
        not fill a whole block are dropped, so a batch with fewer than
        ``block_size`` tokens yields zero rows.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    from itertools import chain

    if tok_fn is None:
        tok_fn = tokenizer
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # No padding or truncation: lines are packed back-to-back below, so pad
    # tokens would only dilute the blocks and truncation would discard text.
    encoded = tok_fn(examples['text'], return_special_tokens_mask=True)

    # Flatten every per-line field into one continuous stream for this batch.
    streams = {key: list(chain.from_iterable(encoded[key])) for key in encoded.keys()}

    # Keep only whole blocks so every row has exactly block_size tokens.
    usable = (len(streams['input_ids']) // block_size) * block_size
    return {
        key: [stream[start:start + block_size] for start in range(0, usable, block_size)]
        for key, stream in streams.items()
    }
# Tokenize the corpus into evaluation blocks; the raw 'text' column is dropped.
blocks = raw_ds.map(tokenize_and_group, batched=True, remove_columns=['text'])
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=MLM_PROB)
loader = DataLoader(blocks, batch_size=BATCH_SIZE, collate_fn=collator)

# Masking is random, so fix the torch seed to make the reported numbers repeatable.
# NOTE(review): assumes the collator samples from torch's global RNG — confirm
# for the installed transformers version.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

# Compute perplexity
model.eval()
total_loss = 0.0
total_masked = 0
for batch in loader:
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    # Labels are -100 everywhere except at masked positions.
    n_masked = int((batch['labels'] != -100).sum().item())
    if n_masked == 0:
        # Nothing was masked in this batch, so it carries no usable loss.
        continue
    # Weight each batch's mean loss by its masked-token count so that small
    # or lightly-masked batches do not skew the corpus-level average.
    total_loss += outputs.loss.item() * n_masked
    total_masked += n_masked

if total_masked == 0:
    raise ValueError("No masked tokens were evaluated; is the evaluation dataset empty?")

avg_loss = total_loss / total_masked
print(f'Average Loss: {avg_loss:.4f}\nPerplexity: {math.exp(avg_loss):.2f}')