# BERT Pretraining on Historical Text Data

Step-by-step walkthrough of continued masked-language-model (MLM) pretraining of `bert-base-uncased` on EEBO/ECCO/EVAN historical text data.

**Key features:**
- Load data from CSV with pandas
- Efficient preprocessing with Hugging Face Datasets (dynamic padding)
- Chunk each page into pieces of at most 250 text tokens, formatted as `[CLS] <date> [TIME] <text_chunk> [SEP]`
- Apply random MLM masking (15% of tokens) to both the date and the text
- Train using Trainer API with validation monitoring

In [None]:
# Import Required Libraries

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import BertTokenizer, BertForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset

# Report the runtime environment (library version and accelerator) up front
# so the conditions of a run are visible at the top of the notebook.
_cuda_available = torch.cuda.is_available()
_device_name = 'cuda' if _cuda_available else 'cpu'

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_available}")
print(f"Device: {torch.device(_device_name)}")

Data path exists: True
Found 5012 XML files


In [None]:
# Load CSV Data

# Find CSV files in data directory.
# The listing is sorted because glob() order depends on the filesystem; a
# stable file order gives a stable row order, which the seeded
# train/validation split later in the notebook relies on for reproducibility.
data_dir = Path('data')
csv_files = sorted(data_dir.glob('*.csv'))

# Fail with a clear message instead of the opaque "No objects to concatenate"
# error that pd.concat raises on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir.resolve()}")

print(f"Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name} ({size_mb:.1f} MB)")

# Load and combine all CSV files
all_data = []
for csv_path in csv_files:
    print(f"\nLoading {csv_path.name}...")
    df = pd.read_csv(csv_path)
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    all_data.append(df)

# Combine
combined_df = pd.concat(all_data, ignore_index=True)

# The tokenization cell reads the 'date' and 'page_text' columns; check for
# them here so a schema problem surfaces before the expensive preprocessing.
missing_columns = {'date', 'page_text'} - set(combined_df.columns)
if missing_columns:
    raise KeyError(f"Combined data is missing required column(s): {sorted(missing_columns)}")

print(f"\nTotal combined rows: {len(combined_df)}")
print(f"\nFirst row sample:")
print(combined_df.iloc[0])

Testing parser on: N00001.p4.xml

Extracted 49 pages

First page sample:
    author                place             date                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           page_text
0  Unknown  [Cambridge, Mass. :  Imprinted 1640.  THE VVHOLE BOOKE OF PSALMES Faithfully TRANSLATED into ENGLISH Metre. Whereunto is prefixed a discourse de∣claring not only the lawfullnes, but also the necessity of the heavenly Ordinance of singing Scripture Psalmes in the Churches of God. Coll.  III. \n Let the word of God dwe

In [None]:
# Load Pre-trained Model and Tokenizer

model_name = 'bert-base-uncased'
print(f"Loading tokenizer from: {model_name}")
tokenizer = BertTokenizer.from_pretrained(model_name)

# Add custom [TIME] token.
# It is registered as a *special* token rather than via plain add_tokens():
# an uncased tokenizer normalizes ordinary added tokens to lower case, so the
# literal "[TIME]" string used when building sequences could silently resolve
# to [UNK]. Special tokens keep their exact spelling. As a special token,
# [TIME] is also excluded from random MLM masking by the data collator, so it
# always remains a fixed separator between the date and the text.
# get_vocab() (unlike .vocab) includes added tokens, which keeps this check
# meaningful if the cell is re-run on an already-extended tokenizer.
if "[TIME]" not in tokenizer.get_vocab():
    tokenizer.add_special_tokens({"additional_special_tokens": ["[TIME]"]})
    print("Added [TIME] token to vocabulary")

# Fail fast if the separator does not have its own id.
time_token_id = tokenizer.convert_tokens_to_ids("[TIME]")
assert time_token_id != tokenizer.unk_token_id, "[TIME] resolves to [UNK]; token was not registered"

print(f"Vocabulary size: {len(tokenizer)}")
print(f"Sample tokenization:")

sample_text = "The quick brown fox jumps over the lazy dog"
tokens = tokenizer.tokenize(sample_text)
print(f"  Text: {sample_text}")
print(f"  Tokens: {tokens}")
print(f"  Token IDs: {tokenizer.convert_tokens_to_ids(tokens)}")

In [None]:
# Create Hugging Face Dataset and Apply Preprocessing

def tokenize_and_chunk_function(examples, tokenizer, max_chunk_length=250, max_seq_length=512):
    """
    Tokenize a batch of pages and split each page into date-prefixed chunks.

    Intended to be called through ``Dataset.map(..., batched=True)``; one input
    row can produce zero, one or many output rows.
    Format of every produced sequence: [CLS] <date> [TIME] <text_chunk> [SEP]

    Args:
        examples: Batch dict with parallel lists under 'date' and 'page_text'.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token`` (and optionally ``unk_token_id``).
        max_chunk_length: Maximum number of *text* tokens per chunk.
        max_seq_length: Maximum total sequence length, including the date
            tokens and the three structural tokens ([CLS], [TIME], [SEP]).

    Returns:
        Dict with 'input_ids' and 'attention_mask', each a list with one entry
        per produced chunk (both empty when nothing could be produced).

    Raises:
        ValueError: If ``max_chunk_length`` is not positive, or if "[TIME]"
            maps to the tokenizer's unknown-token id (i.e. it was never added).
    """
    if max_chunk_length < 1:
        raise ValueError(f"max_chunk_length must be >= 1, got {max_chunk_length}")

    time_token = "[TIME]"
    # Guard against silently training with [UNK] in place of the separator.
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if unk_id is not None and tokenizer.convert_tokens_to_ids(time_token) == unk_id:
        raise ValueError(
            "[TIME] maps to the unknown-token id; add it to the tokenizer before tokenizing"
        )

    structural_tokens = 3  # [CLS], [TIME], [SEP]
    all_input_ids = []
    all_attention_masks = []

    for date, text in zip(examples['date'], examples['page_text']):
        # Skip rows with missing/empty text or a missing date.
        if not text or pd.isna(text) or pd.isna(date):
            continue

        # Tokenize date and text
        date_tokens = tokenizer.tokenize(str(date).strip())
        text_tokens = tokenizer.tokenize(str(text).strip())

        # Budget the chunk size so that prefix + chunk always fits in
        # max_seq_length. Previously over-long sequences were dropped after
        # the fact, silently losing their text.
        chunk_length = min(max_chunk_length, max_seq_length - len(date_tokens) - structural_tokens)
        if chunk_length <= 0:
            # The date prefix alone leaves no room for any text token.
            continue

        prefix = [tokenizer.cls_token, *date_tokens, time_token]

        # Create chunks of text with date prefix
        for chunk_start in range(0, len(text_tokens), chunk_length):
            chunk_tokens = text_tokens[chunk_start:chunk_start + chunk_length]
            tokens = prefix + chunk_tokens + [tokenizer.sep_token]

            # Convert to IDs; no padding here, so every position is attended to.
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            all_input_ids.append(input_ids)
            all_attention_masks.append([1] * len(input_ids))

    return {
        'input_ids': all_input_ids,
        'attention_mask': all_attention_masks,
    }

# Convert to HF Dataset and apply tokenization
print("Converting to Hugging Face Dataset...")
dataset = Dataset.from_pandas(combined_df)
print(f"Dataset size: {len(dataset)}")

print("\nApplying tokenization and chunking with batched=True...")
tokenized_dataset = dataset.map(
    tokenize_and_chunk_function,
    # Pass arguments explicitly instead of closing over them in a lambda.
    fn_kwargs={'tokenizer': tokenizer, 'max_chunk_length': 250},
    batched=True,
    batch_size=100,  # Process 100 examples at a time
    # The mapped function changes the number of rows, so *every* original
    # column has to be dropped; a hard-coded list breaks as soon as the CSV
    # carries any extra column.
    remove_columns=dataset.column_names,
    num_proc=2,  # Use 2 processes
)

print(f"Tokenized dataset size: {len(tokenized_dataset)}")

# Indexing an empty dataset raises IndexError; report the real problem instead.
if len(tokenized_dataset) == 0:
    raise ValueError("Tokenization produced no sequences; check the 'date' and 'page_text' columns")

print(f"\nExample sample:")
example = tokenized_dataset[0]
print(f"  Input IDs length: {len(example['input_ids'])}")
print(f"  Attention mask length: {len(example['attention_mask'])}")

Processing 5012 files...


100%|██████████| 5012/5012 [11:01<00:00,  7.58it/s]  



Total pages extracted: 161599

DataFrame info:
<class 'pandas.DataFrame'>
RangeIndex: 161599 entries, 0 to 161598
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   author     161599 non-null  str  
 1   place      161599 non-null  str  
 2   date       161599 non-null  str  
 3   page_text  161599 non-null  str  
dtypes: str(4)
memory usage: 4.9 MB
None

First few rows:
    author                place             date  \
0  Unknown  [Cambridge, Mass. :  Imprinted 1640.   
1  Unknown  [Cambridge, Mass. :  Imprinted 1640.   
2  Unknown  [Cambridge, Mass. :  Imprinted 1640.   
3  Unknown  [Cambridge, Mass. :  Imprinted 1640.   
4  Unknown  [Cambridge, Mass. :  Imprinted 1640.   

                                           page_text  
0  THE VVHOLE BOOKE OF PSALMES Faithfully TRANSLA...  
1  The Preface. THe singing of Psalmes, though it...  
2              chron Reu. Reu. Num. Reu. Gal. chron.  
3                           

In [None]:
# Split Data and Set Up Training

# Hold out 10% of the tokenized chunks for validation; the fixed seed keeps
# the partition identical from run to run.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Start from the published checkpoint weights
print(f"\nLoading model: {model_name}")
model = BertForMaskedLM.from_pretrained(model_name)

# The tokenizer gained the [TIME] token, so the embedding matrix has to be
# resized to the tokenizer's current length.
new_vocab_size = len(tokenizer)
model.resize_token_embeddings(new_vocab_size)
print(f"Model resized for new vocabulary size: {len(tokenizer)}")
first_parameter = next(model.parameters())
print(f"Model device: {first_parameter.device}")

# MLM collator: pads each batch on the fly and randomly masks 15% of tokens
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

print("\nData collator ready for dynamic padding!")

In [None]:
# Configure Training Arguments

# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever name the
# installed TrainingArguments dataclass actually declares, so this cell works
# across versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='output/bert_pretrained',
    overwrite_output_dir=True,

    # Training parameters
    num_train_epochs=1,  # Start with 1 epoch for testing
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # Learning and optimization
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,

    # Logging and evaluation
    logging_steps=100,
    save_strategy="epoch",
    # load_best_model_at_end requires the save and evaluation schedules to match
    load_best_model_at_end=True,

    # Hardware
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available

    # Evaluate once per epoch (keyword name resolved above)
    **{_eval_strategy_key: "epoch"},
)

print("Training configuration:")
print(f"  Output directory: {training_args.output_dir}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision: {training_args.fp16}")

In [None]:
# Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("Trainer initialized and ready for training!")

# Estimate the number of optimizer steps.
# - train_batch_size is the per-device batch size scaled by the number of GPUs.
# - Ceil division: the final, partially filled batch is still a training step
#   (floor division under-counted it).
# - Gradient accumulation merges several batches into one optimizer step.
batches_per_epoch = -(-len(train_dataset) // training_args.train_batch_size)
steps_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_steps = int(steps_per_epoch * training_args.num_train_epochs)

print(f"Number of training steps per epoch: {steps_per_epoch}")
print(f"Total training steps: {total_steps}")

In [None]:
# Train the Model

# Announce the (potentially long) run before handing control to the Trainer
print("Starting training...")
print("This may take a while depending on dataset size and hardware")
print()

# trainer.train() returns a result object that carries the training loss
train_result = trainer.train()
final_training_loss = train_result.training_loss

print("\nTraining completed!")
print(f"Final training loss: {final_training_loss:.4f}")

In [None]:
# Evaluate Model Performance

print("Evaluating model on validation data...")
eval_result = trainer.evaluate()

print("\nValidation Results:")
# Float metrics are shown with four decimals; everything else is printed as-is.
for key, value in eval_result.items():
    if not isinstance(value, float):
        print(f"  {key}: {value}")
        continue
    print(f"  {key}: {value:.4f}")

In [None]:
# Save and Load the Fine-tuned Model

# Save the model.
# Reuse the directory configured in the training arguments instead of
# repeating the path literal, so the two cannot drift apart.
output_dir = training_args.output_dir
print(f"Saving model to {output_dir}...")
model.save_pretrained(output_dir)
# The tokenizer is saved alongside the model because its vocabulary was
# extended with [TIME]; the reloaded model's embedding size depends on it.
tokenizer.save_pretrained(output_dir)

print("Model and tokenizer saved!")
print(f"\nSaved files:")
# Sorted so the listing is stable across filesystems
for file in sorted(Path(output_dir).glob('*')):
    print(f"  - {file.name}")

# Load the model back for inference
print("\nLoading model back for inference...")
loaded_model = BertForMaskedLM.from_pretrained(output_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(output_dir)

print("Model loaded successfully!")

In [None]:
# Test Inference with Masked Language Modeling

from torch.nn.functional import softmax

# Test sentence with masks.
# The prompt must follow the training layout "<date> [TIME] <text>"
# ([CLS]/[SEP] are added by the tokenizer call); the date goes *before*
# [TIME], otherwise the model sees a layout it was never trained on.
test_sentence = "1633 [TIME] the quick brown fox jumps [MASK] the lazy dog"
print(f"Test sentence: {test_sentence}")

# Tokenize
encoded = loaded_tokenizer(test_sentence, return_tensors='pt')
print(f"\nTokenized: {loaded_tokenizer.convert_ids_to_tokens(encoded['input_ids'][0].tolist())}")

# Get predictions (no gradients needed for inference)
with torch.no_grad():
    outputs = loaded_model(**encoded)
    predictions = outputs.logits

# Find the [MASK] token position (index [1] selects positions along the sequence axis)
mask_token_index = torch.where(encoded['input_ids'] == loaded_tokenizer.mask_token_id)[1]

if len(mask_token_index) > 0:
    mask_pos = mask_token_index[0].item()
    mask_logits = predictions[0, mask_pos]
    predicted_token_id = mask_logits.argmax(dim=-1).item()
    predicted_token = loaded_tokenizer.decode([predicted_token_id])

    print(f"\nMask position: {mask_pos}")
    print(f"Predicted token: {predicted_token}")
    print(f"Predicted token ID: {predicted_token_id}")

    # Show the strongest candidates with their probabilities for context
    top_probs, top_ids = softmax(mask_logits, dim=-1).topk(5)
    print("Top 5 candidates:")
    for prob, token_id in zip(top_probs.tolist(), top_ids.tolist()):
        print(f"  {loaded_tokenizer.convert_ids_to_tokens(token_id)}: {prob:.4f}")
else:
    print("No [MASK] token found in the sentence")