<a href="https://colab.research.google.com/github/Text-Machine/data-processing-code/blob/main/colab_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Pretraining on Historical Texts (Google Colab)

This notebook allows you to pretrain BERT on historical text data (EEBO, ECCO, EVAN) using Google Colab's free GPU.

## Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 recommended)
2. **Upload Data**: Upload your CSV files to Google Drive or upload directly
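3. **Run the cells**: Runtime → Run all, or run cells one at a time to use only one of the two data-loading options (the Option 2 upload cell overrides the paths set by the Google Drive cell)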

## What this does:
- Installs required packages
- Mounts Google Drive (optional)
- Loads CSV data with columns: `author`, `place`, `date`, `page_text`
- Chunks text into 250-token segments with `[TIME] <date>` prefix
- Trains BERT with masked language modeling
- Saves trained model to Google Drive (Option 1) or to local Colab storage (Option 2)

In [None]:
# Report the PyTorch build and whether a CUDA device is visible to this runtime
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # Training still runs on CPU, so only warn the user
    print("⚠️ No GPU detected. Go to Runtime → Change runtime type → GPU")
else:
    # Device index 0 is the first visible CUDA device
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
# Install required packages.
# The leading `!` runs the line as a shell command from the notebook; `-q` keeps pip's output quiet.
# transformers, datasets and pandas are imported by later cells; accelerate is not imported
# directly in this notebook (presumably required by the Trainer — confirm).
!pip install -q transformers datasets pandas accelerate

In [None]:
# Option 1: mount Google Drive so data can be read from it and the model saved to it
from google.colab import drive

drive.mount('/content/drive')

# Locations inside the mounted Drive (adjust these to your own folder layout)
DATA_DIR = '/content/drive/MyDrive/text-machine-data'  # folder holding the CSV files
OUTPUT_DIR = '/content/drive/MyDrive/bert-pretrained'  # folder the model is written to

# Echo both paths so a wrong folder is noticed before training starts
print(f"Data directory: {DATA_DIR}", f"Output directory: {OUTPUT_DIR}", sep="\n")

## Option 1: Use Data from Google Drive

If you have CSV files in Google Drive, use the cell above.

## Option 2: Upload Data Directly

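Run the cell below to upload CSV files directly to Colab. This cell resets `DATA_DIR` and `OUTPUT_DIR` to local Colab folders, so skip it if your data is in Google Drive (note: uploaded files and anything saved locally will be deleted when the runtime disconnects).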

In [None]:
# Option 2: Upload files directly to Colab
import os
from google.colab import files

# Uploaded CSVs are collected in a local `data` folder
os.makedirs('data', exist_ok=True)

print("Upload your CSV files (must have columns: author, place, date, page_text)")
uploaded = files.upload()

# Move each upload from the working directory into the data folder and report its size
for uploaded_name, payload in uploaded.items():
    os.rename(uploaded_name, f'data/{uploaded_name}')
    print(f"Uploaded: data/{uploaded_name} ({len(payload)/1e6:.1f} MB)")

# Point the rest of the notebook at local paths (this replaces any Drive paths set earlier)
DATA_DIR = 'data'
OUTPUT_DIR = 'output/bert_pretrained'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import BertTokenizer, BertForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import logging

# Emit INFO-level messages so the progress logged by the preprocessing helpers is visible
logging.basicConfig(level=logging.INFO)
# Logger shared by the helper functions defined in later cells
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

In [None]:
# Define preprocessing functions

def load_csv_as_dataset(csv_paths):
    """Read every CSV in *csv_paths* and return them as one Hugging Face Dataset.

    Each file is read with ``pd.read_csv``; the resulting frames are
    concatenated in the order given (with a fresh index) and converted with
    ``Dataset.from_pandas``. The row/column summary of each file and the
    combined row count are logged at INFO level.
    """
    frames = []

    for source in csv_paths:
        logger.info(f"Loading {Path(source).name}...")
        frame = pd.read_csv(source)
        logger.info(f"  Rows: {len(frame)}, Columns: {list(frame.columns)}")
        frames.append(frame)

    # ignore_index gives the merged frame one continuous 0..n-1 index
    merged = pd.concat(frames, ignore_index=True)
    logger.info(f"Total rows: {len(merged)}")

    return Dataset.from_pandas(merged)


def tokenize_and_chunk_function(examples, tokenizer, max_chunk_length=250, max_seq_length=512):
    """Tokenize a batch of pages and split each into date-prefixed chunks.

    Every kept row produces zero or more sequences of the form
    ``[CLS] [TIME] <date tokens> <text chunk> [SEP]``. The number of output
    sequences is generally different from the number of input rows, so the
    caller must drop the input columns when using this with a batched map.

    Args:
        examples: Batch mapping with parallel ``'date'`` and ``'page_text'``
            lists.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.
        max_chunk_length: Maximum number of text tokens per chunk (the date
            prefix and the special tokens are added on top of this).
        max_seq_length: Maximum total sequence length; longer sequences are
            dropped rather than truncated.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` lists, one entry
        per emitted chunk. The attention mask is all ones (no padding here).

    Raises:
        ValueError: If ``max_chunk_length`` is not positive.
    """
    if max_chunk_length <= 0:
        raise ValueError(f"max_chunk_length must be positive, got {max_chunk_length}")

    all_input_ids = []
    all_attention_masks = []

    for date, text in zip(examples['date'], examples['page_text']):
        # Skip rows with missing/empty text or a missing date
        if not text or pd.isna(text) or pd.isna(date):
            continue

        # A year column containing missing values is read as float, which would
        # otherwise be rendered as e.g. "1650.0" in the date prefix.
        if isinstance(date, float) and date.is_integer():
            date = int(date)

        date_str = str(date).strip()
        text_str = str(text).strip()

        # The prefix is identical for every chunk of this row, so build it once
        prefix_tokens = [tokenizer.cls_token, "[TIME]"] + tokenizer.tokenize(date_str)
        text_tokens = tokenizer.tokenize(text_str)

        # Create chunks: [CLS] [TIME] <date> <text_chunk> [SEP]
        for chunk_start in range(0, len(text_tokens), max_chunk_length):
            chunk_tokens = text_tokens[chunk_start:chunk_start + max_chunk_length]
            tokens = prefix_tokens + chunk_tokens + [tokenizer.sep_token]

            # Drop (do not truncate) anything longer than the model can accept
            if len(tokens) > max_seq_length:
                continue

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            all_input_ids.append(input_ids)
            all_attention_masks.append([1] * len(input_ids))

    return {
        'input_ids': all_input_ids,
        'attention_mask': all_attention_masks,
    }

print("✓ Preprocessing functions defined")

In [None]:
# Training hyperparameters used by the cells below
CHUNK_LENGTH = 250  # text tokens per chunk
BATCH_SIZE = 16  # Reduce if you run out of memory
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_SAMPLES = None  # Set to e.g., 10000 for quick testing

# Print a summary of the chosen settings in one go
summary_lines = [
    "Training Configuration:",
    f"  Chunk length: {CHUNK_LENGTH} tokens",
    f"  Batch size: {BATCH_SIZE}",
    f"  Epochs: {EPOCHS}",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Max samples: {MAX_SAMPLES or 'All'}",
]
print("\n".join(summary_lines))

In [None]:
# Load data
# Sort the matches: glob yields files in a filesystem-dependent order, and a stable
# file order keeps the row order (and therefore the MAX_SAMPLES subset and the
# seeded train/validation split) reproducible between runs.
csv_files = sorted(Path(DATA_DIR).glob('*.csv'))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}. Please upload data first.")

print(f"Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name} ({size_mb:.1f} MB)")

dataset = load_csv_as_dataset(csv_files)
print(f"\nDataset loaded: {len(dataset)} rows")

# Limit samples if specified (keeps the first MAX_SAMPLES rows)
if MAX_SAMPLES and MAX_SAMPLES < len(dataset):
    dataset = dataset.select(range(MAX_SAMPLES))
    print(f"Limited to {MAX_SAMPLES} samples for testing")

In [None]:
# Load tokenizer and model
print("Loading BERT tokenizer and model...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Add [TIME] as a *special* token.
# A plain added token is normalised by a lower-casing tokenizer (stored as "[time]"),
# so the literal "[TIME]" used when building the chunks would resolve to [UNK].
# Special tokens are stored verbatim and are never lower-cased or split.
# get_vocab() is used for the check because it also includes previously added tokens.
if "[TIME]" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["[TIME]"], special_tokens=True)
    print("Added [TIME] token to vocabulary")

# Fail early instead of silently training on [UNK] prefixes
if tokenizer.convert_tokens_to_ids("[TIME]") == tokenizer.unk_token_id:
    raise RuntimeError("[TIME] token was not registered correctly (it maps to the unknown token)")

print(f"Vocabulary size: {len(tokenizer)}")

model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Grow the embedding matrix so the newly added token has its own row
model.resize_token_embeddings(len(tokenizer))
print(f"Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Preprocess data (tokenize and chunk)
print("Tokenizing and chunking text (this may take several minutes)...")

# The chunking function returns a different number of rows than it receives, so
# every input column has to be dropped. Using dataset.column_names (rather than a
# fixed list) keeps this working when a CSV has extra columns or lacks
# author/place, which the chunking function never reads.
tokenized_dataset = dataset.map(
    tokenize_and_chunk_function,
    fn_kwargs={'tokenizer': tokenizer, 'max_chunk_length': CHUNK_LENGTH},
    batched=True,
    batch_size=1000,
    remove_columns=dataset.column_names,
    num_proc=2,
)

print(f"Tokenized dataset size: {len(tokenized_dataset)} samples")

# Show sample (decoded back to text so the [TIME] <date> prefix can be checked by eye)
if len(tokenized_dataset) > 0:
    sample = tokenized_dataset[0]
    print(f"\nSample input (first 100 tokens):")
    print(tokenizer.decode(sample['input_ids'][:100]))

In [None]:
# Hold out 10% of the chunks for validation; the fixed seed keeps the split repeatable
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training samples: {len(train_dataset)}", f"Validation samples: {len(val_dataset)}", sep="\n")

In [None]:
# Setup training
import inspect

# The notebook installs an unpinned transformers, and the keyword for the evaluation
# schedule was renamed from `evaluation_strategy` to `eval_strategy` in newer releases.
# Inspect the installed signature and use whichever name it accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

_training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",  # no external experiment trackers
)
_training_kwargs[_eval_strategy_key] = "epoch"

# NOTE(review): passed only when the installed version still accepts it; some newer
# releases may have dropped this argument — confirm against the installed version.
if "overwrite_output_dir" in _training_arg_names:
    _training_kwargs["overwrite_output_dir"] = True

training_args = TrainingArguments(**_training_kwargs)

# Applies random masking to 15% of tokens when each batch is assembled
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("✓ Trainer configured")
print(f"\nStarting training with {EPOCHS} epochs...")

In [None]:
# Train the model using the Trainer configured in the previous cell
# (its arguments request evaluation and a checkpoint once per epoch).
trainer.train()

In [None]:
# Run one evaluation pass over the validation split and list every reported metric
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

print("\nValidation Results:")
for metric_name in eval_results:
    metric_value = eval_results[metric_name]
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Write the model weights and the tokenizer files to OUTPUT_DIR
print(f"Saving model to {OUTPUT_DIR}...")
for artifact in (model, tokenizer):
    # Model first, then tokenizer; both go to the same directory
    artifact.save_pretrained(OUTPUT_DIR)

print("\n✓ Model saved successfully!")

# Show a copy-pasteable snippet for reloading the saved files
print("\nTo load the model later:")
print("  from transformers import BertForMaskedLM, BertTokenizer")
print(f"  model = BertForMaskedLM.from_pretrained('{OUTPUT_DIR}')")
print(f"  tokenizer = BertTokenizer.from_pretrained('{OUTPUT_DIR}')")

## Test the Trained Model

Let's test the model with masked language modeling predictions.

In [None]:
# Quick check: ask the trained model to fill in masked words
from transformers import pipeline

# Reuse the in-memory model and tokenizer instead of reloading from disk
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# Each sentence carries a [TIME] <year> prefix and exactly one [MASK]
test_sentences = [
    "[TIME] 1650 The king [MASK] to parliament.",
    "[TIME] 1700 The book was [MASK] in London.",
    "[TIME] 1600 He was a [MASK] man.",
]

print("Testing masked language model predictions:\n")
for sentence in test_sentences:
    print(f"Input: {sentence}")
    # Top three candidates, printed with a 1-based rank
    predictions = fill_mask(sentence, top_k=3)
    for rank, candidate in enumerate(predictions, start=1):
        print(f"  {rank}. {candidate['token_str']:>12} (score: {candidate['score']:.3f})")
    print()

## Download Model (Optional)

If you want to download the trained model to your local machine, run the cell below.

In [None]:
# Bundle the saved model directory into a zip archive and send it to the browser
import shutil
from google.colab import files

# make_archive appends ".zip" to this base path
zip_path = '/content/bert_pretrained'
shutil.make_archive(zip_path, 'zip', OUTPUT_DIR)

archive_name = f'{zip_path}.zip'
archive_bytes = Path(archive_name).stat().st_size
print(f"Model zipped. Size: {archive_bytes / 1e6:.1f} MB")
print("Downloading...")

# Triggers the download in the browser
files.download(archive_name)