<a href="https://colab.research.google.com/github/Text-Machine/data-processing-code/blob/main/colab_training_modernbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ModernBERT Pretraining on Historical Texts (Google Colab)

This notebook allows you to pretrain ModernBERT on historical text data (EEBO, ECCO, EVAN) using Google Colab's free GPU.

## Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 recommended)
2. **Upload Data**: Upload your CSV files to Google Drive or upload directly
3. **Run All Cells**: Runtime → Run all

## What this does:
- Installs required packages
- Mounts Google Drive (optional)
- Loads CSV data with columns: `author`, `place`, `date`, `page_text`
- Chunks text into 250-token segments with `<date> [TIME]` prefix
- Trains ModernBERT with masked language modeling
- Saves trained model to Google Drive

In [None]:
# Check GPU availability
# Prints the PyTorch build and, when CUDA is usable, the name and total memory
# of CUDA device 0; otherwise tells the user how to enable a GPU runtime.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1e9 to show decimal GB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ No GPU detected. Go to Runtime → Change runtime type → GPU")

In [None]:
# # Install required packages
# !pip install -q transformers datasets pandas accelerate

## Download Data from Google Drive

In [None]:
!gdown 11wfdV7j1TBv_i9XOiT8G8V4NxnJTxezz

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import logging

# Show INFO-level records so the data-loading progress messages are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

In [None]:
import os

# Working folders: one for input data, one for training artefacts.
DATA_DIR = 'data'
OUTPUT_DIR = 'output/modernbert_pretrained'

# exist_ok=True makes re-running this cell harmless when the folders exist.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Define preprocessing functions

def load_csv_as_dataset(csv_paths):
    """Load one or more CSV files into a single Hugging Face ``Dataset``.

    Each file is read with ``pandas.read_csv``; the frames are concatenated
    in the order given and re-indexed from zero.

    Args:
        csv_paths: Iterable of CSV file paths (``str`` or ``Path``).

    Returns:
        A ``datasets.Dataset`` built from the concatenated rows.

    Raises:
        ValueError: If ``csv_paths`` is empty.
    """
    # Materialise first so any iterable works and emptiness can be reported
    # clearly instead of surfacing pandas' "No objects to concatenate".
    csv_paths = list(csv_paths)
    if not csv_paths:
        raise ValueError("csv_paths is empty: no CSV files to load.")

    frames = []
    for csv_path in csv_paths:
        logger.info("Loading %s...", Path(csv_path).name)
        frame = pd.read_csv(csv_path)
        logger.info("  Rows: %d, Columns: %s", len(frame), list(frame.columns))
        frames.append(frame)

    combined_df = pd.concat(frames, ignore_index=True)
    logger.info("Total rows: %d", len(combined_df))

    return Dataset.from_pandas(combined_df)


def tokenize_and_chunk_function(examples, tokenizer, max_chunk_length=250):
    """Tokenize a batch of pages and split each page into date-prefixed chunks.

    Every emitted sequence has the layout
    ``[CLS] <date tokens> [TIME] <text chunk> [SEP]``. One input row can
    yield several output rows, so the function is meant to be used with
    ``Dataset.map(batched=True)`` and the original columns removed.

    Args:
        examples: Batch mapping with parallel ``"date"`` and ``"page_text"``
            lists.
        tokenizer: Tokenizer providing ``encode``, ``convert_tokens_to_ids``,
            ``cls_token_id``, ``sep_token_id`` and ``model_max_length``. Its
            vocabulary must already contain ``[TIME]``.
        max_chunk_length: Maximum number of text tokens per chunk; the date
            prefix and the special tokens are not counted.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list with
        one entry per chunk. Rows with missing text or date are skipped.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive, or ``[TIME]`` is
            not a known token of ``tokenizer``.
    """
    if max_chunk_length <= 0:
        raise ValueError(
            f"max_chunk_length must be positive, got {max_chunk_length}"
        )

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    time_id = tokenizer.convert_tokens_to_ids("[TIME]")
    # An unregistered [TIME] resolves to None or the unknown-token id; fail
    # here rather than silently training on a meaningless marker.
    if time_id is None or time_id == getattr(tokenizer, "unk_token_id", None):
        raise ValueError(
            "[TIME] is not in the tokenizer vocabulary; add it with "
            "tokenizer.add_tokens(['[TIME]']) before tokenizing."
        )
    max_len = tokenizer.model_max_length

    input_ids_list = []
    attention_masks_list = []

    for date, text in zip(examples["date"], examples["page_text"]):
        # Skip rows with no usable text or no date to prefix.
        if not text or pd.isna(text) or pd.isna(date):
            continue

        # pandas reads an integer column that has missing values as float, so
        # a year can arrive as 1650.0; render whole numbers without the ".0".
        if isinstance(date, float) and date.is_integer():
            date_str = str(int(date))
        else:
            date_str = str(date).strip()

        date_ids = tokenizer.encode(date_str, add_special_tokens=False)
        text_ids = tokenizer.encode(str(text).strip(), add_special_tokens=False)

        # Room left for text after [CLS], the date tokens, [TIME] and [SEP].
        max_text_len = max_len - (len(date_ids) + 3)
        if max_text_len <= 0:
            continue

        chunk_size = min(max_chunk_length, max_text_len)

        for start in range(0, len(text_ids), chunk_size):
            chunk = text_ids[start:start + chunk_size]
            # chunk_size already guarantees len(ids) <= max_len, so no
            # truncation is needed and [SEP] is always kept.
            ids = [cls_id] + date_ids + [time_id] + chunk + [sep_id]
            input_ids_list.append(ids)
            attention_masks_list.append([1] * len(ids))

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_masks_list,
    }


print("✓ Preprocessing functions defined")

In [None]:
# Configuration
# Run-wide settings read by the later cells.
MODEL_NAME = "answerdotai/ModernBERT-base"  # ModernBERT base model
CHUNK_LENGTH = 250
BATCH_SIZE = 16  # Reduce if you run out of memory
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_SAMPLES = None  # Set to e.g., 10000 for quick testing

# Echo the settings in one go so the run output records what was used.
print(
    "\n".join(
        [
            "Training Configuration:",
            f"  Model: {MODEL_NAME}",
            f"  Chunk length: {CHUNK_LENGTH} tokens",
            f"  Batch size: {BATCH_SIZE}",
            f"  Epochs: {EPOCHS}",
            f"  Learning rate: {LEARNING_RATE}",
            f"  Max samples: {MAX_SAMPLES or 'All'}",
        ]
    )
)

In [None]:
# Load data
# CSV files are looked up in the current working directory.
DATA_DIR = '.'
# Path.glob yields entries in filesystem-dependent order; sort so the
# concatenated row order (and therefore the MAX_SAMPLES subset and the seeded
# train/validation split) is the same on every run.
csv_files = sorted(Path(DATA_DIR).glob('*.csv'))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}. Please upload data first.")

print(f"Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name} ({size_mb:.1f} MB)")

dataset = load_csv_as_dataset(csv_files)
print(f"\nDataset loaded: {len(dataset)} rows")

# Limit samples if specified (keeps the first MAX_SAMPLES rows)
if MAX_SAMPLES and MAX_SAMPLES < len(dataset):
    dataset = dataset.select(range(MAX_SAMPLES))
    print(f"Limited to {MAX_SAMPLES} samples for testing")

In [None]:
# Load tokenizer and model
print(f"Loading ModernBERT tokenizer and model from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Add [TIME] special token
# get_vocab() is the public accessor and includes previously added tokens;
# the `.vocab` attribute is not provided uniformly by every tokenizer class.
if "[TIME]" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["[TIME]"])
    print("Added [TIME] token to vocabulary")

print(f"Vocabulary size: {len(tokenizer)}")

model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Grow the embedding matrix only when the tokenizer has outgrown it, so a
# checkpoint whose embedding table is larger than the vocabulary is never
# shrunk by this step.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
print(f"Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Preprocess data (tokenize and chunk)
print("Tokenizing and chunking text (this may take several minutes)...")

# Extra arguments go through fn_kwargs rather than a lambda wrapper, so the
# mapped callable is the named function itself.
tokenized_dataset = dataset.map(
    tokenize_and_chunk_function,
    fn_kwargs={"tokenizer": tokenizer, "max_chunk_length": CHUNK_LENGTH},
    batched=True,
    batch_size=50,
    # Each page expands into several chunks, so the original columns cannot
    # be carried over and are all dropped.
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Tokenizing and chunking",
)

print(f"Tokenized dataset size: {len(tokenized_dataset)} samples")

# Show sample
if len(tokenized_dataset) > 0:
    sample = tokenized_dataset[0]
    print("\nSample input (first 100 tokens):")
    print(tokenizer.decode(sample['input_ids'][:100]))

In [None]:
# Split train/validation
# Hold out 10% of the chunks for validation, using a fixed seed.
# NOTE(review): the split is over chunks, so chunks of the same page can end
# up on both sides — confirm this is acceptable for the evaluation.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
# Setup training
# Directory handed to the Trainer as its output location.
OUTPUT_DIR = 'modernbert_pretrained'
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    # Evaluation and checkpointing use the same schedule, which
    # load_best_model_at_end relies on.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Masks 15% of the tokens of each batch for the masked-language-modeling
# objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("✓ Trainer configured")
print(f"\nStarting training with {EPOCHS} epochs...")

In [None]:
# Train the model
# Runs the training loop with the arguments given to the Trainer; left as the
# last expression so the notebook displays the returned training summary.
trainer.train()

In [None]:
# Evaluate on validation set
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

# List every returned metric, formatted to four decimal places.
print("\nValidation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Save model
print(f"Saving model to {OUTPUT_DIR}...")
# Write the final weights and the tokenizer to OUTPUT_DIR explicitly. The
# Trainer was not given the tokenizer, so without save_pretrained the extended
# vocabulary ([TIME]) would be missing from OUTPUT_DIR and the loading
# instructions printed below would fail.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# Publish to the Hugging Face Hub.
# NOTE(review): this needs Hub credentials in the session — confirm a login
# step is run beforehand.
trainer.push_to_hub()

print("\n✓ Model saved successfully!")
print("\nTo load the model later:")
print("  from transformers import AutoModelForMaskedLM, AutoTokenizer")
print(f"  model = AutoModelForMaskedLM.from_pretrained('{OUTPUT_DIR}')")
print(f"  tokenizer = AutoTokenizer.from_pretrained('{OUTPUT_DIR}')")

## Test the Trained Model

Let's test the model with masked language modeling predictions.

In [None]:
# Test MLM predictions
from transformers import pipeline

# Build a fill-mask pipeline around the in-memory model and tokenizer.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# The same sentence under two different date prefixes.
test_sentences = [
    "1650 [TIME]  The [MASK] returned to parliament.",
    "1800 [TIME]  The [MASK] returned to parliament.",
]

print("Testing masked language model predictions:\n")
for sentence in test_sentences:
    print(f"Input: {sentence}")
    # Show the three highest-scoring fillers, ranked from 1.
    for rank, candidate in enumerate(fill_mask(sentence, top_k=3), start=1):
        print(f"  {rank}. {candidate['token_str']:>12} (score: {candidate['score']:.3f})")
    print()

## Download Model (Optional)

If you want to download the trained model to your local machine, run the cell below.

In [None]:
# Zip and download model
import shutil
from google.colab import files

# Create zip file
zip_path = '/content/modernbert_pretrained'
archive_file = f'{zip_path}.zip'  # make_archive appends the ".zip" suffix
shutil.make_archive(zip_path, 'zip', OUTPUT_DIR)

archive_size_mb = Path(archive_file).stat().st_size / 1e6
print(f"Model zipped. Size: {archive_size_mb:.1f} MB")
print("Downloading...")

# Download
files.download(archive_file)

<a href="https://colab.research.google.com/github/Text-Machine/data-processing-code/blob/main/colab_training_modernbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ModernBERT Pretraining on Historical Texts (Google Colab)

This notebook allows you to pretrain ModernBERT on historical text data (EEBO, ECCO, EVAN) using Google Colab's free GPU.

## Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 recommended)
2. **Upload Data**: Upload your CSV files to Google Drive or upload directly
3. **Run All Cells**: Runtime → Run all

## What this does:
- Installs required packages
- Mounts Google Drive (optional)
- Loads CSV data with columns: `author`, `place`, `date`, `page_text`
- Chunks text into 250-token segments with `<date> [TIME]` prefix
- Trains ModernBERT with masked language modeling
- Saves trained model to Google Drive

In [None]:
# Check GPU availability
# Prints the PyTorch build and, when CUDA is usable, the name and total memory
# of CUDA device 0; otherwise tells the user how to enable a GPU runtime.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1e9 to show decimal GB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ No GPU detected. Go to Runtime → Change runtime type → GPU")

In [None]:
# Install required packages
# !pip install -q transformers datasets pandas accelerate

## Download Data from Google Drive

In [None]:
!gdown 11wfdV7j1TBv_i9XOiT8G8V4NxnJTxezz

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import logging

# Show INFO-level records so the data-loading progress messages are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

In [None]:
import os

# Working folders: one for input data, one for training artefacts.
DATA_DIR = 'data'
OUTPUT_DIR = 'output/modernbert_pretrained'

# exist_ok=True makes re-running this cell harmless when the folders exist.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Define preprocessing functions

def load_csv_as_dataset(csv_paths):
    """Load one or more CSV files into a single Hugging Face ``Dataset``.

    Each file is read with ``pandas.read_csv``; the frames are concatenated
    in the order given and re-indexed from zero.

    Args:
        csv_paths: Iterable of CSV file paths (``str`` or ``Path``).

    Returns:
        A ``datasets.Dataset`` built from the concatenated rows.

    Raises:
        ValueError: If ``csv_paths`` is empty.
    """
    # Materialise first so any iterable works and emptiness can be reported
    # clearly instead of surfacing pandas' "No objects to concatenate".
    csv_paths = list(csv_paths)
    if not csv_paths:
        raise ValueError("csv_paths is empty: no CSV files to load.")

    frames = []
    for csv_path in csv_paths:
        logger.info("Loading %s...", Path(csv_path).name)
        frame = pd.read_csv(csv_path)
        logger.info("  Rows: %d, Columns: %s", len(frame), list(frame.columns))
        frames.append(frame)

    combined_df = pd.concat(frames, ignore_index=True)
    logger.info("Total rows: %d", len(combined_df))

    return Dataset.from_pandas(combined_df)


def tokenize_and_chunk_function(examples, tokenizer, max_chunk_length=250):
    """Tokenize a batch of pages and split each page into date-prefixed chunks.

    Every emitted sequence has the layout
    ``[CLS] <date tokens> [TIME] <text chunk> [SEP]``. One input row can
    yield several output rows, so the function is meant to be used with
    ``Dataset.map(batched=True)`` and the original columns removed.

    Args:
        examples: Batch mapping with parallel ``"date"`` and ``"page_text"``
            lists.
        tokenizer: Tokenizer providing ``encode``, ``convert_tokens_to_ids``,
            ``cls_token_id``, ``sep_token_id`` and ``model_max_length``. Its
            vocabulary must already contain ``[TIME]``.
        max_chunk_length: Maximum number of text tokens per chunk; the date
            prefix and the special tokens are not counted.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list with
        one entry per chunk. Rows with missing text or date are skipped.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive, or ``[TIME]`` is
            not a known token of ``tokenizer``.
    """
    if max_chunk_length <= 0:
        raise ValueError(
            f"max_chunk_length must be positive, got {max_chunk_length}"
        )

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    time_id = tokenizer.convert_tokens_to_ids("[TIME]")
    # An unregistered [TIME] resolves to None or the unknown-token id; fail
    # here rather than silently training on a meaningless marker.
    if time_id is None or time_id == getattr(tokenizer, "unk_token_id", None):
        raise ValueError(
            "[TIME] is not in the tokenizer vocabulary; add it with "
            "tokenizer.add_tokens(['[TIME]']) before tokenizing."
        )
    max_len = tokenizer.model_max_length

    input_ids_list = []
    attention_masks_list = []

    for date, text in zip(examples["date"], examples["page_text"]):
        # Skip rows with no usable text or no date to prefix.
        if not text or pd.isna(text) or pd.isna(date):
            continue

        # pandas reads an integer column that has missing values as float, so
        # a year can arrive as 1650.0; render whole numbers without the ".0".
        if isinstance(date, float) and date.is_integer():
            date_str = str(int(date))
        else:
            date_str = str(date).strip()

        date_ids = tokenizer.encode(date_str, add_special_tokens=False)
        text_ids = tokenizer.encode(str(text).strip(), add_special_tokens=False)

        # Room left for text after [CLS], the date tokens, [TIME] and [SEP].
        max_text_len = max_len - (len(date_ids) + 3)
        if max_text_len <= 0:
            continue

        chunk_size = min(max_chunk_length, max_text_len)

        for start in range(0, len(text_ids), chunk_size):
            chunk = text_ids[start:start + chunk_size]
            # chunk_size already guarantees len(ids) <= max_len, so no
            # truncation is needed and [SEP] is always kept.
            ids = [cls_id] + date_ids + [time_id] + chunk + [sep_id]
            input_ids_list.append(ids)
            attention_masks_list.append([1] * len(ids))

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_masks_list,
    }


print("✓ Preprocessing functions defined")

In [None]:
# Configuration
# Run-wide settings read by the later cells.
MODEL_NAME = "answerdotai/ModernBERT-base"  # ModernBERT base model
CHUNK_LENGTH = 250
BATCH_SIZE = 16  # Reduce if you run out of memory
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_SAMPLES = None  # Set to e.g., 10000 for quick testing

# Echo the settings in one go so the run output records what was used.
print(
    "\n".join(
        [
            "Training Configuration:",
            f"  Model: {MODEL_NAME}",
            f"  Chunk length: {CHUNK_LENGTH} tokens",
            f"  Batch size: {BATCH_SIZE}",
            f"  Epochs: {EPOCHS}",
            f"  Learning rate: {LEARNING_RATE}",
            f"  Max samples: {MAX_SAMPLES or 'All'}",
        ]
    )
)

In [None]:
# Load data
# CSV files are looked up in the current working directory.
DATA_DIR = '.'
# Path.glob yields entries in filesystem-dependent order; sort so the
# concatenated row order (and therefore the MAX_SAMPLES subset and the seeded
# train/validation split) is the same on every run.
csv_files = sorted(Path(DATA_DIR).glob('*.csv'))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}. Please upload data first.")

print(f"Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name} ({size_mb:.1f} MB)")

dataset = load_csv_as_dataset(csv_files)
print(f"\nDataset loaded: {len(dataset)} rows")

# Limit samples if specified (keeps the first MAX_SAMPLES rows)
if MAX_SAMPLES and MAX_SAMPLES < len(dataset):
    dataset = dataset.select(range(MAX_SAMPLES))
    print(f"Limited to {MAX_SAMPLES} samples for testing")

In [None]:
# Load tokenizer and model
print(f"Loading ModernBERT tokenizer and model from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Add [TIME] special token
# get_vocab() is the public accessor and includes previously added tokens;
# the `.vocab` attribute is not provided uniformly by every tokenizer class.
if "[TIME]" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["[TIME]"])
    print("Added [TIME] token to vocabulary")

print(f"Vocabulary size: {len(tokenizer)}")

model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Grow the embedding matrix only when the tokenizer has outgrown it, so a
# checkpoint whose embedding table is larger than the vocabulary is never
# shrunk by this step.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
print(f"Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Preprocess data (tokenize and chunk)
print("Tokenizing and chunking text (this may take several minutes)...")

# Extra arguments go through fn_kwargs rather than a lambda wrapper, so the
# mapped callable is the named function itself.
tokenized_dataset = dataset.map(
    tokenize_and_chunk_function,
    fn_kwargs={"tokenizer": tokenizer, "max_chunk_length": CHUNK_LENGTH},
    batched=True,
    batch_size=50,
    # Each page expands into several chunks, so the original columns cannot
    # be carried over and are all dropped.
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Tokenizing and chunking",
)

print(f"Tokenized dataset size: {len(tokenized_dataset)} samples")

# Show sample
if len(tokenized_dataset) > 0:
    sample = tokenized_dataset[0]
    print("\nSample input (first 100 tokens):")
    print(tokenizer.decode(sample['input_ids'][:100]))

In [None]:
# Split train/validation
# Hold out 10% of the chunks for validation, using a fixed seed.
# NOTE(review): the split is over chunks, so chunks of the same page can end
# up on both sides — confirm this is acceptable for the evaluation.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
# Setup training
# Directory handed to the Trainer as its output location.
OUTPUT_DIR = 'modernbert_pretrained'
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    # Evaluation and checkpointing use the same schedule, which
    # load_best_model_at_end relies on.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Masks 15% of the tokens of each batch for the masked-language-modeling
# objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("✓ Trainer configured")
print(f"\nStarting training with {EPOCHS} epochs...")

In [None]:
# Train the model
# Runs the training loop with the arguments given to the Trainer; left as the
# last expression so the notebook displays the returned training summary.
trainer.train()

In [None]:
# Evaluate on validation set
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

# List every returned metric, formatted to four decimal places.
print("\nValidation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Save model
print(f"Saving model to {OUTPUT_DIR}...")
# Write the final weights and the tokenizer to OUTPUT_DIR explicitly. The
# Trainer was not given the tokenizer, so without save_pretrained the extended
# vocabulary ([TIME]) would be missing from OUTPUT_DIR and the loading
# instructions printed below would fail.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# Publish to the Hugging Face Hub.
# NOTE(review): this needs Hub credentials in the session — confirm a login
# step is run beforehand.
trainer.push_to_hub()

print("\n✓ Model saved successfully!")
print("\nTo load the model later:")
print("  from transformers import AutoModelForMaskedLM, AutoTokenizer")
print(f"  model = AutoModelForMaskedLM.from_pretrained('{OUTPUT_DIR}')")
print(f"  tokenizer = AutoTokenizer.from_pretrained('{OUTPUT_DIR}')")

## Test the Trained Model

Let's test the model with masked language modeling predictions.

In [None]:
# Test MLM predictions
from transformers import pipeline

# Build a fill-mask pipeline around the in-memory model and tokenizer.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# The same sentence under two different date prefixes.
test_sentences = [
    "1650 [TIME] The [MASK] returned to parliament.",
    "1800 [TIME] The [MASK] returned to parliament.",
]

print("Testing masked language model predictions:\n")
for sentence in test_sentences:
    print(f"Input: {sentence}")
    # Show the three highest-scoring fillers, ranked from 1.
    for rank, candidate in enumerate(fill_mask(sentence, top_k=3), start=1):
        print(f"  {rank}. {candidate['token_str']:>12} (score: {candidate['score']:.3f})")
    print()

## Download Model (Optional)

If you want to download the trained model to your local machine, run the cell below.

In [None]:
# Zip and download model
import shutil
from google.colab import files

# Create zip file
zip_path = '/content/modernbert_pretrained'
archive_file = f'{zip_path}.zip'  # make_archive appends the ".zip" suffix
shutil.make_archive(zip_path, 'zip', OUTPUT_DIR)

archive_size_mb = Path(archive_file).stat().st_size / 1e6
print(f"Model zipped. Size: {archive_size_mb:.1f} MB")
print("Downloading...")

# Download
files.download(archive_file)

<a href="https://colab.research.google.com/github/Text-Machine/data-processing-code/blob/main/colab_training_modernbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ModernBERT Pretraining on Historical Texts (Google Colab)

This notebook allows you to pretrain ModernBERT on historical text data (EEBO, ECCO, EVAN) using Google Colab's free GPU.

## Setup Steps:
1. **Enable GPU**: Runtime → Change runtime type → GPU (T4 recommended)
2. **Upload Data**: Upload your CSV files to Google Drive or upload directly
3. **Run All Cells**: Runtime → Run all

## What this does:
- Installs required packages
- Mounts Google Drive (optional)
- Loads CSV data with columns: `author`, `place`, `date`, `page_text`
- Chunks text into 250-token segments with `<date> [TIME]` prefix
- Trains ModernBERT with masked language modeling
- Saves trained model to Google Drive

In [None]:
# Check GPU availability
# Prints the PyTorch build and, when CUDA is usable, the name and total memory
# of CUDA device 0; otherwise tells the user how to enable a GPU runtime.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; divide by 1e9 to show decimal GB.
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ No GPU detected. Go to Runtime → Change runtime type → GPU")

In [None]:
# # Install required packages
# !pip install -q transformers datasets pandas accelerate

## Download Data from Google Drive

In [None]:
!gdown 11wfdV7j1TBv_i9XOiT8G8V4NxnJTxezz

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import logging

# Show INFO-level records so the data-loading progress messages are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

In [None]:
import os

# Working folders: one for input data, one for training artefacts.
# Update paths
DATA_DIR = 'data'
OUTPUT_DIR = 'output/modernbert_pretrained'

# exist_ok=True makes re-running this cell harmless when the folders exist.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Define preprocessing functions

def load_csv_as_dataset(csv_paths):
    """Load one or more CSV files into a single Hugging Face ``Dataset``.

    Each file is read with ``pandas.read_csv``; the frames are concatenated
    in the order given and re-indexed from zero.

    Args:
        csv_paths: Iterable of CSV file paths (``str`` or ``Path``).

    Returns:
        A ``datasets.Dataset`` built from the concatenated rows.

    Raises:
        ValueError: If ``csv_paths`` is empty.
    """
    # Materialise first so any iterable works and emptiness can be reported
    # clearly instead of surfacing pandas' "No objects to concatenate".
    csv_paths = list(csv_paths)
    if not csv_paths:
        raise ValueError("csv_paths is empty: no CSV files to load.")

    frames = []
    for csv_path in csv_paths:
        logger.info("Loading %s...", Path(csv_path).name)
        frame = pd.read_csv(csv_path)
        logger.info("  Rows: %d, Columns: %s", len(frame), list(frame.columns))
        frames.append(frame)

    combined_df = pd.concat(frames, ignore_index=True)
    logger.info("Total rows: %d", len(combined_df))

    return Dataset.from_pandas(combined_df)


def tokenize_and_chunk_function(examples, tokenizer, max_chunk_length=250):
    """Tokenize a batch of pages and split each page into date-prefixed chunks.

    Every emitted sequence has the layout
    ``[CLS] <date tokens> [TIME] <text chunk> [SEP]``. One input row can
    yield several output rows, so the function is meant to be used with
    ``Dataset.map(batched=True)`` and the original columns removed.

    Args:
        examples: Batch mapping with parallel ``"date"`` and ``"page_text"``
            lists.
        tokenizer: Tokenizer providing ``encode``, ``convert_tokens_to_ids``,
            ``cls_token_id``, ``sep_token_id`` and ``model_max_length``. Its
            vocabulary must already contain ``[TIME]``.
        max_chunk_length: Maximum number of text tokens per chunk; the date
            prefix and the special tokens are not counted.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list with
        one entry per chunk. Rows with missing text or date are skipped.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive, or ``[TIME]`` is
            not a known token of ``tokenizer``.
    """
    if max_chunk_length <= 0:
        raise ValueError(
            f"max_chunk_length must be positive, got {max_chunk_length}"
        )

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    time_id = tokenizer.convert_tokens_to_ids("[TIME]")
    # An unregistered [TIME] resolves to None or the unknown-token id; fail
    # here rather than silently training on a meaningless marker.
    if time_id is None or time_id == getattr(tokenizer, "unk_token_id", None):
        raise ValueError(
            "[TIME] is not in the tokenizer vocabulary; add it with "
            "tokenizer.add_tokens(['[TIME]']) before tokenizing."
        )
    max_len = tokenizer.model_max_length

    input_ids_list = []
    attention_masks_list = []

    for date, text in zip(examples["date"], examples["page_text"]):
        # Skip rows with no usable text or no date to prefix.
        if not text or pd.isna(text) or pd.isna(date):
            continue

        # pandas reads an integer column that has missing values as float, so
        # a year can arrive as 1650.0; render whole numbers without the ".0".
        if isinstance(date, float) and date.is_integer():
            date_str = str(int(date))
        else:
            date_str = str(date).strip()

        date_ids = tokenizer.encode(date_str, add_special_tokens=False)
        text_ids = tokenizer.encode(str(text).strip(), add_special_tokens=False)

        # Room left for text after [CLS], the date tokens, [TIME] and [SEP].
        max_text_len = max_len - (len(date_ids) + 3)
        if max_text_len <= 0:
            continue

        chunk_size = min(max_chunk_length, max_text_len)

        for start in range(0, len(text_ids), chunk_size):
            chunk = text_ids[start:start + chunk_size]
            # chunk_size already guarantees len(ids) <= max_len, so no
            # truncation is needed and [SEP] is always kept.
            ids = [cls_id] + date_ids + [time_id] + chunk + [sep_id]
            input_ids_list.append(ids)
            attention_masks_list.append([1] * len(ids))

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_masks_list,
    }


print("✓ Preprocessing functions defined")

In [None]:
# Configuration
# Run-wide settings read by the later cells.
MODEL_NAME = "answerdotai/ModernBERT-base"  # ModernBERT base model
CHUNK_LENGTH = 250
BATCH_SIZE = 16  # Reduce if you run out of memory
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_SAMPLES = None  # Set to e.g., 10000 for quick testing

# Echo the settings in one go so the run output records what was used.
print(
    "\n".join(
        [
            "Training Configuration:",
            f"  Model: {MODEL_NAME}",
            f"  Chunk length: {CHUNK_LENGTH} tokens",
            f"  Batch size: {BATCH_SIZE}",
            f"  Epochs: {EPOCHS}",
            f"  Learning rate: {LEARNING_RATE}",
            f"  Max samples: {MAX_SAMPLES or 'All'}",
        ]
    )
)

In [None]:
# Load data
# CSV files are looked up in the current working directory.
DATA_DIR = '.'
# Path.glob yields entries in filesystem-dependent order; sort so the
# concatenated row order (and therefore the MAX_SAMPLES subset and the seeded
# train/validation split) is the same on every run.
csv_files = sorted(Path(DATA_DIR).glob('*.csv'))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}. Please upload data first.")

print(f"Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    size_mb = f.stat().st_size / (1024 * 1024)
    print(f"  - {f.name} ({size_mb:.1f} MB)")

dataset = load_csv_as_dataset(csv_files)
print(f"\nDataset loaded: {len(dataset)} rows")

# Limit samples if specified (keeps the first MAX_SAMPLES rows)
if MAX_SAMPLES and MAX_SAMPLES < len(dataset):
    dataset = dataset.select(range(MAX_SAMPLES))
    print(f"Limited to {MAX_SAMPLES} samples for testing")

In [None]:
# Load tokenizer and model
print(f"Loading ModernBERT tokenizer and model from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Add [TIME] special token
# get_vocab() is the public accessor and includes previously added tokens;
# the `.vocab` attribute is not provided uniformly by every tokenizer class.
if "[TIME]" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["[TIME]"])
    print("Added [TIME] token to vocabulary")

print(f"Vocabulary size: {len(tokenizer)}")

model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
# Grow the embedding matrix only when the tokenizer has outgrown it, so a
# checkpoint whose embedding table is larger than the vocabulary is never
# shrunk by this step.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
print(f"Model loaded: {model.num_parameters():,} parameters")

In [None]:
# Preprocess data (tokenize and chunk)
print("Tokenizing and chunking text (this may take several minutes)...")

# Extra arguments go through fn_kwargs rather than a lambda wrapper, so the
# mapped callable is the named function itself.
tokenized_dataset = dataset.map(
    tokenize_and_chunk_function,
    fn_kwargs={"tokenizer": tokenizer, "max_chunk_length": CHUNK_LENGTH},
    batched=True,
    batch_size=50,
    # Remove every original column: each page expands into several chunks,
    # so the old per-page columns cannot be carried over.
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Tokenizing and chunking",
)

print(f"Tokenized dataset size: {len(tokenized_dataset)} samples")

# Show sample
if len(tokenized_dataset) > 0:
    sample = tokenized_dataset[0]
    print("\nSample input (first 100 tokens):")
    print(tokenizer.decode(sample['input_ids'][:100]))

In [None]:
# Split train/validation
# Hold out 10% of the chunks for validation, using a fixed seed.
# NOTE(review): the split is over chunks, so chunks of the same page can end
# up on both sides — confirm this is acceptable for the evaluation.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
# Setup training
# Directory handed to the Trainer as its output location.
OUTPUT_DIR = 'modernbert_pretrained'
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    #overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100,
    # Evaluation and checkpointing use the same schedule, which
    # load_best_model_at_end relies on.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Masks 15% of the tokens of each batch for the masked-language-modeling
# objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("✓ Trainer configured")
print(f"\nStarting training with {EPOCHS} epochs...")

In [None]:
# Train the model
# Runs the training loop with the arguments given to the Trainer; left as the
# last expression so the notebook displays the returned training summary.
trainer.train()

In [None]:
# Evaluate on validation set
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

# List every returned metric, formatted to four decimal places.
print("\nValidation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Save model
print(f"Saving model to {OUTPUT_DIR}...")
# Write the final weights and the tokenizer to OUTPUT_DIR explicitly. The
# Trainer was not given the tokenizer, so without save_pretrained the extended
# vocabulary ([TIME]) would be missing from OUTPUT_DIR and the loading
# instructions printed below would fail.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# Publish to the Hugging Face Hub.
# NOTE(review): this needs Hub credentials in the session — confirm a login
# step is run beforehand.
# Alternative kept from the original: push the objects individually.
# model.push_to_hub()
# tokenizer.push_to_hub()
trainer.push_to_hub()

print("\n✓ Model saved successfully!")
print("\nTo load the model later:")
print("  from transformers import AutoModelForMaskedLM, AutoTokenizer")
print(f"  model = AutoModelForMaskedLM.from_pretrained('{OUTPUT_DIR}')")
print(f"  tokenizer = AutoTokenizer.from_pretrained('{OUTPUT_DIR}')")

## Test the Trained Model

Let's test the model with masked language modeling predictions.

In [None]:
# Test MLM predictions
from transformers import pipeline

# Build a fill-mask pipeline around the in-memory model and tokenizer.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# The same sentence under two different date prefixes.
test_sentences = [
    "1650 [TIME]  The [MASK] returned to parliament.",
    "1800 [TIME]  The [MASK] returned to parliament.",
]

print("Testing masked language model predictions:\n")
for sentence in test_sentences:
    print(f"Input: {sentence}")
    # Show the three highest-scoring fillers, ranked from 1.
    for rank, candidate in enumerate(fill_mask(sentence, top_k=3), start=1):
        print(f"  {rank}. {candidate['token_str']:>12} (score: {candidate['score']:.3f})")
    print()

## Download Model (Optional)

If you want to download the trained model to your local machine, run the cell below.

In [None]:
# Zip and download model
import shutil
from google.colab import files

# Create zip file
zip_path = '/content/modernbert_pretrained'
archive_file = f'{zip_path}.zip'  # make_archive appends the ".zip" suffix
shutil.make_archive(zip_path, 'zip', OUTPUT_DIR)

archive_size_mb = Path(archive_file).stat().st_size / 1e6
print(f"Model zipped. Size: {archive_size_mb:.1f} MB")
print("Downloading...")

# Download
files.download(archive_file)