In [None]:
from transformers import BertTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [None]:
# Identifier of the corpus passed to `datasets.load_dataset`; kept as a
# named constant so the data source is easy to spot and swap.
DATASET_ID = "ErikCikalleshi/new_york_times_news_1987_1995"
dataset = load_dataset(DATASET_ID)

In [None]:
# Build one custom token per distinct value of the train split's `date`
# column and register those tokens with the BERT tokenizer.
#
# Deduplicate first and sort afterwards: the previous
# `list(set(sorted(...)))` discarded the sorted order again, so the token
# list (and therefore the ids assigned to the new tokens) could change
# from one kernel session to the next.
unique_dates = sorted(set(dataset['train']['date']))
custom_date_tokens = [f"<year_{d}>" for d in unique_dates]
model_name = "bert-base-uncased"
# Alias of `custom_date_tokens`; retained in case other cells refer to it.
custom_token = custom_date_tokens
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): the key accepted here depends on the installed
# `transformers` release (older releases document
# 'additional_special_tokens') -- confirm against the pinned version.
tokenizer.add_special_tokens({'extra_special_tokens' : custom_date_tokens})


In [None]:
# Data collator for Masked Language Modeling: masks tokens with
# probability 0.15 when batches are assembled.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Load the pre-trained model from the same checkpoint as the tokenizer
# (`model_name`), so the two cannot drift apart.
model = AutoModelForMaskedLM.from_pretrained(model_name)

# The tokenizer was extended with custom date tokens, so the embedding
# matrix must grow to match; otherwise the new token ids index past the
# end of the embedding table during training.
# Trailing `;` suppresses the returned module's repr in the cell output.
model.resize_token_embeddings(len(tokenizer));

In [None]:
# Hyper-parameters and checkpoint settings for the masked-LM training run.
training_options = dict(
    output_dir="./domain_pretrained_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**training_options)

# NOTE(review): `tokenized_dataset` is not created in any cell visible
# here; it has to exist (with a "train" split) before this cell is run.
train_split = tokenized_dataset["train"]

# Wire model, arguments, data and collator together.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=train_split,
    args=training_args,
    model=model,
)

# Run training; left as the last expression so the result is displayed.
trainer.train()