# BERT

In [1]:
import os
import time
from datetime import datetime
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    set_seed,
)
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Used Optuna for hyperparameter search
def hp_space_optuna(trial):
    """Sample one hyperparameter configuration from a search trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (the Optuna trial interface).

    Returns:
        dict: Sampled values keyed by hyperparameter name, in the order
        ``learning_rate``, ``num_train_epochs``, ``seed``,
        ``per_device_train_batch_size``, ``per_device_eval_batch_size``,
        ``weight_decay``.
    """
    batch_size_choices = (8, 16, 32)

    # Suggestions are requested one by one in a fixed order; the dict keys
    # are inserted in that same order.
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["seed"] = trial.suggest_categorical("seed", [42, 1234, 2021])
    for batch_key in ("per_device_train_batch_size", "per_device_eval_batch_size"):
        # Each call receives its own fresh list of choices.
        space[batch_key] = trial.suggest_categorical(batch_key, list(batch_size_choices))
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.1)
    return space

def compute_objective(metrics):
    """Return the scalar that the hyperparameter search optimizes.

    Args:
        metrics: dict of evaluation metrics; the entry ``"eval_loss"`` is read.

    Returns:
        The value stored under ``"eval_loss"`` (the validation loss, which
        the search is meant to minimize).

    Raises:
        KeyError: If ``metrics`` has no ``"eval_loss"`` entry.
    """
    validation_loss = metrics["eval_loss"]
    return validation_loss

In [3]:
# Dataset file names, keyed by split.
splits = dict(
    train='openassistant_best_replies_train.jsonl',
    test='openassistant_best_replies_eval.jsonl',
)

# Pretrained checkpoint to fine-tune and the directory for saved artifacts.
model_checkpoint = "bert-base-uncased"
output_dir = "./Models/Bert/v3/bert-finetuned-mlm"

# Baseline training settings.
seed = 42
default_num_train_epochs = 3
default_train_batch_size = default_eval_batch_size = 8
default_learning_rate, default_weight_decay = 5e-5, 0.01

# Hyperparameter-search switch and the number of trials to run.
do_hpo = True
n_hpo_trials = 10

In [4]:
def main():
    """Fine-tune the masked-language model, optionally after a hyperparameter search.

    Steps, in order: seed the RNGs, load the train/test JSONL files into
    ``Dataset`` objects, tokenize them to fixed-length inputs, build a
    ``Trainer``, optionally run ``n_hpo_trials`` search trials and apply the
    best hyperparameters, train, evaluate (printing the perplexity derived
    from ``eval_loss``), and save the model and tokenizer to ``output_dir``.

    Reads the module-level settings ``seed``, ``splits``, ``model_checkpoint``,
    ``output_dir``, ``default_num_train_epochs``, ``default_train_batch_size``,
    ``default_eval_batch_size``, ``default_learning_rate``,
    ``default_weight_decay``, ``do_hpo`` and ``n_hpo_trials``.

    Returns:
        None. Progress and metrics are printed; artifacts are written to disk.
    """
    set_seed(seed)

    # Informational only: the detected device is printed, not passed on.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    print("Loading dataset from Hugging Face...")

    train_data = pd.read_json(f"hf://datasets/timdettmers/openassistant-guanaco/{splits['train']}", lines=True)
    test_data = pd.read_json(f"hf://datasets/timdettmers/openassistant-guanaco/{splits['test']}", lines=True)

    # Only the "text" column is kept.
    train_dataset = Dataset.from_dict({"text": train_data["text"].tolist()})
    test_dataset = Dataset.from_dict({"text": test_data["text"].tolist()})

    print(f"Number of training samples: {len(train_dataset)}")
    print(f"Number of test samples:     {len(test_dataset)}")

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    def tokenize_function(examples):
        """Tokenize a batch of texts, truncating/padding each to 128 tokens."""
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128
        )

    print("Tokenizing dataset...")
    train_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    test_tokenized = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Masking for the MLM objective is applied by the collator (15% probability).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    def model_init():
        """Load a fresh copy of the pretrained checkpoint."""
        return AutoModelForMaskedLM.from_pretrained(model_checkpoint)

    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the
    # installed version before upgrading.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=100,
        num_train_epochs=default_num_train_epochs,
        per_device_train_batch_size=default_train_batch_size,
        per_device_eval_batch_size=default_eval_batch_size,
        learning_rate=default_learning_rate,
        weight_decay=default_weight_decay,
        seed=seed,
    )

    # With HPO the Trainer gets a factory (model_init); without it, a single
    # eagerly loaded model instance.
    trainer = Trainer(
        model_init=model_init if do_hpo else None,
        model=None if do_hpo else AutoModelForMaskedLM.from_pretrained(model_checkpoint),
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        data_collator=data_collator,
    )

    if do_hpo:
        print("\n=== Running hyperparameter search ===")
        # The backend is pinned because hp_space_optuna is written against
        # the Optuna trial API; relying on the default backend could select
        # a different one depending on what is installed.
        best_run = trainer.hyperparameter_search(
            direction="minimize",
            hp_space=hp_space_optuna,
            compute_objective=compute_objective,
            n_trials=n_hpo_trials,
            backend="optuna",
        )
        print("Best hyperparameters found:", best_run.hyperparameters)

        # Apply the winning configuration to the arguments used by the final run.
        for k, v in best_run.hyperparameters.items():
            setattr(trainer.args, k, v)

        # No explicit model reload here: because model_init was supplied,
        # trainer.train() starts from a new model instance on every call.
    else:
        print("Skipping hyperparameter search...")

    print("\n=== Starting training ===")
    trainer.train()

    # NOTE(review): the same split drives the search objective above and the
    # metrics reported here, so this is not an independent held-out estimate
    # when do_hpo is True.
    print("\n=== Evaluating on test dataset ===")
    eval_metrics = trainer.evaluate()
    print(f"Eval metrics: {eval_metrics}")
    if "eval_loss" in eval_metrics:
        # Perplexity is exp(loss).
        perplexity = torch.exp(torch.tensor(eval_metrics["eval_loss"]))
        print(f"Perplexity: {perplexity.item():.2f}")

    print(f"\n=== Saving final model to {output_dir} ===")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("All done!")

In [6]:
if __name__ == "__main__":
    # The printed timestamps use the wall clock for readability; the elapsed
    # time uses time.perf_counter(), which is meant for measuring intervals
    # and is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    try:
        main()
    finally:
        # Report the end time and duration even when main() raises or the
        # run is interrupted; the exception still propagates afterwards.
        end_time = time.perf_counter()
        print(f"End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        time_taken = end_time - start_time
        print(f"Time taken to execute the loop: {time_taken:.2f} seconds")



Start time: 2025-01-12 19:08:08
Using device: cpu
Loading dataset from Hugging Face...
Number of training samples: 9846
Number of test samples:     518
Tokenizing dataset...


Map: 100%|██████████| 9846/9846 [00:01<00:00, 5959.86 examples/s]
Map: 100%|██████████| 518/518 [00:00<00:00, 6076.73 examples/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[I 2025-01-12 19:08:13,377] A new study created in memory with name: no-name-bed693f6-dbbc-47ae-adfc-53ea8e58fca5



=== Running hyperparameter search ===


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss


[W 2025-01-12 19:08:23,940] Trial 0 failed with parameters: {'learning_rate': 2.6838614665669874e-05, 'num_train_epochs': 5, 'seed': 2021, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'weight_decay': 0.043818773275399986} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/alfiovavassori/Library/Mobile Documents/com~apple~CloudDocs/Documents/USI/Master/3rd Semester/Advanced Topics in Machine Learning/Project 2/ATML_2/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/alfiovavassori/Library/Mobile Documents/com~apple~CloudDocs/Documents/USI/Master/3rd Semester/Advanced Topics in Machine Learning/Project 2/ATML_2/.venv/lib/python3.12/site-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/User

KeyboardInterrupt: 