In [1]:
import sys
import os
import random
import pickle
import logging
import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from peft import get_peft_model, LoraConfig, TaskType
from transformers import BertForMaskedLM, BertTokenizerFast, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, random_split, Dataset, TensorDataset
from preprocessing import make_delayed, downsample_word_vectors
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt


# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Add the root project folder to sys.path (so ridge_utils becomes importable)
project_root = os.path.abspath('..')  # moves up from 'code/'
sys.path.append(project_root)

# Load the raw_text.pkl file
path_to_data = '../data'
with open(f'{path_to_data}/raw_text.pkl', 'rb') as f:
    raw_text = pickle.load(f)

In [2]:
# Define valid stories
all_stories = list(raw_text.keys())
exclude = [
    'dialogue1',
    'dialogue2',
    'dialogue3',
    'dialogue4',
    'dialogue5',
    'dialogue6',
    'myfirstdaywiththeyankees',
    'onlyonewaytofindout'
]
valid_stories = [story for story in all_stories if story not in exclude]

# Split train/test stories
all_stories = valid_stories
random.shuffle(all_stories)
split_idx = int(0.7 * len(all_stories))
train_stories = all_stories[:split_idx]
test_stories = all_stories[split_idx:]
print(f"Train stories: {len(train_stories)}")
print(f"Test stories: {len(test_stories)}")
train_stories.sort()
test_stories.sort()

Train stories: 70
Test stories: 31


In [3]:
# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Simple function to prepare MLM data
def prepare_mlm_data(stories, tokenizer, max_length=512):
    texts = [" ".join(raw_text[story].data) for story in stories]
    inputs = []
    labels = []
    for text in texts:
        encoding = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        label_ids = input_ids.clone()
        rand = torch.rand(input_ids.shape)
        mask_arr = (rand < 0.15) * (input_ids != 101) * (input_ids != 102) * (input_ids != 0)
        selection = torch.flatten(mask_arr.nonzero())
        input_ids[selection] = 103  # [MASK] token
        inputs.append(input_ids)
        labels.append(label_ids)
    return torch.stack(inputs), torch.stack(labels)

In [7]:
# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["key", "query", "value"]
)

# Hyperparameters
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4]
batch_sizes = [4, 8, 16]
epochs = 10
gradient_accumulation_steps = 4
max_grad_norm = 1.0
early_stopping_patience = 5
save_dir = "./best_lora"
os.makedirs(save_dir, exist_ok=True)

# Track best model
best_val_loss = float("inf")
best_hyperparameters = {}

In [8]:
# Grid search
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        print(f"\nTraining with Learning Rate: {learning_rate} and Batch Size: {batch_size}")

        # Prepare data
        train_inputs, train_labels = prepare_mlm_data(train_stories, tokenizer)
        val_inputs, val_labels = prepare_mlm_data(test_stories, tokenizer)

        # Create dataloaders
        train_dataset = TensorDataset(train_inputs, train_labels)
        val_dataset = TensorDataset(val_inputs, val_labels)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        model = get_peft_model(model, lora_config)
        model.to(device)

        # Optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * epochs // gradient_accumulation_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps
        )

        # Training loop
        train_losses = []
        val_losses = []
        no_improvement_epochs = 0
        for epoch in range(epochs):
            model.train()
            epoch_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=False)
            for step, (input_ids, labels) in enumerate(progress_bar):
                input_ids = input_ids.to(device)
                labels = labels.to(device)

                outputs = model(input_ids=input_ids, labels=labels)
                loss = outputs.loss / gradient_accumulation_steps
                loss.backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                epoch_loss += loss.item() * gradient_accumulation_steps
                if (step + 1) % 10 == 0:
                    progress_bar.set_postfix({"Train Loss": loss.item() * gradient_accumulation_steps})

            avg_train_loss = epoch_loss / len(train_loader)
            train_losses.append(avg_train_loss)
            print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

            # Validation
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for input_ids, labels in val_loader:
                    input_ids = input_ids.to(device)
                    labels = labels.to(device)
                    outputs = model(input_ids=input_ids, labels=labels)
                    val_loss += outputs.loss.item()

            avg_val_loss = val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
            print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}")

            # Early stopping
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                no_improvement_epochs = 0
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
                best_hyperparameters = {'learning_rate': learning_rate, 'batch_size': batch_size}
                print(f"Saving new best model with validation loss: {best_val_loss:.4f}")
            else:
                no_improvement_epochs += 1
                if no_improvement_epochs >= early_stopping_patience:
                    print(f"Early stopping triggered after {early_stopping_patience} epochs.")
                    break

        # Plot losses
        plt.figure(figsize=(10, 6))
        sns.set_style("whitegrid")
        epochs_range = range(1, len(train_losses) + 1)
        sns.lineplot(x=epochs_range, y=train_losses, label="Training Loss", marker="o", color="blue")
        sns.lineplot(x=epochs_range, y=val_losses, label="Validation Loss", marker="x", color="orange")
        plt.title(f"Training and Validation Loss (LR: {learning_rate}, Batch Size: {batch_size})")
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(f"{save_dir}/losses_lr_{learning_rate}_bs_{batch_size}.png")
        plt.close()

print("\nTraining complete!")
print(f"Best model saved at {save_dir}")
print(f"Best Hyperparameters: {best_hyperparameters}")


Training with Learning Rate: 1e-05 and Batch Size: 4


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6593
Epoch 1 - Validation Loss: 0.5862
Saving new best model with validation loss: 0.5862


Training Epoch 2:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6684
Epoch 2 - Validation Loss: 0.5839
Saving new best model with validation loss: 0.5839


Training Epoch 3:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6579
Epoch 3 - Validation Loss: 0.5818
Saving new best model with validation loss: 0.5818


Training Epoch 4:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6538
Epoch 4 - Validation Loss: 0.5799
Saving new best model with validation loss: 0.5799


Training Epoch 5:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6554
Epoch 5 - Validation Loss: 0.5783
Saving new best model with validation loss: 0.5783


Training Epoch 6:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 6 - Training Loss: 0.6521
Epoch 6 - Validation Loss: 0.5769
Saving new best model with validation loss: 0.5769


Training Epoch 7:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 7 - Training Loss: 0.6497
Epoch 7 - Validation Loss: 0.5757
Saving new best model with validation loss: 0.5757


Training Epoch 8:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 8 - Training Loss: 0.6478
Epoch 8 - Validation Loss: 0.5747
Saving new best model with validation loss: 0.5747


Training Epoch 9:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 9 - Training Loss: 0.6488
Epoch 9 - Validation Loss: 0.5740
Saving new best model with validation loss: 0.5740


Training Epoch 10:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 10 - Training Loss: 0.6433
Epoch 10 - Validation Loss: 0.5735
Saving new best model with validation loss: 0.5735

Training with Learning Rate: 1e-05 and Batch Size: 8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6793
Epoch 1 - Validation Loss: 0.5632
Saving new best model with validation loss: 0.5632


Training Epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6832
Epoch 2 - Validation Loss: 0.5620
Saving new best model with validation loss: 0.5620


Training Epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6772
Epoch 3 - Validation Loss: 0.5610
Saving new best model with validation loss: 0.5610


Training Epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6756
Epoch 4 - Validation Loss: 0.5600
Saving new best model with validation loss: 0.5600


Training Epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6780
Epoch 5 - Validation Loss: 0.5592
Saving new best model with validation loss: 0.5592


Training Epoch 6:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 6 - Training Loss: 0.6723
Epoch 6 - Validation Loss: 0.5585
Saving new best model with validation loss: 0.5585


Training Epoch 7:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 7 - Training Loss: 0.6744
Epoch 7 - Validation Loss: 0.5579
Saving new best model with validation loss: 0.5579


Training Epoch 8:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 8 - Training Loss: 0.6743
Epoch 8 - Validation Loss: 0.5574
Saving new best model with validation loss: 0.5574


Training Epoch 9:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 9 - Training Loss: 0.6763
Epoch 9 - Validation Loss: 0.5571
Saving new best model with validation loss: 0.5571


Training Epoch 10:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 10 - Training Loss: 0.6728
Epoch 10 - Validation Loss: 0.5569
Saving new best model with validation loss: 0.5569

Training with Learning Rate: 1e-05 and Batch Size: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6681
Epoch 1 - Validation Loss: 0.5913


Training Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6727
Epoch 2 - Validation Loss: 0.5907


Training Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6623
Epoch 3 - Validation Loss: 0.5902


Training Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6711
Epoch 4 - Validation Loss: 0.5897


Training Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6636
Epoch 5 - Validation Loss: 0.5893
Early stopping triggered after 5 epochs.

Training with Learning Rate: 2e-05 and Batch Size: 4


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6531
Epoch 1 - Validation Loss: 0.5725


Training Epoch 2:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6476
Epoch 2 - Validation Loss: 0.5676


Training Epoch 3:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6401
Epoch 3 - Validation Loss: 0.5631


Training Epoch 4:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6316
Epoch 4 - Validation Loss: 0.5591


Training Epoch 5:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6273
Epoch 5 - Validation Loss: 0.5556
Saving new best model with validation loss: 0.5556


Training Epoch 6:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 6 - Training Loss: 0.6265
Epoch 6 - Validation Loss: 0.5525
Saving new best model with validation loss: 0.5525


Training Epoch 7:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 7 - Training Loss: 0.6153
Epoch 7 - Validation Loss: 0.5500
Saving new best model with validation loss: 0.5500


Training Epoch 8:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 8 - Training Loss: 0.6128
Epoch 8 - Validation Loss: 0.5480
Saving new best model with validation loss: 0.5480


Training Epoch 9:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 9 - Training Loss: 0.6177
Epoch 9 - Validation Loss: 0.5466
Saving new best model with validation loss: 0.5466


Training Epoch 10:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 10 - Training Loss: 0.6131
Epoch 10 - Validation Loss: 0.5457
Saving new best model with validation loss: 0.5457

Training with Learning Rate: 2e-05 and Batch Size: 8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6699
Epoch 1 - Validation Loss: 0.5809


Training Epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6697
Epoch 2 - Validation Loss: 0.5784


Training Epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6661
Epoch 3 - Validation Loss: 0.5761


Training Epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6611
Epoch 4 - Validation Loss: 0.5740


Training Epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6639
Epoch 5 - Validation Loss: 0.5722
Early stopping triggered after 5 epochs.

Training with Learning Rate: 2e-05 and Batch Size: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6678
Epoch 1 - Validation Loss: 0.5779


Training Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6707
Epoch 2 - Validation Loss: 0.5767


Training Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6641
Epoch 3 - Validation Loss: 0.5756


Training Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6550
Epoch 4 - Validation Loss: 0.5747


Training Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6669
Epoch 5 - Validation Loss: 0.5738
Early stopping triggered after 5 epochs.

Training with Learning Rate: 3e-05 and Batch Size: 4


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6506
Epoch 1 - Validation Loss: 0.6011


Training Epoch 2:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6422
Epoch 2 - Validation Loss: 0.5950


Training Epoch 3:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6336
Epoch 3 - Validation Loss: 0.5892


Training Epoch 4:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6253
Epoch 4 - Validation Loss: 0.5840


Training Epoch 5:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6253
Epoch 5 - Validation Loss: 0.5793
Early stopping triggered after 5 epochs.

Training with Learning Rate: 3e-05 and Batch Size: 8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6669
Epoch 1 - Validation Loss: 0.5610


Training Epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6573
Epoch 2 - Validation Loss: 0.5576


Training Epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6518
Epoch 3 - Validation Loss: 0.5545


Training Epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6485
Epoch 4 - Validation Loss: 0.5517


Training Epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6541
Epoch 5 - Validation Loss: 0.5493
Early stopping triggered after 5 epochs.

Training with Learning Rate: 3e-05 and Batch Size: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6716
Epoch 1 - Validation Loss: 0.5833


Training Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6627
Epoch 2 - Validation Loss: 0.5812


Training Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6611
Epoch 3 - Validation Loss: 0.5793


Training Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6622
Epoch 4 - Validation Loss: 0.5776


Training Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6702
Epoch 5 - Validation Loss: 0.5761
Early stopping triggered after 5 epochs.

Training with Learning Rate: 5e-05 and Batch Size: 4


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6398
Epoch 1 - Validation Loss: 0.5727


Training Epoch 2:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6215
Epoch 2 - Validation Loss: 0.5602


Training Epoch 3:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6075
Epoch 3 - Validation Loss: 0.5492


Training Epoch 4:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6005
Epoch 4 - Validation Loss: 0.5401
Saving new best model with validation loss: 0.5401


Training Epoch 5:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.5885
Epoch 5 - Validation Loss: 0.5329
Saving new best model with validation loss: 0.5329


Training Epoch 6:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 6 - Training Loss: 0.5784
Epoch 6 - Validation Loss: 0.5273
Saving new best model with validation loss: 0.5273


Training Epoch 7:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 7 - Training Loss: 0.5762
Epoch 7 - Validation Loss: 0.5230
Saving new best model with validation loss: 0.5230


Training Epoch 8:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 8 - Training Loss: 0.5679
Epoch 8 - Validation Loss: 0.5198
Saving new best model with validation loss: 0.5198


Training Epoch 9:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 9 - Training Loss: 0.5710
Epoch 9 - Validation Loss: 0.5175
Saving new best model with validation loss: 0.5175


Training Epoch 10:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 10 - Training Loss: 0.5651
Epoch 10 - Validation Loss: 0.5161
Saving new best model with validation loss: 0.5161

Training with Learning Rate: 5e-05 and Batch Size: 8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6874
Epoch 1 - Validation Loss: 0.5916


Training Epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6800
Epoch 2 - Validation Loss: 0.5848


Training Epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6722
Epoch 3 - Validation Loss: 0.5787


Training Epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6612
Epoch 4 - Validation Loss: 0.5734


Training Epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6569
Epoch 5 - Validation Loss: 0.5688
Early stopping triggered after 5 epochs.

Training with Learning Rate: 5e-05 and Batch Size: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6705
Epoch 1 - Validation Loss: 0.5837


Training Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6645
Epoch 2 - Validation Loss: 0.5804


Training Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6765
Epoch 3 - Validation Loss: 0.5774


Training Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6679
Epoch 4 - Validation Loss: 0.5747


Training Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6578
Epoch 5 - Validation Loss: 0.5723
Early stopping triggered after 5 epochs.

Training with Learning Rate: 0.0001 and Batch Size: 4


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6397
Epoch 1 - Validation Loss: 0.5531


Training Epoch 2:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6139
Epoch 2 - Validation Loss: 0.5317


Training Epoch 3:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.5903
Epoch 3 - Validation Loss: 0.5149
Saving new best model with validation loss: 0.5149


Training Epoch 4:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.5670
Epoch 4 - Validation Loss: 0.5040
Saving new best model with validation loss: 0.5040


Training Epoch 5:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.5526
Epoch 5 - Validation Loss: 0.4970
Saving new best model with validation loss: 0.4970


Training Epoch 6:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 6 - Training Loss: 0.5412
Epoch 6 - Validation Loss: 0.4920
Saving new best model with validation loss: 0.4920


Training Epoch 7:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 7 - Training Loss: 0.5348
Epoch 7 - Validation Loss: 0.4880
Saving new best model with validation loss: 0.4880


Training Epoch 8:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 8 - Training Loss: 0.5265
Epoch 8 - Validation Loss: 0.4846
Saving new best model with validation loss: 0.4846


Training Epoch 9:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 9 - Training Loss: 0.5192
Epoch 9 - Validation Loss: 0.4820
Saving new best model with validation loss: 0.4820


Training Epoch 10:   0%|          | 0/18 [00:00<?, ?it/s]

Epoch 10 - Training Loss: 0.5194
Epoch 10 - Validation Loss: 0.4801
Saving new best model with validation loss: 0.4801

Training with Learning Rate: 0.0001 and Batch Size: 8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6574
Epoch 1 - Validation Loss: 0.5857


Training Epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6403
Epoch 2 - Validation Loss: 0.5742


Training Epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6259
Epoch 3 - Validation Loss: 0.5638


Training Epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6146
Epoch 4 - Validation Loss: 0.5547


Training Epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6056
Epoch 5 - Validation Loss: 0.5472
Early stopping triggered after 5 epochs.

Training with Learning Rate: 0.0001 and Batch Size: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.6710
Epoch 1 - Validation Loss: 0.5896


Training Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 - Training Loss: 0.6599
Epoch 2 - Validation Loss: 0.5836


Training Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 - Training Loss: 0.6608
Epoch 3 - Validation Loss: 0.5781


Training Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 - Training Loss: 0.6500
Epoch 4 - Validation Loss: 0.5731


Training Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 - Training Loss: 0.6335
Epoch 5 - Validation Loss: 0.5687
Early stopping triggered after 5 epochs.

Training complete!
Best model saved at ./best_lora
Best Hyperparameters: {'learning_rate': 0.0001, 'batch_size': 4}
