In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import json
from tqdm.auto import tqdm
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration: which checkpoint to fine-tune and which device to run on.
#
# Plain DeBERTa-v3 checkpoints kept for reference (not used below):
#     'microsoft/deberta-v3-xsmall',
#     'microsoft/deberta-v3-small',
#     'microsoft/deberta-v3-large',
#     'microsoft/deberta-v3-base',
#

# Active checkpoint; the experiment loop passes it on as `model_name`.
MODEL_NAME = 'cross-encoder/nli-deberta-v3-base'
# Alternatives that were tried; uncomment exactly one to switch checkpoint.
# MODEL_NAME = 'cross-encoder/nli-deberta-v3-small'
# MODEL_NAME = 'microsoft/deberta-v3-base'
# MODEL_NAME = "sileod/deberta-v3-base-tasksource-nli"
# MODEL_NAME = "clagator/biobert_v1.1_pubmed_nli_sts"
# MODEL_NAME = "gsarti/scibert-nli"
# MODEL_NAME = "gsarti/biobert-nli"
# MODEL_NAME = "dmis-lab/biobert-base-cased-v1.2"  (tagged "BAD" by the author - presumably poor results; TODO confirm)
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [3]:
def debug_tokenization(model_name, primary_premise, secondary_premise, statement, premise_to_use, max_premise_length, max_input_length):
    """Print the decoded model input built from premise(s) and a statement.

    The layout is ``[CLS] premise tokens [SEP] statement tokens [SEP]``.

    Args:
        model_name: Checkpoint name handed to ``AutoTokenizer.from_pretrained``.
        primary_premise: Text of the primary premise.
        secondary_premise: Text of the secondary premise; any falsy value
            means there is none.
        statement: Hypothesis text placed after the first separator.
        premise_to_use: When equal to ``"Combined"`` the primary premise is
            used whole; any other value takes the truncation path.
        max_premise_length: Token budget for the premise part on the
            truncation path (split in half when a secondary premise exists).
        max_input_length: Maximum number of tokens in the final sequence,
            including the closing separator.

    Returns:
        None. The decoded sequence is printed.
    """
    tok = AutoTokenizer.from_pretrained(model_name)

    def to_ids(text):
        # Raw ids only; the special tokens are placed by hand further down.
        return tok.encode(text, add_special_tokens=False)

    premise_ids = to_ids(primary_premise)
    if premise_to_use != "Combined":
        # Truncation path: each premise gets the full budget, or half of it
        # when two premises have to share the space.
        per_premise_budget = int(max_premise_length / 2) if secondary_premise else max_premise_length
        premise_ids = premise_ids[:per_premise_budget]
        if secondary_premise:
            premise_ids = premise_ids + to_ids(secondary_premise)[:per_premise_budget]

    statement_ids = to_ids(statement)

    sequence = [tok.cls_token_id]
    sequence.extend(premise_ids)
    sequence.append(tok.sep_token_id)
    sequence.extend(statement_ids)

    # Keep at most max_input_length - 1 tokens, then always close with [SEP].
    sequence = sequence[:max_input_length - 1]
    sequence.append(tok.sep_token_id)

    print("Tokenized Text:", tok.decode(sequence))

# Example usage: exercises the truncation path with a single (primary) premise.
debug_tokenization(
    model_name="microsoft/deberta-v3-xsmall",
    primary_premise="Primary premise text here."*int(15),
    secondary_premise=None,  # Any falsy value means "no secondary premise"
    statement="Statement text here.",
    premise_to_use="Original_truncated",  # Only the exact value "Combined" skips premise truncation
    max_premise_length=256,
    max_input_length=512
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenized Text: [CLS] Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.Primary premise text here.[SEP] Statement text here.[SEP]


In [4]:
# 1. Data Preprocessing
class NliDataset(Dataset):
    """Map-style dataset that turns NLI examples into token-id tensors.

    Each example is encoded as
    ``[CLS] premise tokens [SEP] statement tokens [SEP]`` and labelled 1 when
    ``item["Label"] == "Entailment"``, otherwise 0.

    Args:
        data: Dict mapping an example id to a dict with at least
            ``"Statement"``, ``"Label"`` and the premise fields named below
            (plus ``"Type"`` when ``premise_combined`` is false).
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``,
            ``cls_token_id`` and ``sep_token_id``.
        premise_combined: If true, the field ``primary_premise_to_use`` already
            holds the whole premise and the secondary field is ignored.
        primary_premise_to_use: Key of the primary premise text.
        secondary_premise_to_use: Key of the secondary premise text; only read
            for items whose ``"Type"`` is ``"Comparison"``.
        max_premise_length: Lower bound on the token budget given to the
            premise part, even when the statement is long.
        max_input_length: Maximum number of tokens in the final sequence.
    """

    def __init__(self, data, tokenizer, premise_combined, primary_premise_to_use, secondary_premise_to_use, max_premise_length=256, max_input_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.premise_combined = premise_combined
        self.primary_premise_to_use = primary_premise_to_use
        self.secondary_premise_to_use = secondary_premise_to_use
        self.max_premise_length = max_premise_length
        self.max_input_length = max_input_length
        # Freeze the key order once so integer indices map to stable examples.
        self.keys = list(data.keys())

    def __len__(self):
        return len(self.keys)

    def _encode(self, text):
        """Return raw token ids; special tokens are inserted by __getitem__."""
        return self.tokenizer.encode(text, add_special_tokens=False)

    def __getitem__(self, idx):
        """Build ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        item = self.data[self.keys[idx]]

        statement_tokens = self._encode(item["Statement"])

        # Premise budget: whatever the statement leaves free (with 10 tokens of
        # slack for special tokens), but never less than max_premise_length.
        premise_budget = max(self.max_input_length - len(statement_tokens) - 10, self.max_premise_length)

        if self.premise_combined:
            # Fix: the combined premise used to be left untruncated, so a long
            # premise pushed the separator and the whole statement past
            # max_input_length and they were cut off by the final truncation.
            premise_tokens = self._encode(item[self.primary_premise_to_use])[:premise_budget]
        else:
            is_comparison = item["Type"] == "Comparison"
            # Two premises share the budget equally; a single one gets it all.
            per_premise = int(premise_budget / 2) if is_comparison else int(premise_budget)
            premise_tokens = self._encode(item[self.primary_premise_to_use])[:per_premise]
            if is_comparison:
                premise_tokens += self._encode(item[self.secondary_premise_to_use])[:per_premise]

        input_tokens = [self.tokenizer.cls_token_id] + premise_tokens + [self.tokenizer.sep_token_id] + statement_tokens

        # Hard cap: keep max_input_length - 1 tokens, then always close with [SEP].
        input_tokens = input_tokens[:self.max_input_length - 1] + [self.tokenizer.sep_token_id]

        return {
            "input_ids": torch.tensor(input_tokens),
            # No padding here (every position is real); the collator pads later.
            "attention_mask": torch.ones(len(input_tokens), dtype=torch.long),
            "labels": torch.tensor(item["Label"] == "Entailment", dtype=torch.long)
        }

def preprocess_data(tokenizer, data, premise_combined, primary_premise_to_use, secondary_premise_to_use):
    """Wrap ``data`` in an ``NliDataset`` using the dataset's default length limits.

    Note the argument order differs from ``NliDataset`` (tokenizer comes
    first here), hence the explicit keywords below.
    """
    dataset = NliDataset(
        data=data,
        tokenizer=tokenizer,
        premise_combined=premise_combined,
        primary_premise_to_use=primary_premise_to_use,
        secondary_premise_to_use=secondary_premise_to_use,
    )
    return dataset

# 2. Model Setup
def get_model(model_name, device='cpu'):
    """Load a 2-label sequence classifier from ``model_name`` and move it to ``device``.

    ``ignore_mismatched_sizes=True`` lets checkpoints whose classification
    head has a different number of labels load anyway; the mismatching head
    weights are then newly initialised (see the warning printed on load).
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )
    classifier.to(device)
    return classifier

# 3. Training and Evaluation
def train(model, train_loader, optimizer, lr_scheduler, device):
    """Run one pass over ``train_loader``, updating ``model`` after every batch.

    Args:
        model: Callable as ``model(**batch)``; the result must expose ``.loss``.
        train_loader: Iterable of dict batches whose values have ``.to(device)``.
        optimizer: Stepped once per batch.
        lr_scheduler: Stepped once per batch, right after the optimizer.
        device: Target device for every tensor in the batch.
    """
    model.train()
    progress = tqdm(train_loader, desc="Training")
    for raw_batch in progress:
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        batch_loss = model(**on_device).loss
        batch_loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

def evaluate(model, data_loader, set_name, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Callable as ``model(**batch)``; the result must expose ``.logits``.
        data_loader: Iterable of dict batches that include a ``"labels"`` entry.
        set_name: Only used in the progress-bar caption.
        device: Target device for every tensor in the batch.

    Returns:
        ``(accuracy, macro_f1, predictions, true_labels)`` where the last two
        are flat lists collected batch by batch in loader order.
    """
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for raw_batch in tqdm(data_loader, desc=f"Evaluating on {set_name}"):
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            log_probs = F.log_softmax(model(**inputs).logits, dim=1)
            predicted_classes = torch.argmax(log_probs, dim=1)
            predictions.extend(predicted_classes.cpu().numpy())
            true_labels.extend(inputs["labels"].cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    macro_f1 = f1_score(true_labels, predictions, average="macro", zero_division=0)
    return accuracy, macro_f1, predictions, true_labels

def save_predictions(data, predictions, file_path):
    """Write ``data`` to ``file_path`` as JSON with a ``"prediction"`` field per example.

    Args:
        data: Dict mapping example id to a dict of fields. It is not modified;
            each example is shallow-copied before the prediction is added.
        predictions: Sequence of class ids aligned with ``data``'s iteration
            order; 1 is written as ``"Entailment"``, anything else as
            ``"Contradiction"``.
        file_path: Destination of the JSON file (overwritten if present).

    Raises:
        ValueError: If ``predictions`` and ``data`` differ in length. Before
            this check, surplus predictions were silently dropped and missing
            ones surfaced as a bare IndexError.
    """
    if len(predictions) != len(data):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(data)} examples; they must align one-to-one."
        )

    updated_data = {
        key: dict(example, prediction="Entailment" if pred == 1 else "Contradiction")
        for (key, example), pred in zip(data.items(), predictions)
    }

    with open(file_path, 'w') as file:
        json.dump(updated_data, file, indent=4)

def save_model_and_optimizer(model, optimizer, file_path):
    """Save the model and optimizer state dicts together in one checkpoint file.

    The file holds a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``, written with ``torch.save``.
    """
    checkpoint = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, file_path)

### Load Data

In [5]:
def get_premises_text(gold, raw_data_location):
    """Attach the premise text of each example to ``gold`` in place.

    For every example, the section named by ``"Section_id"`` is read from
    ``<raw_data_location>/<Primary_id>.json``, its entries are joined with
    single spaces and stored under ``"Primary_premise"``. Examples whose
    ``"Type"`` is ``'Comparison'`` get ``"Secondary_premise"`` the same way
    from ``"Secondary_id"``.

    Args:
        gold: Dict mapping example id to the example's field dict; mutated.
        raw_data_location: Directory holding one ``<trial id>.json`` per trial.

    Returns:
        None.
    """
    # Many examples point at the same trial, so parse each file only once.
    trial_cache = {}

    def load_trial(trial_id):
        if trial_id not in trial_cache:
            trial_path = os.path.join(raw_data_location, f'{trial_id}.json')
            # Explicit UTF-8: without it the platform's locale encoding would
            # be used and non-ASCII characters could be mis-decoded.
            with open(trial_path, encoding='utf-8') as f:
                trial_cache[trial_id] = json.load(f)
        return trial_cache[trial_id]

    for example in gold.values():
        section = example["Section_id"]

        primary = load_trial(example["Primary_id"])
        example["Primary_premise"] = ' '.join(primary[section])

        if example["Type"] == 'Comparison':
            secondary = load_trial(example["Secondary_id"])
            example["Secondary_premise"] = ' '.join(secondary[section])

# Load the three dataset splits and attach the premise text to every example.
# Paths are built with os.path.join so the notebook is not tied to Windows
# path separators, and the files are read as UTF-8 rather than with the
# platform's locale encoding.
raw_data_dir = os.path.join('data', 'raw')

with open(os.path.join(raw_data_dir, 'train.json'), encoding='utf-8') as f:
    train_data = json.load(f)
with open(os.path.join(raw_data_dir, 'dev.json'), encoding='utf-8') as f:
    dev_data = json.load(f)
with open(os.path.join(raw_data_dir, 'test.json'), encoding='utf-8') as f:
    test_data = json.load(f)

# Per-trial JSON files live in the CT sub-directory.
ct_dir = os.path.join(raw_data_dir, 'CT')
get_premises_text(train_data, ct_dir)
get_premises_text(dev_data, ct_dir)
get_premises_text(test_data, ct_dir)

#### Add Zero-Shot Summarizations and Fine-Tuned Summarizations

In [6]:
def update_data_dict(new_data_dict, old_data_dict, new_key, key_to_use):
    """Copy one field from ``new_data_dict`` into ``old_data_dict`` under a new name.

    For every example id in ``new_data_dict`` that has ``key_to_use``, the
    value is stored in ``old_data_dict[id][new_key]``. Examples without that
    field are skipped. ``old_data_dict`` is mutated in place; an id missing
    from it raises ``KeyError``.
    """
    for example_id, fields in new_data_dict.items():
        if key_to_use not in fields:
            continue
        old_data_dict[example_id][new_key] = fields[key_to_use]

In [7]:
# Merge the T5 summaries (one file per split and fine-tuning step) into the
# datasets as Summarized_{Primary,Secondary}_premise_<steps>.
# One loop over the splits replaces three copy-pasted stanzas, and paths are
# built with os.path.join so they are not tied to Windows separators.
for fine_tuning_steps in [0, 2, 5, 7, 10]:
    for split_name, split_data in (("train", train_data), ("dev", dev_data), ("test", test_data)):
        summary_path = os.path.join('data', 'raw', f'summary_{split_name}_{fine_tuning_steps}.json')
        with open(summary_path) as f:
            split_summaries = json.load(f)

        for role in ("Primary", "Secondary"):
            update_data_dict(
                split_summaries,
                split_data,
                f"Summarized_{role}_premise_{fine_tuning_steps}",
                f"{role}_Premise",
            )

In [8]:
# Merge the SciFive summaries (one file per split and fine-tuning step) into
# the datasets as Scifive_{Primary,Secondary}_premise_<steps>.
# Fix: dev and test used the misspelled key "Scifive_Seconday_premise_<steps>"
# while train used "Scifive_Secondary_premise_<steps>"; all three splits now
# share the correctly spelled key.
for fine_tuning_steps in [0, 2, 5, 7]:
    for split_name, split_data in (("train", train_data), ("dev", dev_data), ("test", test_data)):
        summary_path = os.path.join('data', 'raw', f'scifive_{split_name}_{fine_tuning_steps}.json')
        with open(summary_path) as f:
            split_summaries = json.load(f)

        for role in ("Primary", "Secondary"):
            update_data_dict(
                split_summaries,
                split_data,
                f"Scifive_{role}_premise_{fine_tuning_steps}",
                f"{role}_Premise",
            )

In [9]:
# Merge the "combined" summaries into the datasets as
# Combined_Primary_premise_<steps>. Only the "Primary_Premise" field is
# copied, so there is no secondary counterpart for this variant.
# Paths are built with os.path.join so they are not tied to Windows separators.
for fine_tuning_steps in [0]:
    for split_name, split_data in (("train", train_data), ("dev", dev_data), ("test", test_data)):
        summary_path = os.path.join('data', 'raw', f'combined_{split_name}_{fine_tuning_steps}.json')
        with open(summary_path) as f:
            split_summaries = json.load(f)

        update_data_dict(
            split_summaries,
            split_data,
            f"Combined_Primary_premise_{fine_tuning_steps}",
            "Primary_Premise",
        )

In [10]:
# Sanity check: display the first training example with all merged fields.
train_data[list(train_data.keys())[0]]

{'Type': 'Comparison',
 'Section_id': 'Intervention',
 'Primary_id': 'NCT01928186',
 'Secondary_id': 'NCT00684983',
 'Statement': 'All the primary trial participants do not receive any oral capecitabine, oral lapatinib ditosylate or cixutumumab IV, in conrast all the secondary trial subjects receive these.',
 'Label': 'Contradiction',
 'Primary_premise': 'INTERVENTION 1:    Diagnostic (FLT PET)   Patients with early stage, ER positive primary breast cancer undergo FLT PET scan at baseline and 1-6 weeks after the start of standard endocrine treatment. The surgery follows 1-7 days after the second FLT PET scan.   Tracer used in the FLT PET (positron emission tomography) scanning procedure: [F18] fluorothymidine.   Positron Emission Tomography: Undergo FLT PET   Laboratory Biomarker Analysis: Correlative studies - Ki67 staining of the tumor tissue in the biopsy and surgical specimen.',
 'Secondary_premise': 'INTERVENTION 1:    Arm A   Patients receive oral capecitabine twice daily on days

In [11]:
def train_eval_model(epochs, epochs_to_save, learning_rate, results_location, device, model_name):
    """Fine-tune ``model_name`` and evaluate it on train/dev/test at selected epochs.

    Besides its parameters, this function reads the following notebook-level
    globals, which must be set before it is called: ``train_data``,
    ``dev_data``, ``test_data``, ``premise_combined``,
    ``primary_premise_to_use``, ``secondary_premise_to_use``, ``batch_size``
    and ``premise_prefix``.

    Args:
        epochs: Number of training epochs.
        epochs_to_save: Container of epoch indices (0-based) after which the
            model is evaluated and predictions plus a checkpoint are written.
        learning_rate: AdamW learning rate; decayed linearly to zero with no
            warm-up over all training steps.
        results_location: Directory for prediction and checkpoint files
            (created if missing).
        device: Device the model and batches are moved to.
        model_name: Checkpoint name for both tokenizer and model.

    Returns:
        ``(best_test_f1, best_test_predictions)`` for the evaluated epoch with
        the highest test macro-F1; ``(0, None)`` if no evaluated epoch scored
        above 0.
    """
    os.makedirs(results_location, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = preprocess_data(tokenizer, train_data, premise_combined, primary_premise_to_use, secondary_premise_to_use)
    dev_dataset = preprocess_data(tokenizer, dev_data, premise_combined, primary_premise_to_use, secondary_premise_to_use)
    test_dataset = preprocess_data(tokenizer, test_data, premise_combined, primary_premise_to_use, secondary_premise_to_use)

    # Examples come out unpadded; the collator pads each batch to its longest item.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    # Dev/test stay unshuffled so predictions line up with the dicts' key order.
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

    model = get_model(model_name, device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    # Per-epoch reports are collected here but not returned or saved.
    classification_reports_dev, classification_reports_test = {}, {}

    best_test_predictions, best_test_f1 = None, 0
    for epoch in tqdm(range(epochs), desc="Training Epochs"):
        train(model, train_loader, optimizer, lr_scheduler, device)
        if epoch in epochs_to_save:
            # Fix: the train labels used to be bound to `dev_labels`; only the
            # train metrics are needed, so the extra return values are dropped.
            train_accuracy, train_f1, _, _ = evaluate(model, train_loader, "Train", device)
            dev_accuracy, dev_f1, dev_predictions, dev_labels = evaluate(model, dev_loader, "Dev", device)
            test_accuracy, test_f1, test_predictions, test_labels = evaluate(model, test_loader, "Test", device)

            print(f"Epoch {epoch} - Train Accuracy: {train_accuracy:0.5f}, \t Train F1: {train_f1:0.5f}")
            print(f"Epoch {epoch} - Dev   Accuracy: {dev_accuracy:0.5f}, \t Dev   F1: {dev_f1:0.5f}")
            print(f"Epoch {epoch} - Test  Accuracy: {test_accuracy:0.5f}, \t Test  F1: {test_f1:0.5f}")
            classification_reports_dev[epoch] = classification_report(dev_labels, dev_predictions, output_dict=True, target_names=['Contradiction', 'Entailment'], zero_division=0)
            classification_reports_test[epoch] = classification_report(test_labels, test_predictions, output_dict=True, target_names=['Contradiction', 'Entailment'], zero_division=0)

            # NOTE(review): file names contain only `premise_prefix`, so runs
            # that share a prefix (e.g. different fine-tuning steps) overwrite
            # each other's files. Left unchanged to keep existing paths valid.
            dev_save_path = os.path.join(results_location, f"dev_predictions_epoch{epoch}_{premise_prefix}.json")
            test_save_path = os.path.join(results_location, f"test_predictions_epoch{epoch}_{premise_prefix}.json")
            model_save_path = os.path.join(results_location, f"model_optimizer_epoch{epoch}_{premise_prefix}.pt")

            save_predictions(dev_data, dev_predictions, dev_save_path)
            save_predictions(test_data, test_predictions, test_save_path)
            save_model_and_optimizer(model, optimizer, model_save_path)

            # NOTE(review): the best epoch is chosen by test F1, which makes
            # the reported test score optimistic; selecting on dev F1 would be
            # the usual protocol. Behaviour kept as-is for comparability.
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                best_test_predictions = test_predictions
    return best_test_f1, best_test_predictions
    

In [12]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper function to prepare data for vectorization
def prepare_data(data):
    """Return one premise string per example, in ``data``'s iteration order.

    Each string is the example's ``'Primary_premise'``, followed by a space
    and its ``'Secondary_premise'`` when that field is present.
    """
    texts = []
    for example in data.values():
        parts = [example['Primary_premise']]
        if 'Secondary_premise' in example:
            parts.append(example['Secondary_premise'])
        texts.append(' '.join(parts))
    return texts

# Build one combined premise string per example for each split.
# Only train_texts is used in this cell (to fit the vectorizer).
train_texts = prepare_data(train_data)
dev_texts = prepare_data(dev_data)
test_texts = prepare_data(test_data)

# Fit the TF-IDF vocabulary and IDF weights on the training premises only,
# so the dev and test texts do not influence the statistics.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_texts)

# Helper function to summarize a text
def summarize_text(text, vectorizer, max_words=256):
    """Extractive summary: keep the highest-scoring sentences in original order.

    Each sentence is scored by the sum of its TF-IDF weights under
    ``vectorizer``. The top 30% of sentences (rounded down) are kept and
    joined with single spaces, then cut to ``max_words`` space-separated
    words.

    Special cases:
        * No sentences (e.g. empty text): ``text`` is returned unchanged
          instead of passing an empty batch to the vectorizer.
        * Exactly one sentence: it is returned as-is, without the
          ``max_words`` cut.
        * Two or three sentences: 30% rounds down to zero, and all sentences
          are kept. The original code got this result implicitly from the
          slice ``[-0:]`` (which selects everything); it is now explicit.

    Args:
        text: Text to summarise.
        vectorizer: Fitted object whose ``transform(list_of_str)`` result
            has ``.toarray()``.
        max_words: Maximum number of space-separated words in the summary.

    Returns:
        The summary string.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return text
    if len(sentences) == 1:
        return sentences[0]

    tfidf_matrix = vectorizer.transform(sentences)
    sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

    num_sentences = int(len(sentences) * 0.3)
    if num_sentences == 0:
        # Too few sentences to take 30% of: keep them all.
        kept_indices = range(len(sentences))
    else:
        # argsort is ascending, so the tail holds the best-scoring sentences;
        # sorting the indices restores document order.
        kept_indices = sorted(np.argsort(sentence_scores)[-num_sentences:])
    top_sentences = [sentences[i] for i in kept_indices]

    summary = ' '.join(top_sentences)
    summary_words = summary.split(' ')
    if len(summary_words) > max_words:
        summary = ' '.join(summary_words[:max_words])

    return summary

# Function to process and summarize data
def process_and_summarize(data, vectorizer):
    """Add an ``'Extractive_Primary_premise'`` summary to every example in place.

    The text that gets summarised is the example's ``'Primary_premise'``,
    followed by a space and ``'Secondary_premise'`` when that field exists.
    """
    for example in data.values():
        full_premise = example['Primary_premise']
        if 'Secondary_premise' in example:
            full_premise = full_premise + ' ' + example['Secondary_premise']

        example['Extractive_Primary_premise'] = summarize_text(full_premise, vectorizer)

# Add the extractive summary field to every example of each split (in place),
# using the vectorizer fitted on the training premises.
process_and_summarize(train_data, vectorizer)
process_and_summarize(dev_data, vectorizer)
process_and_summarize(test_data, vectorizer)

In [13]:
# Resume support. 'test_final.json' is the test set annotated with the
# predictions of experiments that have already finished; the table below holds
# the test F1 recorded for those runs.
previously_recorded_f1 = {'Pred_T5_Combined_0': 0.612,
                          'Pred_T5_Separate_0': 0.6368829049367606,
                          'Pred_T5_Separate_2': 0.6311501699916607,
                          'Pred_T5_Separate_5': 0.6152551339393064,
                          'Pred_T5_Separate_7': 0.5177701692626706,
                          'Pred_T5_Separate_10': 0.617961796179618,
                          'Pred_Truncated': 0.503923103529883,
                          'Pred_Extractive': 0.5458404074702886}

if os.path.exists('test_final.json'):
    # NOTE: this replaces the test_data built by the cells above.
    with open('test_final.json', 'r') as f:
        test_data = json.load(f)
    # Count an experiment as done only if its predictions really are in the
    # loaded file; otherwise it would be skipped with nothing to show for it.
    experiments_f1_dict = {
        label: score
        for label, score in previously_recorded_f1.items()
        if all(label in example for example in test_data.values())
    }
else:
    # Fresh run: no cached predictions, so every experiment has to be run.
    # (Previously this cell raised FileNotFoundError on a clean checkout.)
    experiments_f1_dict = {}

In [14]:
# Sanity check: list the fields of the first test example, including any
# Pred_* columns that are present.
test_data[list(test_data.keys())[0]].keys()

dict_keys(['Type', 'Section_id', 'Primary_id', 'Secondary_id', 'Statement', 'Label', 'Primary_premise', 'Secondary_premise', 'Summarized_Primary_premise_0', 'Summarized_Secondary_premise_0', 'Summarized_Primary_premise_2', 'Summarized_Secondary_premise_2', 'Summarized_Primary_premise_5', 'Summarized_Secondary_premise_5', 'Summarized_Primary_premise_7', 'Summarized_Secondary_premise_7', 'Summarized_Primary_premise_10', 'Summarized_Secondary_premise_10', 'Scifive_Primary_premise_0', 'Scifive_Seconday_premise_0', 'Scifive_Primary_premise_2', 'Scifive_Seconday_premise_2', 'Scifive_Primary_premise_5', 'Scifive_Seconday_premise_5', 'Scifive_Primary_premise_7', 'Scifive_Seconday_premise_7', 'Combined_Primary_premise_0', 'Pred_T5_Combined_0', 'Pred_T5_Separate_0', 'Pred_T5_Separate_2', 'Pred_T5_Separate_5', 'Pred_T5_Separate_7', 'Pred_T5_Separate_10', 'Pred_Truncated', 'Extractive_Primary_premise', 'Pred_Extractive'])

In [15]:
# experiments_f1_dict = {}

# Experiment grid, in execution order. Each row is
#   (premise_prefix, fine-tuning suffixes to run, label template, premise_combined)
# where premise_combined=True means the primary field already holds the whole
# premise and the secondary field is ignored by the dataset.
experiment_grid = [
    ("Combined_", ["_0"], "T5_Combined{suffix}", True),
    ("Summarized_", ["_0", "_2", "_5", "_7", "_10"], "T5_Separate{suffix}", False),
    ("", [""], "Truncated", False),
    ("Extractive_", [""], "Extractive", True),
    ("Scifive_", ["_0", "_2", "_5", "_7"], "SciFive{suffix}", True),
]

# NOTE: premise_prefix, premise_combined, primary_premise_to_use,
# secondary_premise_to_use and batch_size are read as globals inside
# train_eval_model, so they must keep these exact names.
for premise_prefix, allowed_suffixes, label_template, premise_combined in experiment_grid:
    for fine_tuning_steps_suffix in allowed_suffixes:
        label_name = "Pred_" + label_template.format(suffix=fine_tuning_steps_suffix)
        print(label_name)

        # Skip experiments whose result is already recorded.
        if label_name in experiments_f1_dict:
            continue

        primary_premise_to_use = f"{premise_prefix}Primary_premise{fine_tuning_steps_suffix}"
        secondary_premise_to_use = f"{premise_prefix}Secondary_premise{fine_tuning_steps_suffix}"

        epochs = 40
        epochs_to_save = np.arange(epochs)  # evaluate and checkpoint after every epoch
        # Every configuration in the grid ends up at 5e-5: the former 4e-5
        # default for non-combined runs was always overridden.
        learning_rate = 5e-5
        batch_size = 16
        if "Truncated" in label_name:
            batch_size = 12
            print(f"batch_size: {batch_size}")
        results_location = "results"  # Replace with your location
        device = DEVICE
        model_name = MODEL_NAME

        best_test_f1, best_test_predictions = train_eval_model(epochs, epochs_to_save, learning_rate, results_location, device, model_name)

        experiments_f1_dict[label_name] = best_test_f1

        print(f"Best Test F1: {best_test_f1:0.5f}")
        # Predictions follow test_data's key order (the test loader is unshuffled).
        for key_idx, key in enumerate(test_data):
            test_data[key][label_name] = "Entailment" if best_test_predictions[key_idx] else "Contradiction"

        # Persist after every experiment so an interrupted run can resume.
        with open('test_final.json', 'w') as f:
            json.dump(test_data, f)

Pred_T5_Combined_0
Pred_T5_Separate_0
Pred_T5_Separate_2
Pred_T5_Separate_5
Pred_T5_Separate_7
Pred_T5_Separate_10
Pred_Truncated
Pred_Extractive
Pred_SciFive_0


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at cross-encoder/nli-deberta-v3-base and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epochs:   0%|          | 0/40 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Training: 100%|██████████| 107/107 [00:20<00:00,  5.30it/s]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.40it/s]
Evaluating on Dev: 100%|██████████| 13/13 [

Epoch 0 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 0 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 0 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.52it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.65it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.65it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.75it/s]


Epoch 1 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 1 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 1 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.64it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.77it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.41it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.69it/s]


Epoch 2 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 2 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 2 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.57it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.52it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.47it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.76it/s]


Epoch 3 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 3 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 3 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.47it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.31it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.23it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.42it/s]


Epoch 4 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 4 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 4 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.50it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.40it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.31it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.55it/s]


Epoch 5 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 5 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 5 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.53it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.43it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.64it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.47it/s]


Epoch 6 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 6 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 6 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.57it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.65it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.68it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.86it/s]


Epoch 7 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 7 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 7 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.59it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.45it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.54it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.62it/s]


Epoch 8 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 8 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 8 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.60it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.40it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.71it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.65it/s]


Epoch 9 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 9 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 9 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.54it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.55it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.62it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.66it/s]


Epoch 10 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 10 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 10 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.57it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.54it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.49it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.66it/s]


Epoch 11 - Train Accuracy: 0.50059, 	 Train F1: 0.33464
Epoch 11 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 11 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.55it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.53it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.54it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.62it/s]


Epoch 12 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 12 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 12 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.63it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:07<00:00, 15.21it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.47it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.65it/s]


Epoch 13 - Train Accuracy: 0.50176, 	 Train F1: 0.33930
Epoch 13 - Dev   Accuracy: 0.50500, 	 Dev   F1: 0.35275
Epoch 13 - Test  Accuracy: 0.50600, 	 Test  F1: 0.34991


Training: 100%|██████████| 107/107 [00:19<00:00,  5.56it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.39it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.57it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 14.17it/s]


Epoch 14 - Train Accuracy: 0.50000, 	 Train F1: 0.38543
Epoch 14 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.35031
Epoch 14 - Test  Accuracy: 0.50000, 	 Test  F1: 0.38528


Training: 100%|██████████| 107/107 [00:19<00:00,  5.49it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.95it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.12it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


Epoch 15 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 15 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 15 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.77it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.90it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.12it/s]


Epoch 16 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 16 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 16 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.82it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.15it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.20it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.21it/s]


Epoch 17 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 17 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 17 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.75it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.93it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.29it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


Epoch 18 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 18 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 18 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.82it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.05it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.20it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.22it/s]


Epoch 19 - Train Accuracy: 0.50059, 	 Train F1: 0.33568
Epoch 19 - Dev   Accuracy: 0.50500, 	 Dev   F1: 0.34435
Epoch 19 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33686


Training: 100%|██████████| 107/107 [00:18<00:00,  5.78it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.77it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.01it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


Epoch 20 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 20 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 20 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.79it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.84it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.17it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


Epoch 21 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 21 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 21 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.70it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.91it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.21it/s]


Epoch 22 - Train Accuracy: 0.50412, 	 Train F1: 0.44685
Epoch 22 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.47207
Epoch 22 - Test  Accuracy: 0.51400, 	 Test  F1: 0.45132


Training: 100%|██████████| 107/107 [00:18<00:00,  5.76it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.84it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.95it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.08it/s]


Epoch 23 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 23 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 23 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.84it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.92it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.68it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.13it/s]


Epoch 24 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 24 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 24 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.83it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.74it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.06it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.16it/s]


Epoch 25 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 25 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 25 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.79it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.80it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.17it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.17it/s]


Epoch 26 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 26 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 26 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.73it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.08it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.17it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.13it/s]


Epoch 27 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 27 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 27 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.73it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.81it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.00it/s]


Epoch 28 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 28 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 28 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.76it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.97it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.95it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.16it/s]


Epoch 29 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 29 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 29 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.72it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.89it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.97it/s]


Epoch 30 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 30 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 30 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.71it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.71it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.04it/s]


Epoch 31 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 31 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 31 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.80it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.04it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.03it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.06it/s]


Epoch 32 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 32 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 32 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.76it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.92it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.84it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.02it/s]


Epoch 33 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 33 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 33 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.80it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.90it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.06it/s]


Epoch 34 - Train Accuracy: 0.50176, 	 Train F1: 0.33930
Epoch 34 - Dev   Accuracy: 0.50500, 	 Dev   F1: 0.36077
Epoch 34 - Test  Accuracy: 0.50400, 	 Test  F1: 0.34559


Training: 100%|██████████| 107/107 [00:18<00:00,  5.65it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.93it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.95it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.00it/s]


Epoch 35 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 35 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 35 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.73it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.80it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 19.06it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.00it/s]


Epoch 36 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 36 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 36 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:18<00:00,  5.73it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.95it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.71it/s]


Epoch 37 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 37 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 37 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.55it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.55it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.65it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.72it/s]


Epoch 38 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 38 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 38 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.54it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.63it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 18.65it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.02it/s]


Epoch 39 - Train Accuracy: 0.50000, 	 Train F1: 0.33333
Epoch 39 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 39 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training Epochs: 100%|██████████| 40/40 [20:57<00:00, 31.45s/it]


Best Test F1: 0.45132
Pred_SciFive_2


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at cross-encoder/nli-deberta-v3-base and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epochs:   0%|          | 0/40 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
# Display the F1 scores collected so far, keyed by experiment name.
# NOTE(review): presumably the best test F1 per experiment — confirm against the cell that fills this dict.
experiments_f1_dict

{'Pred_T5_Combined_0': 0.612,
 'Pred_T5_Separate_0': 0.6368829049367606,
 'Pred_T5_Separate_2': 0.6311501699916607,
 'Pred_T5_Separate_5': 0.6152551339393064,
 'Pred_T5_Separate_7': 0.5177701692626706,
 'Pred_T5_Separate_10': 0.617961796179618,
 'Pred_Truncated': 0.503923103529883,
 'Pred_Extractive': 0.5458404074702886}

In [11]:
# ---- Premise selection ----
# Prefix options: "Summarized_", "Scifive_", "Combined_" or "" (no prefix).
premise_prefix = "Combined_"
# Suffix options: "_0", "_2", "_5", "_7", "_10" or "" (no suffix).
fine_tuning_steps_suffix = "_0"
# True only for the "Combined_" prefix; forwarded to preprocess_data.
premise_combined = (premise_prefix == "Combined_")

# Names of the premise fields to read, e.g. "Combined_Primary_premise_0".
primary_premise_to_use = f"{premise_prefix}Primary_premise{fine_tuning_steps_suffix}"
secondary_premise_to_use = f"{premise_prefix}Secondary_premise{fine_tuning_steps_suffix}"

# ---- Training hyperparameters ----
epochs = 40
batch_size = 16
learning_rate = 4e-5
# Epoch indices after which metrics are computed and artifacts are written
# (here: every epoch).
epochs_to_save = np.arange(epochs)

# ---- Runtime ----
model_name = MODEL_NAME
device = DEVICE
results_location = "results"  # Replace with your location

In [12]:
# Make sure the output directory exists before anything is written to it.
os.makedirs(results_location, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess the three splits with one shared premise configuration
# (evaluated in order: train, dev, test).
train_dataset, dev_dataset, test_dataset = (
    preprocess_data(tokenizer, split, premise_combined, primary_premise_to_use, secondary_premise_to_use)
    for split in (train_data, dev_data, test_data)
)

# Collator that pads each batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Only the training loader shuffles; dev and test keep dataset order.
train_loader, dev_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=should_shuffle, collate_fn=data_collator)
    for dataset, should_shuffle in (
        (train_dataset, True),
        (dev_dataset, False),
        (test_dataset, False),
    )
)

# Fresh model plus an AdamW optimizer over all of its parameters.
model = get_model(model_name, device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at cross-encoder/nli-deberta-v3-base and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Echo the configured model identifier so the run's output records which model was used.
print(MODEL_NAME)

cross-encoder/nli-deberta-v3-base


In [14]:
# Linear learning-rate schedule sized to the whole run, with no warmup steps.
# NOTE(review): assumes train() calls lr_scheduler.step() once per batch — confirm.
num_training_steps = epochs * len(train_loader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
# Per-epoch classification reports (dict form), keyed by epoch index.
clasification_reports_dev, clasification_reports_test = {}, {}

for epoch in tqdm(range(epochs), desc="Training Epochs"):
    train(model, train_loader, optimizer, lr_scheduler, device)
    if epoch in epochs_to_save:
        # Evaluate all three splits. The train labels get their own name:
        # previously they were unpacked into `dev_labels`, which was only
        # correct because the next line immediately overwrote it.
        train_accuracy, train_f1, train_predictions, train_labels = evaluate(model, train_loader, "Train", device)
        dev_accuracy, dev_f1, dev_predictions, dev_labels = evaluate(model, dev_loader, "Dev", device)
        test_accuracy, test_f1, test_predictions, test_labels = evaluate(model, test_loader, "Test", device)

        print(f"Epoch {epoch} - Train Accuracy: {train_accuracy:0.5f}, \t Train F1: {train_f1:0.5f}")
        print(f"Epoch {epoch} - Dev   Accuracy: {dev_accuracy:0.5f}, \t Dev   F1: {dev_f1:0.5f}")
        print(f"Epoch {epoch} - Test  Accuracy: {test_accuracy:0.5f}, \t Test  F1: {test_f1:0.5f}")
        # zero_division=0 keeps the report quiet when a class is never predicted.
        clasification_reports_dev[epoch] = classification_report(dev_labels, dev_predictions, output_dict=True, target_names=['Contradiction', 'Entailment'], zero_division=0)
        clasification_reports_test[epoch] = classification_report(test_labels, test_predictions, output_dict=True, target_names=['Contradiction', 'Entailment'], zero_division=0)

        # NOTE(review): these file names contain only premise_prefix, not
        # fine_tuning_steps_suffix, so runs that differ only in the suffix
        # write to the same paths and overwrite each other.
        dev_save_path = os.path.join(results_location, f"dev_predictions_epoch{epoch}_{premise_prefix}.json")
        test_save_path = os.path.join(results_location, f"test_predictions_epoch{epoch}_{premise_prefix}.json")
        model_save_path = os.path.join(results_location, f"model_optimizer_epoch{epoch}_{premise_prefix}.pt")

        # Persist this epoch's dev/test predictions and a model+optimizer checkpoint.
        save_predictions(dev_data, dev_predictions, dev_save_path)
        save_predictions(test_data, test_predictions, test_save_path)
        save_model_and_optimizer(model, optimizer, model_save_path)

Training Epochs:   0%|          | 0/40 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Training: 100%|██████████| 107/107 [00:19<00:00,  5.36it/s]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.30it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.92it/s]


Epoch 0 - Train Accuracy: 0.53471, 	 Train F1: 0.51705
Epoch 0 - Dev   Accuracy: 0.54000, 	 Dev   F1: 0.51431
Epoch 0 - Test  Accuracy: 0.55200, 	 Test  F1: 0.53092


Training: 100%|██████████| 107/107 [00:19<00:00,  5.58it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.38it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.41it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.90it/s]


Epoch 1 - Train Accuracy: 0.50412, 	 Train F1: 0.34644
Epoch 1 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 1 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.56it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.47it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.09it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.71it/s]


Epoch 2 - Train Accuracy: 0.50059, 	 Train F1: 0.33464
Epoch 2 - Dev   Accuracy: 0.50000, 	 Dev   F1: 0.33333
Epoch 2 - Test  Accuracy: 0.50000, 	 Test  F1: 0.33333


Training: 100%|██████████| 107/107 [00:19<00:00,  5.57it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.54it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.27it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.92it/s]


Epoch 3 - Train Accuracy: 0.56588, 	 Train F1: 0.52107
Epoch 3 - Dev   Accuracy: 0.56000, 	 Dev   F1: 0.50623
Epoch 3 - Test  Accuracy: 0.53600, 	 Test  F1: 0.48596


Training: 100%|██████████| 107/107 [00:19<00:00,  5.54it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:07<00:00, 15.24it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.29it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.83it/s]


Epoch 4 - Train Accuracy: 0.62294, 	 Train F1: 0.62002
Epoch 4 - Dev   Accuracy: 0.60500, 	 Dev   F1: 0.60060
Epoch 4 - Test  Accuracy: 0.61200, 	 Test  F1: 0.60742


Training: 100%|██████████| 107/107 [00:19<00:00,  5.56it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.70it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.23it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.83it/s]


Epoch 5 - Train Accuracy: 0.65471, 	 Train F1: 0.64810
Epoch 5 - Dev   Accuracy: 0.65000, 	 Dev   F1: 0.64489
Epoch 5 - Test  Accuracy: 0.59200, 	 Test  F1: 0.58910


Training: 100%|██████████| 107/107 [00:19<00:00,  5.57it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.33it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.07it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.93it/s]


Epoch 6 - Train Accuracy: 0.66235, 	 Train F1: 0.63691
Epoch 6 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.56597
Epoch 6 - Test  Accuracy: 0.58600, 	 Test  F1: 0.53811


Training: 100%|██████████| 107/107 [00:19<00:00,  5.61it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.50it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.48it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.90it/s]


Epoch 7 - Train Accuracy: 0.73941, 	 Train F1: 0.73636
Epoch 7 - Dev   Accuracy: 0.55500, 	 Dev   F1: 0.53923
Epoch 7 - Test  Accuracy: 0.57200, 	 Test  F1: 0.56487


Training: 100%|██████████| 107/107 [00:19<00:00,  5.47it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.45it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 15.99it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.62it/s]


Epoch 8 - Train Accuracy: 0.74706, 	 Train F1: 0.73859
Epoch 8 - Dev   Accuracy: 0.63000, 	 Dev   F1: 0.61118
Epoch 8 - Test  Accuracy: 0.58200, 	 Test  F1: 0.55219


Training: 100%|██████████| 107/107 [00:19<00:00,  5.60it/s]/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 15.35it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.11it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.65it/s]


Epoch 9 - Train Accuracy: 0.84176, 	 Train F1: 0.84176
Epoch 9 - Dev   Accuracy: 0.65500, 	 Dev   F1: 0.65492
Epoch 9 - Test  Accuracy: 0.59600, 	 Test  F1: 0.59313


Training: 100%|██████████| 107/107 [00:18<00:00,  5.82it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.12it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.04it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.66it/s]


Epoch 10 - Train Accuracy: 0.86294, 	 Train F1: 0.86291
Epoch 10 - Dev   Accuracy: 0.63000, 	 Dev   F1: 0.62996
Epoch 10 - Test  Accuracy: 0.62200, 	 Test  F1: 0.62014


Training: 100%|██████████| 107/107 [00:19<00:00,  5.63it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:07<00:00, 15.04it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 15.85it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:02<00:00, 15.61it/s]


Epoch 11 - Train Accuracy: 0.87529, 	 Train F1: 0.87504
Epoch 11 - Dev   Accuracy: 0.62000, 	 Dev   F1: 0.61966
Epoch 11 - Test  Accuracy: 0.60400, 	 Test  F1: 0.59782


Training: 100%|██████████| 107/107 [00:18<00:00,  5.92it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.37it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.08it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.74it/s]


Epoch 12 - Train Accuracy: 0.90000, 	 Train F1: 0.90000
Epoch 12 - Dev   Accuracy: 0.63000, 	 Dev   F1: 0.62996
Epoch 12 - Test  Accuracy: 0.61400, 	 Test  F1: 0.61231


Training: 100%|██████████| 107/107 [00:18<00:00,  5.93it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.30it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.11it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.72it/s]


Epoch 13 - Train Accuracy: 0.92000, 	 Train F1: 0.91979
Epoch 13 - Dev   Accuracy: 0.61500, 	 Dev   F1: 0.61071
Epoch 13 - Test  Accuracy: 0.60000, 	 Test  F1: 0.59742


Training: 100%|██████████| 107/107 [00:17<00:00,  6.06it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.75it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.69it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.21it/s]


Epoch 14 - Train Accuracy: 0.94706, 	 Train F1: 0.94706
Epoch 14 - Dev   Accuracy: 0.62000, 	 Dev   F1: 0.62000
Epoch 14 - Test  Accuracy: 0.59400, 	 Test  F1: 0.59263


Training: 100%|██████████| 107/107 [00:17<00:00,  6.04it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.92it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.57it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.29it/s]


Epoch 15 - Train Accuracy: 0.94353, 	 Train F1: 0.94352
Epoch 15 - Dev   Accuracy: 0.58500, 	 Dev   F1: 0.58449
Epoch 15 - Test  Accuracy: 0.59400, 	 Test  F1: 0.59068


Training: 100%|██████████| 107/107 [00:17<00:00,  6.06it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.78it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.71it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.28it/s]


Epoch 16 - Train Accuracy: 0.95588, 	 Train F1: 0.95588
Epoch 16 - Dev   Accuracy: 0.58000, 	 Dev   F1: 0.57895
Epoch 16 - Test  Accuracy: 0.59800, 	 Test  F1: 0.59624


Training: 100%|██████████| 107/107 [00:17<00:00,  6.13it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.76it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.66it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.21it/s]


Epoch 17 - Train Accuracy: 0.95824, 	 Train F1: 0.95823
Epoch 17 - Dev   Accuracy: 0.63500, 	 Dev   F1: 0.63499
Epoch 17 - Test  Accuracy: 0.58200, 	 Test  F1: 0.57994


Training: 100%|██████████| 107/107 [00:17<00:00,  6.11it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.78it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.64it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.30it/s]


Epoch 18 - Train Accuracy: 0.96824, 	 Train F1: 0.96823
Epoch 18 - Dev   Accuracy: 0.61000, 	 Dev   F1: 0.60984
Epoch 18 - Test  Accuracy: 0.60200, 	 Test  F1: 0.60100


Training: 100%|██████████| 107/107 [00:17<00:00,  6.05it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.63it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.59it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.05it/s]


Epoch 19 - Train Accuracy: 0.96824, 	 Train F1: 0.96822
Epoch 19 - Dev   Accuracy: 0.61000, 	 Dev   F1: 0.60606
Epoch 19 - Test  Accuracy: 0.61000, 	 Test  F1: 0.59895


Training: 100%|██████████| 107/107 [00:17<00:00,  6.09it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.19it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.29it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.82it/s]


Epoch 20 - Train Accuracy: 0.97824, 	 Train F1: 0.97823
Epoch 20 - Dev   Accuracy: 0.62500, 	 Dev   F1: 0.62492
Epoch 20 - Test  Accuracy: 0.61400, 	 Test  F1: 0.61344


Training: 100%|██████████| 107/107 [00:17<00:00,  5.96it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.42it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.97it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.75it/s]


Epoch 21 - Train Accuracy: 0.97706, 	 Train F1: 0.97705
Epoch 21 - Dev   Accuracy: 0.64500, 	 Dev   F1: 0.64492
Epoch 21 - Test  Accuracy: 0.60400, 	 Test  F1: 0.60194


Training: 100%|██████████| 107/107 [00:18<00:00,  5.91it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.03it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 16.37it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.78it/s]


Epoch 22 - Train Accuracy: 0.98294, 	 Train F1: 0.98294
Epoch 22 - Dev   Accuracy: 0.64500, 	 Dev   F1: 0.64299
Epoch 22 - Test  Accuracy: 0.59600, 	 Test  F1: 0.59584


Training: 100%|██████████| 107/107 [00:18<00:00,  5.90it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.44it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.13it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.91it/s]


Epoch 23 - Train Accuracy: 0.98471, 	 Train F1: 0.98471
Epoch 23 - Dev   Accuracy: 0.61500, 	 Dev   F1: 0.61491
Epoch 23 - Test  Accuracy: 0.60400, 	 Test  F1: 0.60119


Training: 100%|██████████| 107/107 [00:18<00:00,  5.92it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.65it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.75it/s]


Epoch 24 - Train Accuracy: 0.98412, 	 Train F1: 0.98412
Epoch 24 - Dev   Accuracy: 0.58000, 	 Dev   F1: 0.57962
Epoch 24 - Test  Accuracy: 0.62800, 	 Test  F1: 0.62482


Training: 100%|██████████| 107/107 [00:18<00:00,  5.94it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.22it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.17it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.77it/s]


Epoch 25 - Train Accuracy: 0.98765, 	 Train F1: 0.98765
Epoch 25 - Dev   Accuracy: 0.59500, 	 Dev   F1: 0.59450
Epoch 25 - Test  Accuracy: 0.62200, 	 Test  F1: 0.62145


Training: 100%|██████████| 107/107 [00:18<00:00,  5.91it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.63it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.72it/s]


Epoch 26 - Train Accuracy: 0.98529, 	 Train F1: 0.98529
Epoch 26 - Dev   Accuracy: 0.61000, 	 Dev   F1: 0.60996
Epoch 26 - Test  Accuracy: 0.62600, 	 Test  F1: 0.62456


Training: 100%|██████████| 107/107 [00:18<00:00,  5.86it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.48it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.15it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 16.86it/s]


Epoch 27 - Train Accuracy: 0.98882, 	 Train F1: 0.98882
Epoch 27 - Dev   Accuracy: 0.60500, 	 Dev   F1: 0.60452
Epoch 27 - Test  Accuracy: 0.62400, 	 Test  F1: 0.62158


Training: 100%|██████████| 107/107 [00:17<00:00,  5.98it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.88it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.45it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.03it/s]


Epoch 28 - Train Accuracy: 0.98765, 	 Train F1: 0.98765
Epoch 28 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.59900
Epoch 28 - Test  Accuracy: 0.61400, 	 Test  F1: 0.61303


Training: 100%|██████████| 107/107 [00:17<00:00,  6.13it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.42it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.57it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.05it/s]


Epoch 29 - Train Accuracy: 0.98941, 	 Train F1: 0.98941
Epoch 29 - Dev   Accuracy: 0.63500, 	 Dev   F1: 0.63294
Epoch 29 - Test  Accuracy: 0.61400, 	 Test  F1: 0.61387


Training: 100%|██████████| 107/107 [00:17<00:00,  6.04it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.43it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.45it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.06it/s]


Epoch 30 - Train Accuracy: 0.98765, 	 Train F1: 0.98765
Epoch 30 - Dev   Accuracy: 0.61000, 	 Dev   F1: 0.60902
Epoch 30 - Test  Accuracy: 0.60800, 	 Test  F1: 0.60147


Training: 100%|██████████| 107/107 [00:17<00:00,  6.11it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.78it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.57it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.16it/s]


Epoch 31 - Train Accuracy: 0.99118, 	 Train F1: 0.99118
Epoch 31 - Dev   Accuracy: 0.58500, 	 Dev   F1: 0.58474
Epoch 31 - Test  Accuracy: 0.62000, 	 Test  F1: 0.61802


Training: 100%|██████████| 107/107 [00:17<00:00,  6.07it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.69it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.69it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.17it/s]


Epoch 32 - Train Accuracy: 0.99059, 	 Train F1: 0.99059
Epoch 32 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.59964
Epoch 32 - Test  Accuracy: 0.61400, 	 Test  F1: 0.61270


Training: 100%|██████████| 107/107 [00:17<00:00,  6.10it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.75it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.66it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.15it/s]


Epoch 33 - Train Accuracy: 0.99176, 	 Train F1: 0.99176
Epoch 33 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.59964
Epoch 33 - Test  Accuracy: 0.61600, 	 Test  F1: 0.61461


Training: 100%|██████████| 107/107 [00:17<00:00,  6.11it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.67it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.50it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.09it/s]


Epoch 34 - Train Accuracy: 0.99235, 	 Train F1: 0.99235
Epoch 34 - Dev   Accuracy: 0.61000, 	 Dev   F1: 0.60902
Epoch 34 - Test  Accuracy: 0.61800, 	 Test  F1: 0.61732


Training: 100%|██████████| 107/107 [00:17<00:00,  6.23it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.71it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.57it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.12it/s]


Epoch 35 - Train Accuracy: 0.99118, 	 Train F1: 0.99118
Epoch 35 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.59936
Epoch 35 - Test  Accuracy: 0.62000, 	 Test  F1: 0.61880


Training: 100%|██████████| 107/107 [00:17<00:00,  5.98it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.72it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.66it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.17it/s]


Epoch 36 - Train Accuracy: 0.99176, 	 Train F1: 0.99176
Epoch 36 - Dev   Accuracy: 0.60500, 	 Dev   F1: 0.60420
Epoch 36 - Test  Accuracy: 0.61600, 	 Test  F1: 0.61442


Training: 100%|██████████| 107/107 [00:17<00:00,  5.96it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.90it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.62it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.23it/s]


Epoch 37 - Train Accuracy: 0.99118, 	 Train F1: 0.99118
Epoch 37 - Dev   Accuracy: 0.60500, 	 Dev   F1: 0.60380
Epoch 37 - Test  Accuracy: 0.61800, 	 Test  F1: 0.61745


Training: 100%|██████████| 107/107 [00:17<00:00,  5.99it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.73it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.50it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.17it/s]


Epoch 38 - Train Accuracy: 0.99176, 	 Train F1: 0.99176
Epoch 38 - Dev   Accuracy: 0.60500, 	 Dev   F1: 0.60380
Epoch 38 - Test  Accuracy: 0.61600, 	 Test  F1: 0.61526


Training: 100%|██████████| 107/107 [00:17<00:00,  6.08it/s]s/it]
Evaluating on Train: 100%|██████████| 107/107 [00:06<00:00, 16.72it/s]
Evaluating on Dev: 100%|██████████| 13/13 [00:00<00:00, 17.76it/s]
Evaluating on Test: 100%|██████████| 32/32 [00:01<00:00, 17.20it/s]


Epoch 39 - Train Accuracy: 0.99176, 	 Train F1: 0.99176
Epoch 39 - Dev   Accuracy: 0.60000, 	 Dev   F1: 0.59855
Epoch 39 - Test  Accuracy: 0.61800, 	 Test  F1: 0.61732


Training Epochs: 100%|██████████| 40/40 [20:17<00:00, 30.44s/it]


In [15]:
# Print every stored test-set classification report, one per key, each
# followed by a blank line. Keys are presumably epoch indices -- confirm
# against the cell that fills `clasification_reports_test`.
print("Classification Reports for Test Set")
for epoch_key in clasification_reports_test.keys():
    epoch_report = clasification_reports_test[epoch_key]
    # end="\n\n" yields the same output as print(report) followed by print().
    print(epoch_report, end="\n\n")

Classification Reports for Test Set
{'Contradiction': {'precision': 0.5365168539325843, 'recall': 0.764, 'f1-score': 0.6303630363036304, 'support': 250}, 'Entailment': {'precision': 0.5902777777777778, 'recall': 0.34, 'f1-score': 0.43147208121827413, 'support': 250}, 'accuracy': 0.552, 'macro avg': {'precision': 0.563397315855181, 'recall': 0.552, 'f1-score': 0.5309175587609523, 'support': 500}, 'weighted avg': {'precision': 0.563397315855181, 'recall': 0.552, 'f1-score': 0.5309175587609523, 'support': 500}}

{'Contradiction': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 250}, 'Entailment': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 250}, 'accuracy': 0.5, 'macro avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 500}, 'weighted avg': {'precision': 0.25, 'recall': 0.5, 'f1-score': 0.3333333333333333, 'support': 500}}

{'Contradiction': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 250}, '