In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import time
from transformers import BertModel, BertTokenizer
import torch
from torch.optim import AdamW
from transformers import BertForSequenceClassification
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from torch.nn import CrossEntropyLoss

In [None]:
train_df = pd.read_csv('/content/drive/My Drive/task3_data/twitter-2016train-A.tsv', sep='\t', header=None)
val_df = pd.read_csv('/content/drive/My Drive/task3_data/twitter-2016dev-A.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/My Drive/task3_data/twitter-2016test-A.tsv', sep='\t', header=None)

In [None]:
# Generating labels for the texts
def produce_label(raw_data, one_hot=False):
    label = []
    for item in list(raw_data[1]):
        if item == 'negative':
            label.append(0)
        elif item == 'neutral':
            label.append(1)
        elif item == 'positive':
            label.append(2)
    label = np.array(label)
    if one_hot:
        oh_label = np.zeros((len(label), 3))
        for (i, item) in enumerate(label):
            oh_label[i, item] = 1.
        label = oh_label
    return label

In [None]:
# Tokenize input and generate input ids
def produce_input(text_data, max_length=64):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoded_batch = tokenizer(text_data, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    return encoded_batch['input_ids']

In [None]:
# Produce batch data for computing
def produce_batch_data(raw_data, batch_size=16, max_length=64, one_hot=False):
    num_length = len(raw_data)
    for n in range(0, num_length, batch_size):
        batch_data = raw_data.iloc[n:min(n+batch_size, num_length)]
        batch_label = produce_label(batch_data, one_hot)
        batch_input = [text for text in batch_data[2]]  # Assuming column 2 contains the text data
        yield batch_input, batch_label

In [None]:
# Confirms that the batch data can be sucessfully generated
for item in produce_batch_data(train_df):
    print(item)
    break

(["dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon.", "@Microsoft how about you make a system that doesn't eat my friggin discs. This is the 2nd time this has happened and I am so sick of it!", "I may be ignorant on this issue but... should we celebrate @Microsoft's parental leave changes? Doesn't the gender divide suggest... (1/2)", 'Thanks to @microsoft, I just may be switching over to @apple.', 'If I make a game as a #windows10 Universal App. Will #xboxone owners be able to download and play it in November? @majornelson @Microsoft', 'Microsoft, I may not prefer your gaming branch of business. But, you do make a damn fine operating system. #Windows10 @Microsoft', '@MikeWolf1980 @Microsoft I will be downgrading and let #Windows10 be out for almost the 1st yr b4 trying it again. #Windows10fail', '@Microsoft 2nd computer with same error!!! #Windows10fail Guess we will shelve this until SP1! http://t.co/QCcHlKuy8Q', 'Just ordered my 1st ever tablet; @Mi

In [None]:
# Check cuda availablity
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: Tesla T4


In [None]:

# Function to convert the regression prediction back to a categorical prediction
def convert_to_categorical(predictions, threshold=0.5):
    # Assuming: 0 for 'negative', 1 for 'neutral', 2 for 'positive'
    categories = []
    for pred in predictions:
        if pred < 1 - threshold:
            categories.append(0)  # Negative
        elif pred > 1 + threshold:
            categories.append(2)  # Positive
        else:
            categories.append(1)  # Neutral
    return categories

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification, AdamW
from tqdm.auto import tqdm
import numpy as np

# Assuming the following functions are defined:
# produce_batch_data, produce_input

# Function to initialize the model
def initialize_model():
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    return model.to(device)

# Training and Validation Function
def train_and_validate(train_df, val_df, device, num_epochs=10, batch_size=16):
    model = initialize_model()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_function = CrossEntropyLoss()

    for epoch in tqdm(range(num_epochs), desc='Epochs'):
        # Training
        model.train()
        total_loss = 0
        train_predictions, train_labels = [], []
        batch_iterator = produce_batch_data(train_df, batch_size=batch_size, max_length=64, one_hot=True)

        for batch_inputs, batch_labels in tqdm(batch_iterator, desc='Batches', leave=True):
            batch_input = produce_input(batch_inputs, max_length=64)
            batch_labels = torch.tensor(batch_labels).argmax(dim=1).long()

            batch_input = batch_input.to(device)
            batch_labels = batch_labels.to(device)

            model.zero_grad()

            outputs = model(batch_input)
            logits = outputs.logits
            loss = loss_function(logits, batch_labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, dim=1)
            train_predictions.extend(preds.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())

            loss.backward()
            optimizer.step()

        avg_loss = total_loss / (len(train_df) / batch_size)
        train_accuracy = accuracy_score(train_labels, train_predictions)
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_predictions, average='macro', zero_division=0)

        print(f"Training Epoch {epoch} - Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1-Score: {train_f1:.4f}")

        # Validation
        model.eval()
        val_predictions, val_labels = [], []
        val_total_loss = 0
        with torch.no_grad():
            val_batch_iterator = produce_batch_data(val_df, batch_size=batch_size, max_length=64, one_hot=True)
            for val_batch_inputs, val_batch_labels in tqdm(val_batch_iterator, desc='Batches', leave=True):
                val_batch_input = produce_input(val_batch_inputs, max_length=64)
                val_batch_labels = torch.tensor(val_batch_labels).argmax(dim=1).long()

                val_batch_input = val_batch_input.to(device)
                val_batch_labels = val_batch_labels.to(device)

                val_outputs = model(val_batch_input)
                val_logits = val_outputs.logits
                val_loss = loss_function(val_logits, val_batch_labels)
                val_total_loss += val_loss.item()

                _, val_preds = torch.max(val_logits, dim=1)
                val_predictions.extend(val_preds.cpu().numpy())
                val_labels.extend(val_batch_labels.cpu().numpy())

            avg_val_loss = val_total_loss / (len(val_df) / batch_size)
            val_accuracy = accuracy_score(val_labels, val_predictions)
            val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro', zero_division=0)

            print(f"Validation Epoch {epoch} - Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1-Score: {val_f1:.4f}")

    return model

# Testing Function
def test_model(model, test_data, batch_size=16, max_length=64):
    model.eval()
    test_predictions, test_labels = [], []
    with torch.no_grad():
        test_iterator = produce_batch_data(test_data, batch_size=batch_size, max_length=max_length, one_hot=True)
        for test_batch_inputs, test_batch_labels in tqdm(test_iterator, desc='Testing', leave=False):
            test_batch_input = produce_input(test_batch_inputs, max_length=max_length)
            test_batch_labels = torch.tensor(test_batch_labels).argmax(dim=1).long()

            test_batch_input = test_batch_input.to(device)
            test_batch_labels = test_batch_labels.to(device)

            test_outputs = model(test_batch_input)
            test_logits = test_outputs.logits
            _, test_predictions_batch = torch.max(test_logits, dim=1)

            test_predictions.extend(test_predictions_batch.cpu().numpy())
            test_labels.extend(test_batch_labels.cpu().numpy())

        test_accuracy = accuracy_score(test_labels, test_predictions)
        test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro', zero_division=0)

    return test_accuracy, test_precision, test_recall, test_f1




In [None]:
# Script to Run Multiple Times with Detailed Output
num_runs = 5
test_results_all_runs = []

for run in range(num_runs):
    print(f"Run {run + 1}/{num_runs}")
    model = initialize_model()  # Initialize a new model for each run
    trained_model = train_and_validate(train_df, val_df, device, num_epochs=10, batch_size=16)
    test_accuracy, test_precision, test_recall, test_f1 = test_model(trained_model, test_df, batch_size=16, max_length=64)
    test_results_all_runs.append((test_accuracy, test_precision, test_recall, test_f1))

    # Print results for the current run
    print(f"Results for Run {run + 1}:")
    print(f"  Test Accuracy: {test_accuracy:.4f}")
    print(f"  Test Precision: {test_precision:.4f}")
    print(f"  Test Recall: {test_recall:.4f}")
    print(f"  Test F1-Score: {test_f1:.4f}\n")

# Convert list to numpy array for easier calculation
test_results_all_runs = np.array(test_results_all_runs)

# Calculate mean and standard deviation across runs
mean_test_results = np.mean(test_results_all_runs, axis=0)
std_test_results = np.std(test_results_all_runs, axis=0)

# Print the averaged results
print("Averaged Test Results Across Runs:")
print(f"  Mean Test Accuracy: {mean_test_results[0]:.4f} (±{std_test_results[0]:.4f})")
print(f"  Mean Test Precision: {mean_test_results[1]:.4f} (±{std_test_results[1]:.4f})")
print(f"  Mean Test Recall: {mean_test_results[2]:.4f} (±{std_test_results[2]:.4f})")
print(f"  Mean Test F1-Score: {mean_test_results[3]:.4f} (±{std_test_results[3]:.4f})")

Run 1/5


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches: 0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Training Epoch 0 - Loss: 1.0104, Accuracy: 0.4893, Precision: 0.3702, Recall: 0.3422, F1-Score: 0.3053


Batches: 0it [00:00, ?it/s]

Validation Epoch 0 - Loss: 1.1287, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 1 - Loss: 0.9794, Accuracy: 0.5147, Precision: 0.4380, Recall: 0.3598, F1-Score: 0.3202


Batches: 0it [00:00, ?it/s]

Validation Epoch 1 - Loss: 0.9033, Accuracy: 0.5646, Precision: 0.5458, Recall: 0.4757, F1-Score: 0.4457


Batches: 0it [00:00, ?it/s]

Training Epoch 2 - Loss: 0.7794, Accuracy: 0.6297, Precision: 0.5754, Recall: 0.5468, F1-Score: 0.5561


Batches: 0it [00:00, ?it/s]

Validation Epoch 2 - Loss: 0.9176, Accuracy: 0.6022, Precision: 0.6096, Recall: 0.5361, F1-Score: 0.5376


Batches: 0it [00:00, ?it/s]

Training Epoch 3 - Loss: 0.5715, Accuracy: 0.7512, Precision: 0.7148, Recall: 0.7056, F1-Score: 0.7099


Batches: 0it [00:00, ?it/s]

Validation Epoch 3 - Loss: 1.1655, Accuracy: 0.6063, Precision: 0.6209, Recall: 0.5407, F1-Score: 0.5427


Batches: 0it [00:00, ?it/s]

Training Epoch 4 - Loss: 0.3285, Accuracy: 0.8715, Precision: 0.8570, Recall: 0.8533, F1-Score: 0.8549


Batches: 0it [00:00, ?it/s]

Validation Epoch 4 - Loss: 1.4070, Accuracy: 0.5992, Precision: 0.6170, Recall: 0.5306, F1-Score: 0.5307


Batches: 0it [00:00, ?it/s]

Training Epoch 5 - Loss: 0.1979, Accuracy: 0.9271, Precision: 0.9156, Recall: 0.9192, F1-Score: 0.9173


Batches: 0it [00:00, ?it/s]

Validation Epoch 5 - Loss: 1.4367, Accuracy: 0.6033, Precision: 0.6022, Recall: 0.5624, F1-Score: 0.5724


Batches: 0it [00:00, ?it/s]

Training Epoch 6 - Loss: 0.1475, Accuracy: 0.9489, Precision: 0.9390, Recall: 0.9389, F1-Score: 0.9389


Batches: 0it [00:00, ?it/s]

Validation Epoch 6 - Loss: 1.6339, Accuracy: 0.6089, Precision: 0.6109, Recall: 0.5572, F1-Score: 0.5658


Batches: 0it [00:00, ?it/s]

Training Epoch 7 - Loss: 0.1015, Accuracy: 0.9669, Precision: 0.9635, Recall: 0.9631, F1-Score: 0.9633


Batches: 0it [00:00, ?it/s]

Validation Epoch 7 - Loss: 1.6570, Accuracy: 0.6012, Precision: 0.5852, Recall: 0.5587, F1-Score: 0.5641


Batches: 0it [00:00, ?it/s]

Training Epoch 8 - Loss: 0.0545, Accuracy: 0.9823, Precision: 0.9801, Recall: 0.9791, F1-Score: 0.9796


Batches: 0it [00:00, ?it/s]

Validation Epoch 8 - Loss: 1.8144, Accuracy: 0.5966, Precision: 0.5792, Recall: 0.5700, F1-Score: 0.5734


Batches: 0it [00:00, ?it/s]

Training Epoch 9 - Loss: 0.0440, Accuracy: 0.9855, Precision: 0.9824, Recall: 0.9844, F1-Score: 0.9834


Batches: 0it [00:00, ?it/s]

Validation Epoch 9 - Loss: 1.8137, Accuracy: 0.6048, Precision: 0.5902, Recall: 0.5714, F1-Score: 0.5767


Testing: 0it [00:00, ?it/s]

Results for Run 1:
  Test Accuracy: 0.5818
  Test Precision: 0.6066
  Test Recall: 0.5900
  Test F1-Score: 0.5688

Run 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches: 0it [00:00, ?it/s]

Training Epoch 0 - Loss: 1.0092, Accuracy: 0.4888, Precision: 0.4184, Recall: 0.3485, F1-Score: 0.3202


Batches: 0it [00:00, ?it/s]

Validation Epoch 0 - Loss: 1.1089, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 1 - Loss: 1.0110, Accuracy: 0.4867, Precision: 0.2922, Recall: 0.3343, F1-Score: 0.2876


Batches: 0it [00:00, ?it/s]

Validation Epoch 1 - Loss: 1.1655, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 2 - Loss: 1.0131, Accuracy: 0.4922, Precision: 0.3438, Recall: 0.3416, F1-Score: 0.3005


Batches: 0it [00:00, ?it/s]

Validation Epoch 2 - Loss: 1.1361, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 3 - Loss: 1.0087, Accuracy: 0.4879, Precision: 0.2836, Recall: 0.3292, F1-Score: 0.2709


Batches: 0it [00:00, ?it/s]

Validation Epoch 3 - Loss: 1.1565, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 4 - Loss: 1.0082, Accuracy: 0.5049, Precision: 0.2941, Recall: 0.3351, F1-Score: 0.2600


Batches: 0it [00:00, ?it/s]

Validation Epoch 4 - Loss: 1.1235, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 5 - Loss: 1.0036, Accuracy: 0.5085, Precision: 0.2857, Recall: 0.3327, F1-Score: 0.2402


Batches: 0it [00:00, ?it/s]

Validation Epoch 5 - Loss: 1.1158, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 6 - Loss: 1.0018, Accuracy: 0.5126, Precision: 0.2898, Recall: 0.3332, F1-Score: 0.2309


Batches: 0it [00:00, ?it/s]

Validation Epoch 6 - Loss: 1.1078, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 7 - Loss: 1.0003, Accuracy: 0.5121, Precision: 0.2544, Recall: 0.3323, F1-Score: 0.2276


Batches: 0it [00:00, ?it/s]

Validation Epoch 7 - Loss: 1.1044, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 8 - Loss: 0.9997, Accuracy: 0.5131, Precision: 0.2083, Recall: 0.3327, F1-Score: 0.2264


Batches: 0it [00:00, ?it/s]

Validation Epoch 8 - Loss: 1.1031, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 9 - Loss: 0.9991, Accuracy: 0.5140, Precision: 0.2825, Recall: 0.3333, F1-Score: 0.2266


Batches: 0it [00:00, ?it/s]

Validation Epoch 9 - Loss: 1.1006, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Testing: 0it [00:00, ?it/s]

Results for Run 2:
  Test Accuracy: 0.3421
  Test Precision: 0.1140
  Test Recall: 0.3333
  Test F1-Score: 0.1699

Run 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches: 0it [00:00, ?it/s]

Training Epoch 0 - Loss: 0.9390, Accuracy: 0.5298, Precision: 0.4691, Recall: 0.4145, F1-Score: 0.4104


Batches: 0it [00:00, ?it/s]

Validation Epoch 0 - Loss: 0.9271, Accuracy: 0.5692, Precision: 0.5501, Recall: 0.5068, F1-Score: 0.4990


Batches: 0it [00:00, ?it/s]

Training Epoch 1 - Loss: 0.7365, Accuracy: 0.6651, Precision: 0.6257, Recall: 0.6038, F1-Score: 0.6128


Batches: 0it [00:00, ?it/s]

Validation Epoch 1 - Loss: 0.9360, Accuracy: 0.5936, Precision: 0.5896, Recall: 0.5299, F1-Score: 0.5306


Batches: 0it [00:00, ?it/s]

Training Epoch 2 - Loss: 0.5215, Accuracy: 0.7863, Precision: 0.7700, Recall: 0.7520, F1-Score: 0.7599


Batches: 0it [00:00, ?it/s]

Validation Epoch 2 - Loss: 1.1901, Accuracy: 0.6033, Precision: 0.6003, Recall: 0.5558, F1-Score: 0.5622


Batches: 0it [00:00, ?it/s]

Training Epoch 3 - Loss: 0.3383, Accuracy: 0.8701, Precision: 0.8601, Recall: 0.8518, F1-Score: 0.8556


Batches: 0it [00:00, ?it/s]

Validation Epoch 3 - Loss: 1.2167, Accuracy: 0.6058, Precision: 0.6196, Recall: 0.5549, F1-Score: 0.5645


Batches: 0it [00:00, ?it/s]

Training Epoch 4 - Loss: 0.2316, Accuracy: 0.9146, Precision: 0.9097, Recall: 0.9065, F1-Score: 0.9080


Batches: 0it [00:00, ?it/s]

Validation Epoch 4 - Loss: 1.1671, Accuracy: 0.6033, Precision: 0.5964, Recall: 0.5781, F1-Score: 0.5842


Batches: 0it [00:00, ?it/s]

Training Epoch 5 - Loss: 0.1662, Accuracy: 0.9455, Precision: 0.9429, Recall: 0.9389, F1-Score: 0.9408


Batches: 0it [00:00, ?it/s]

Validation Epoch 5 - Loss: 1.6172, Accuracy: 0.5895, Precision: 0.5720, Recall: 0.5609, F1-Score: 0.5576


Batches: 0it [00:00, ?it/s]

Training Epoch 6 - Loss: 0.0949, Accuracy: 0.9698, Precision: 0.9667, Recall: 0.9636, F1-Score: 0.9651


Batches: 0it [00:00, ?it/s]

Validation Epoch 6 - Loss: 1.8783, Accuracy: 0.5900, Precision: 0.5709, Recall: 0.5669, F1-Score: 0.5562


Batches: 0it [00:00, ?it/s]

Training Epoch 7 - Loss: 0.0641, Accuracy: 0.9789, Precision: 0.9765, Recall: 0.9742, F1-Score: 0.9753


Batches: 0it [00:00, ?it/s]

Validation Epoch 7 - Loss: 1.8831, Accuracy: 0.6043, Precision: 0.5868, Recall: 0.5882, F1-Score: 0.5840


Batches: 0it [00:00, ?it/s]

Training Epoch 8 - Loss: 0.0499, Accuracy: 0.9840, Precision: 0.9820, Recall: 0.9797, F1-Score: 0.9808


Batches: 0it [00:00, ?it/s]

Validation Epoch 8 - Loss: 1.9272, Accuracy: 0.6002, Precision: 0.5887, Recall: 0.5651, F1-Score: 0.5723


Batches: 0it [00:00, ?it/s]

Training Epoch 9 - Loss: 0.0399, Accuracy: 0.9862, Precision: 0.9848, Recall: 0.9837, F1-Score: 0.9843


Batches: 0it [00:00, ?it/s]

Validation Epoch 9 - Loss: 1.9359, Accuracy: 0.6119, Precision: 0.5954, Recall: 0.5894, F1-Score: 0.5917


Testing: 0it [00:00, ?it/s]

Results for Run 3:
  Test Accuracy: 0.6006
  Test Precision: 0.6110
  Test Recall: 0.6069
  Test F1-Score: 0.5882

Run 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches: 0it [00:00, ?it/s]

Training Epoch 0 - Loss: 1.0072, Accuracy: 0.4853, Precision: 0.3963, Recall: 0.3356, F1-Score: 0.2925


Batches: 0it [00:00, ?it/s]

Validation Epoch 0 - Loss: 1.1319, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 1 - Loss: 1.0069, Accuracy: 0.4956, Precision: 0.2868, Recall: 0.3322, F1-Score: 0.2680


Batches: 0it [00:00, ?it/s]

Validation Epoch 1 - Loss: 1.1290, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 2 - Loss: 0.8604, Accuracy: 0.5789, Precision: 0.5098, Recall: 0.4870, F1-Score: 0.4929


Batches: 0it [00:00, ?it/s]

Validation Epoch 2 - Loss: 0.9020, Accuracy: 0.5448, Precision: 0.6500, Recall: 0.4479, F1-Score: 0.3957


Batches: 0it [00:00, ?it/s]

Training Epoch 3 - Loss: 0.6733, Accuracy: 0.6992, Precision: 0.6588, Recall: 0.6434, F1-Score: 0.6502


Batches: 0it [00:00, ?it/s]

Validation Epoch 3 - Loss: 0.9131, Accuracy: 0.6058, Precision: 0.6405, Recall: 0.5382, F1-Score: 0.5404


Batches: 0it [00:00, ?it/s]

Training Epoch 4 - Loss: 0.4609, Accuracy: 0.8096, Precision: 0.7866, Recall: 0.7744, F1-Score: 0.7799


Batches: 0it [00:00, ?it/s]

Validation Epoch 4 - Loss: 1.1209, Accuracy: 0.5936, Precision: 0.6307, Recall: 0.5310, F1-Score: 0.5359


Batches: 0it [00:00, ?it/s]

Training Epoch 5 - Loss: 0.3007, Accuracy: 0.8875, Precision: 0.8701, Recall: 0.8669, F1-Score: 0.8685


Batches: 0it [00:00, ?it/s]

Validation Epoch 5 - Loss: 1.2501, Accuracy: 0.5987, Precision: 0.6120, Recall: 0.5632, F1-Score: 0.5737


Batches: 0it [00:00, ?it/s]

Training Epoch 6 - Loss: 0.2153, Accuracy: 0.9245, Precision: 0.9166, Recall: 0.9143, F1-Score: 0.9154


Batches: 0it [00:00, ?it/s]

Validation Epoch 6 - Loss: 1.5402, Accuracy: 0.5916, Precision: 0.5757, Recall: 0.5479, F1-Score: 0.5543


Batches: 0it [00:00, ?it/s]

Training Epoch 7 - Loss: 0.1304, Accuracy: 0.9557, Precision: 0.9505, Recall: 0.9473, F1-Score: 0.9489


Batches: 0it [00:00, ?it/s]

Validation Epoch 7 - Loss: 1.6629, Accuracy: 0.5936, Precision: 0.5775, Recall: 0.5633, F1-Score: 0.5648


Batches: 0it [00:00, ?it/s]

Training Epoch 8 - Loss: 0.0766, Accuracy: 0.9739, Precision: 0.9704, Recall: 0.9697, F1-Score: 0.9701


Batches: 0it [00:00, ?it/s]

Validation Epoch 8 - Loss: 1.7839, Accuracy: 0.5900, Precision: 0.5781, Recall: 0.5544, F1-Score: 0.5605


Batches: 0it [00:00, ?it/s]

Training Epoch 9 - Loss: 0.0666, Accuracy: 0.9770, Precision: 0.9733, Recall: 0.9732, F1-Score: 0.9733


Batches: 0it [00:00, ?it/s]

Validation Epoch 9 - Loss: 1.8462, Accuracy: 0.5926, Precision: 0.5841, Recall: 0.5568, F1-Score: 0.5578


Testing: 0it [00:00, ?it/s]

Results for Run 4:
  Test Accuracy: 0.5513
  Test Precision: 0.5972
  Test Recall: 0.5785
  Test F1-Score: 0.5406

Run 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches: 0it [00:00, ?it/s]

Training Epoch 0 - Loss: 1.0143, Accuracy: 0.4646, Precision: 0.3748, Recall: 0.3474, F1-Score: 0.3362


Batches: 0it [00:00, ?it/s]

Validation Epoch 0 - Loss: 1.1886, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 1 - Loss: 1.0187, Accuracy: 0.4782, Precision: 0.2992, Recall: 0.3302, F1-Score: 0.2874


Batches: 0it [00:00, ?it/s]

Validation Epoch 1 - Loss: 1.1540, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 2 - Loss: 1.0110, Accuracy: 0.4915, Precision: 0.2847, Recall: 0.3318, F1-Score: 0.2737


Batches: 0it [00:00, ?it/s]

Validation Epoch 2 - Loss: 1.1289, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 3 - Loss: 1.0122, Accuracy: 0.4916, Precision: 0.2827, Recall: 0.3302, F1-Score: 0.2680


Batches: 0it [00:00, ?it/s]

Validation Epoch 3 - Loss: 1.1258, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 4 - Loss: 1.0119, Accuracy: 0.4920, Precision: 0.2803, Recall: 0.3302, F1-Score: 0.2677


Batches: 0it [00:00, ?it/s]

Validation Epoch 4 - Loss: 1.1244, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 5 - Loss: 1.0123, Accuracy: 0.4928, Precision: 0.2810, Recall: 0.3297, F1-Score: 0.2641


Batches: 0it [00:00, ?it/s]

Validation Epoch 5 - Loss: 1.1236, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 6 - Loss: 1.0104, Accuracy: 0.4896, Precision: 0.2718, Recall: 0.3269, F1-Score: 0.2605


Batches: 0it [00:00, ?it/s]

Validation Epoch 6 - Loss: 1.1218, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 7 - Loss: 1.0168, Accuracy: 0.4908, Precision: 0.2784, Recall: 0.3288, F1-Score: 0.2648


Batches: 0it [00:00, ?it/s]

Validation Epoch 7 - Loss: 1.1207, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 8 - Loss: 1.0076, Accuracy: 0.4954, Precision: 0.2849, Recall: 0.3309, F1-Score: 0.2637


Batches: 0it [00:00, ?it/s]

Validation Epoch 8 - Loss: 1.1215, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Batches: 0it [00:00, ?it/s]

Training Epoch 9 - Loss: 1.0123, Accuracy: 0.4893, Precision: 0.2763, Recall: 0.3275, F1-Score: 0.2630


Batches: 0it [00:00, ?it/s]

Validation Epoch 9 - Loss: 1.1179, Accuracy: 0.4217, Precision: 0.1406, Recall: 0.3333, F1-Score: 0.1977


Testing: 0it [00:00, ?it/s]

Results for Run 5:
  Test Accuracy: 0.3421
  Test Precision: 0.1140
  Test Recall: 0.3333
  Test F1-Score: 0.1699

Averaged Test Results Across Runs:
  Mean Test Accuracy: 0.4836 (±0.1166)
  Mean Test Precision: 0.4086 (±0.2405)
  Mean Test Recall: 0.4884 (±0.1270)
  Mean Test F1-Score: 0.4075 (±0.1946)
