In [2]:
import pickle
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

#### Dataset and Dataloaders

In [22]:
# NOTE(security): pickle.load can execute arbitrary code -- only load files you produced or trust.
# The context manager closes the file handle deterministically instead of leaking it.
with open("New Train Embeddings/train_labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)
# Keep only the first 156 entries -- presumably so the labels line up with the
# embedding batches loaded in the next cell; TODO confirm where 156 comes from.
labels = labels[:156]

In [5]:
# NOTE(security): pickle.load can execute arbitrary code -- only load files you produced or trust.
# Context managers make sure both file handles are closed once the data is in memory.
with open("New Train Embeddings/nmt_embeddings.pkl", "rb") as nmt_file:
    nmt_embeds = pickle.load(nmt_file)
with open("New Train Embeddings/adapter_embeddings.pkl", "rb") as adapter_file:
    adapter_embeds = pickle.load(adapter_file)

In [6]:
# Stitch each sequence of embedding tensors into one tensor along the first axis
# (torch.cat concatenates along dim 0 by default).
adapter_embeds_concat = torch.cat(adapter_embeds)
nmt_embeds_concat = torch.cat(nmt_embeds)

In [24]:
# Stitch the sequence of label tensors into one tensor along the first axis
# (torch.cat concatenates along dim 0 by default).
labels_concat = torch.cat(labels)

In [25]:
# Sanity check: MyDataset indexes all three containers with the same position, so
# they must hold the same number of samples or rows would be paired incorrectly.
sample_counts = (len(labels_concat), len(nmt_embeds_concat), len(adapter_embeds_concat))
assert len(set(sample_counts)) == 1, (
    f"Mismatched sample counts (labels, nmt, adapter): {sample_counts}"
)
sample_counts

(4992, 4992, 4992)

In [26]:
class MyDataset(Dataset):
    """Index-aligned triples of (adapter embedding, NMT embedding, label).

    The three containers are stored as given and are all indexed with the same
    position, so item ``i`` of each one must describe the same sample.
    """

    def __init__(self, adapter_embeds, nmt_embeds, labels):
        """Keep references to the two embedding containers and the labels."""
        self.adapter_embeds = adapter_embeds
        self.nmt_embeds = nmt_embeds
        self.labels = labels

    def __len__(self):
        """Return the number of samples, as given by the label container."""
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(adapter_embed, nmt_embed, label)`` tuple."""
        adapter_item = self.adapter_embeds[idx]
        nmt_item = self.nmt_embeds[idx]
        label_item = self.labels[idx]
        return adapter_item, nmt_item, label_item

In [28]:
# Bundle the concatenated embeddings and labels into one index-aligned dataset.
dataset = MyDataset(
    adapter_embeds=adapter_embeds_concat,
    nmt_embeds=nmt_embeds_concat,
    labels=labels_concat,
)

In [29]:
# 80/20 train/validation split. The explicitly seeded generator makes the split
# identical across kernel restarts, so metrics from different runs and models
# are computed on the same validation samples.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [30]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Validation does not need shuffling. A fixed order also keeps the per-batch-averaged
# validation loss deterministic, because the (smaller) last batch always holds the
# same samples.
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

#### Co-Attention Model

In [38]:
class CoAttentionModel(nn.Module):
    """Bilinear co-attention over two embedding sequences, followed by an MLP classifier.

    Both inputs are expected as ``(batch, seq_len, embed_dim)`` tensors. The two
    attended sequences are added elementwise, so both inputs need the same
    ``seq_len`` (or lengths that broadcast against each other).

    Args:
        embed_dim: Size of the last dimension of both inputs.
        num_labels: Number of output logits (one per label).
        hidden_dim: Width of the classifier's hidden layer (default 512).
        dropout: Dropout probability used in the transforms and classifier (default 0.1).
    """

    def __init__(self, embed_dim, num_labels, hidden_dim=512, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_labels = num_labels

        # Bilinear affinity weight. torch.empty replaces the legacy torch.Tensor(...)
        # constructor; the values are overwritten by Xavier initialisation right away.
        self.W_b = nn.Parameter(torch.empty(embed_dim, embed_dim))
        nn.init.xavier_uniform_(self.W_b)

        # Independent per-position projections for each input stream.
        self.transform1 = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.transform2 = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Maps the pooled feature vector to one raw logit per label.
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels)
        )

        self.batch_norm = nn.BatchNorm1d(embed_dim)

    def forward(self, x1, x2):
        """Compute label logits from the two embedding sequences.

        Args:
            x1: Tensor of shape ``(batch, seq_len, embed_dim)``.
            x2: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with raw (pre-sigmoid) logits.
        """
        x1_transformed = self.transform1(x1)
        x2_transformed = self.transform2(x2)

        # affinity[b, i, j] = x1_transformed[b, i] @ W_b @ x2_transformed[b, j]
        affinity = torch.matmul(x1_transformed, self.W_b)
        affinity = torch.matmul(affinity, x2_transformed.transpose(1, 2))

        # Each x1 position gets a distribution over x2 positions, and vice versa.
        attention_weights1 = F.softmax(affinity, dim=2)
        attention_weights2 = F.softmax(affinity.transpose(1, 2), dim=2)

        attended_features1 = torch.matmul(attention_weights1, x2_transformed)
        attended_features2 = torch.matmul(attention_weights2, x1_transformed)

        attended_features = attended_features1 + attended_features2
        # BatchNorm1d wants channels on dim 1, so move embed_dim there and back.
        attended_features = self.batch_norm(attended_features.permute(0, 2, 1)).permute(0, 2, 1)
        # Mean-pool over the sequence dimension.
        attended_features = attended_features.mean(dim=1)

        logits = self.classifier(attended_features)

        return logits


In [39]:
# Multi-label objective: BCEWithLogitsLoss applies the sigmoid itself, so the
# model is fed to it as raw logits.
loss_function = nn.BCEWithLogitsLoss()
model = CoAttentionModel(embed_dim=768, num_labels=21)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Per-epoch metric history (one entry appended per epoch).
train_losses = []
val_losses = []
train_macro_f1s = []
val_macro_f1s = []
train_weighted_f1s = []
val_weighted_f1s = []
train_accs = []
val_accs = []

for epoch in range(10):
    # ---- training pass ----
    model.train()
    train_loss = 0
    train_preds = []
    train_labels = []
    # The batch_* names keep this loop from overwriting the notebook-level
    # `adapter_embeds`, `nmt_embeds` and `labels` loaded in earlier cells.
    for batch_adapter, batch_nmt, batch_labels in tqdm(train_dataloader, desc=f'Training {epoch + 1}'):
        optimizer.zero_grad()
        logits = model(batch_adapter, batch_nmt)
        loss = loss_function(logits, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid on other devices.
        train_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        train_labels.extend(batch_labels.detach().cpu().numpy())
    # Mean of the per-batch mean losses.
    train_loss /= len(train_dataloader)
    train_losses.append(train_loss)
    # A label counts as predicted when its sigmoid probability exceeds 0.5.
    train_preds = np.array(train_preds) > 0.5
    train_labels = np.array(train_labels)
    train_macro_f1 = f1_score(train_labels, train_preds, average='macro')
    train_weighted_f1 = f1_score(train_labels, train_preds, average='weighted')
    # For multi-label targets accuracy_score is subset accuracy (every label of a
    # sample must match), which is why it sits far below the F1 scores.
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_macro_f1s.append(train_macro_f1)
    train_weighted_f1s.append(train_weighted_f1)
    train_accs.append(train_accuracy)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch_adapter, batch_nmt, batch_labels in tqdm(val_dataloader, desc=f'Validation {epoch + 1}'):
            logits = model(batch_adapter, batch_nmt)
            loss = loss_function(logits, batch_labels)
            val_loss += loss.item()
            val_preds.extend(torch.sigmoid(logits).cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())
    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)
    val_preds = np.array(val_preds) > 0.5
    val_labels = np.array(val_labels)
    val_macro_f1 = f1_score(val_labels, val_preds, average='macro')
    val_weighted_f1 = f1_score(val_labels, val_preds, average='weighted')
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_macro_f1s.append(val_macro_f1)
    val_weighted_f1s.append(val_weighted_f1)
    val_accs.append(val_accuracy)

    print(f"Epoch {epoch + 1}:")
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")
    print(f"Train Macro F1: {train_macro_f1:.4f}, Val Macro F1: {val_macro_f1:.4f}")
    print(f"Train Weighted F1: {train_weighted_f1:.4f}, Val Weighted F1: {val_weighted_f1:.4f}")

Training 1:   0%|          | 0/125 [00:00<?, ?it/s]

Training 1: 100%|██████████| 125/125 [00:31<00:00,  3.91it/s]
Validation 1: 100%|██████████| 32/32 [00:02<00:00, 13.25it/s]


Epoch 1:
Train Loss: 0.2967, Val Loss: 0.1788
Train Accuracy: 0.2489, Val Accuracy: 0.3213
Train Macro F1: 0.3912, Val Macro F1: 0.4730
Train Weighted F1: 0.6623, Val Weighted F1: 0.7308


Training 2: 100%|██████████| 125/125 [00:32<00:00,  3.83it/s]
Validation 2: 100%|██████████| 32/32 [00:02<00:00, 12.59it/s]


Epoch 2:
Train Loss: 0.1729, Val Loss: 0.1604
Train Accuracy: 0.3321, Val Accuracy: 0.3273
Train Macro F1: 0.4985, Val Macro F1: 0.5247
Train Weighted F1: 0.7443, Val Weighted F1: 0.7598


Training 3: 100%|██████████| 125/125 [00:32<00:00,  3.79it/s]
Validation 3: 100%|██████████| 32/32 [00:02<00:00, 11.92it/s]


Epoch 3:
Train Loss: 0.1589, Val Loss: 0.1556
Train Accuracy: 0.3514, Val Accuracy: 0.3463
Train Macro F1: 0.5522, Val Macro F1: 0.5640
Train Weighted F1: 0.7730, Val Weighted F1: 0.7728


Training 4: 100%|██████████| 125/125 [00:33<00:00,  3.71it/s]
Validation 4: 100%|██████████| 32/32 [00:02<00:00, 10.96it/s]


Epoch 4:
Train Loss: 0.1519, Val Loss: 0.1522
Train Accuracy: 0.3641, Val Accuracy: 0.3443
Train Macro F1: 0.5840, Val Macro F1: 0.5727
Train Weighted F1: 0.7865, Val Weighted F1: 0.7767


Training 5: 100%|██████████| 125/125 [00:35<00:00,  3.54it/s]
Validation 5: 100%|██████████| 32/32 [00:02<00:00, 10.88it/s]


Epoch 5:
Train Loss: 0.1464, Val Loss: 0.1505
Train Accuracy: 0.3694, Val Accuracy: 0.3544
Train Macro F1: 0.5940, Val Macro F1: 0.5953
Train Weighted F1: 0.7941, Val Weighted F1: 0.7815


Training 6: 100%|██████████| 125/125 [00:33<00:00,  3.69it/s]
Validation 6: 100%|██████████| 32/32 [00:02<00:00, 12.68it/s]


Epoch 6:
Train Loss: 0.1419, Val Loss: 0.1531
Train Accuracy: 0.3862, Val Accuracy: 0.3594
Train Macro F1: 0.6258, Val Macro F1: 0.6163
Train Weighted F1: 0.8042, Val Weighted F1: 0.7876


Training 7: 100%|██████████| 125/125 [00:32<00:00,  3.90it/s]
Validation 7: 100%|██████████| 32/32 [00:02<00:00, 12.91it/s]


Epoch 7:
Train Loss: 0.1364, Val Loss: 0.1491
Train Accuracy: 0.3967, Val Accuracy: 0.3614
Train Macro F1: 0.6381, Val Macro F1: 0.6194
Train Weighted F1: 0.8122, Val Weighted F1: 0.7860


Training 8: 100%|██████████| 125/125 [00:31<00:00,  3.94it/s]
Validation 8: 100%|██████████| 32/32 [00:02<00:00, 12.12it/s]


Epoch 8:
Train Loss: 0.1328, Val Loss: 0.1542
Train Accuracy: 0.4017, Val Accuracy: 0.3393
Train Macro F1: 0.6490, Val Macro F1: 0.6247
Train Weighted F1: 0.8163, Val Weighted F1: 0.7883


Training 9: 100%|██████████| 125/125 [00:31<00:00,  3.94it/s]
Validation 9: 100%|██████████| 32/32 [00:02<00:00, 12.42it/s]


Epoch 9:
Train Loss: 0.1294, Val Loss: 0.1472
Train Accuracy: 0.4110, Val Accuracy: 0.3333
Train Macro F1: 0.6570, Val Macro F1: 0.6411
Train Weighted F1: 0.8232, Val Weighted F1: 0.7893


Training 10: 100%|██████████| 125/125 [00:32<00:00,  3.90it/s]
Validation 10: 100%|██████████| 32/32 [00:02<00:00, 12.45it/s]

Epoch 10:
Train Loss: 0.1253, Val Loss: 0.1510
Train Accuracy: 0.4145, Val Accuracy: 0.3413
Train Macro F1: 0.6781, Val Macro F1: 0.6165
Train Weighted F1: 0.8298, Val Weighted F1: 0.7864





In [40]:
# torch.save(model) pickles the whole module object rather than a state_dict, so
# loading it back needs the CoAttentionModel class to be defined under the same name.
torch.save(model, "Co Attention Model New/CoAttentionModel.pt")
# The context manager flushes and closes the metrics file deterministically
# instead of leaving the handle open.
with open("Co Attention Model New/CoAttentionModelMetrics.pkl", "wb") as metrics_file:
    pickle.dump({
        "train_losses": train_losses,
        "val_losses": val_losses,
        "train_macro_f1s": train_macro_f1s,
        "val_macro_f1s": val_macro_f1s,
        "train_weighted_f1s": train_weighted_f1s,
        "val_weighted_f1s": val_weighted_f1s,
        "train_accs": train_accs,
        "val_accs": val_accs
    }, metrics_file)

#### Cross Attention Model

In [43]:
class CrossAttentionModel(nn.Module):
    """Bidirectional multi-head cross-attention over two embedding sequences.

    Each input is projected, then attends to the other one through its own
    ``nn.MultiheadAttention`` layer. The two attended sequences are added
    elementwise, so both inputs need the same ``seq_len`` (or lengths that
    broadcast against each other). Inputs are ``(batch, seq_len, embed_dim)``.

    Args:
        embed_dim: Size of the last dimension of both inputs; must be divisible
            by ``num_heads``.
        num_labels: Number of output logits (one per label).
        num_heads: Number of attention heads in both attention layers (default 8).
        hidden_dim: Width of the classifier's hidden layer (default 512).
        dropout: Dropout probability used in the attention layers, the
            transforms and the classifier (default 0.1).
    """

    def __init__(self, embed_dim, num_labels, num_heads=8, hidden_dim=512, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_labels = num_labels

        # attention1: x1 queries x2; attention2: x2 queries x1.
        self.attention1 = nn.MultiheadAttention(embed_dim, num_heads=num_heads, batch_first=True, dropout=dropout)
        self.attention2 = nn.MultiheadAttention(embed_dim, num_heads=num_heads, batch_first=True, dropout=dropout)

        # Independent per-position projections for each input stream.
        self.transform1 = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.transform2 = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Maps the pooled feature vector to one raw logit per label.
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels)
        )

        self.batch_norm = nn.BatchNorm1d(embed_dim)

    def forward(self, x1, x2):
        """Compute label logits from the two embedding sequences.

        Args:
            x1: Tensor of shape ``(batch, seq_len, embed_dim)``.
            x2: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with raw (pre-sigmoid) logits.
        """
        x1_transformed = self.transform1(x1)
        x2_transformed = self.transform2(x2)

        # Arguments are (query, key, value); the attention weights are discarded.
        attended_features1, _ = self.attention1(x1_transformed, x2_transformed, x2_transformed)
        attended_features2, _ = self.attention2(x2_transformed, x1_transformed, x1_transformed)

        attended_features = attended_features1 + attended_features2
        # BatchNorm1d wants channels on dim 1, so move embed_dim there and back.
        attended_features = self.batch_norm(attended_features.permute(0, 2, 1)).permute(0, 2, 1)
        # Mean-pool over the sequence dimension.
        attended_features = attended_features.mean(dim=1)

        logits = self.classifier(attended_features)

        return logits

In [44]:
# Multi-label objective: BCEWithLogitsLoss applies the sigmoid itself, so the
# model is fed to it as raw logits.
loss_function = nn.BCEWithLogitsLoss()
model = CrossAttentionModel(embed_dim=768, num_labels=21)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Per-epoch metric history (one entry appended per epoch).
train_losses = []
val_losses = []
train_macro_f1s = []
val_macro_f1s = []
train_weighted_f1s = []
val_weighted_f1s = []
train_accs = []
val_accs = []

for epoch in range(10):
    # ---- training pass ----
    model.train()
    train_loss = 0
    train_preds = []
    train_labels = []
    # The batch_* names keep this loop from overwriting the notebook-level
    # `adapter_embeds`, `nmt_embeds` and `labels` loaded in earlier cells.
    for batch_adapter, batch_nmt, batch_labels in tqdm(train_dataloader, desc=f'Training {epoch + 1}'):
        optimizer.zero_grad()
        logits = model(batch_adapter, batch_nmt)
        loss = loss_function(logits, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid on other devices.
        train_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        train_labels.extend(batch_labels.detach().cpu().numpy())
    # Mean of the per-batch mean losses.
    train_loss /= len(train_dataloader)
    train_losses.append(train_loss)
    # A label counts as predicted when its sigmoid probability exceeds 0.5.
    train_preds = np.array(train_preds) > 0.5
    train_labels = np.array(train_labels)
    train_macro_f1 = f1_score(train_labels, train_preds, average='macro')
    train_weighted_f1 = f1_score(train_labels, train_preds, average='weighted')
    # For multi-label targets accuracy_score is subset accuracy (every label of a
    # sample must match), which is why it sits far below the F1 scores.
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_macro_f1s.append(train_macro_f1)
    train_weighted_f1s.append(train_weighted_f1)
    train_accs.append(train_accuracy)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch_adapter, batch_nmt, batch_labels in tqdm(val_dataloader, desc=f'Validation {epoch + 1}'):
            logits = model(batch_adapter, batch_nmt)
            loss = loss_function(logits, batch_labels)
            val_loss += loss.item()
            val_preds.extend(torch.sigmoid(logits).cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())
    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)
    val_preds = np.array(val_preds) > 0.5
    val_labels = np.array(val_labels)
    val_macro_f1 = f1_score(val_labels, val_preds, average='macro')
    val_weighted_f1 = f1_score(val_labels, val_preds, average='weighted')
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_macro_f1s.append(val_macro_f1)
    val_weighted_f1s.append(val_weighted_f1)
    val_accs.append(val_accuracy)

    print(f"Epoch {epoch + 1}:")
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")
    print(f"Train Macro F1: {train_macro_f1:.4f}, Val Macro F1: {val_macro_f1:.4f}")
    print(f"Train Weighted F1: {train_weighted_f1:.4f}, Val Weighted F1: {val_weighted_f1:.4f}")
    

Training 1:   0%|          | 0/125 [00:00<?, ?it/s]

Training 1: 100%|██████████| 125/125 [01:46<00:00,  1.18it/s]
Validation 1: 100%|██████████| 32/32 [00:08<00:00,  3.91it/s]


Epoch 1:
Train Loss: 0.2779, Val Loss: 0.1690
Train Accuracy: 0.2697, Val Accuracy: 0.3233
Train Macro F1: 0.4213, Val Macro F1: 0.5019
Train Weighted F1: 0.6871, Val Weighted F1: 0.7431


Training 2: 100%|██████████| 125/125 [01:44<00:00,  1.20it/s]
Validation 2: 100%|██████████| 32/32 [00:08<00:00,  3.75it/s]


Epoch 2:
Train Loss: 0.1684, Val Loss: 0.1617
Train Accuracy: 0.3331, Val Accuracy: 0.3504
Train Macro F1: 0.5307, Val Macro F1: 0.5439
Train Weighted F1: 0.7574, Val Weighted F1: 0.7650


Training 3: 100%|██████████| 125/125 [01:46<00:00,  1.18it/s]
Validation 3: 100%|██████████| 32/32 [00:08<00:00,  3.83it/s]


Epoch 3:
Train Loss: 0.1584, Val Loss: 0.1554
Train Accuracy: 0.3551, Val Accuracy: 0.3383
Train Macro F1: 0.5790, Val Macro F1: 0.5661
Train Weighted F1: 0.7802, Val Weighted F1: 0.7736


Training 4: 100%|██████████| 125/125 [01:46<00:00,  1.17it/s]
Validation 4: 100%|██████████| 32/32 [00:08<00:00,  3.85it/s]


Epoch 4:
Train Loss: 0.1516, Val Loss: 0.1514
Train Accuracy: 0.3669, Val Accuracy: 0.3453
Train Macro F1: 0.6023, Val Macro F1: 0.5957
Train Weighted F1: 0.7897, Val Weighted F1: 0.7759


Training 5: 100%|██████████| 125/125 [01:38<00:00,  1.27it/s]
Validation 5: 100%|██████████| 32/32 [00:07<00:00,  4.18it/s]


Epoch 5:
Train Loss: 0.1465, Val Loss: 0.1491
Train Accuracy: 0.3681, Val Accuracy: 0.3514
Train Macro F1: 0.6164, Val Macro F1: 0.6270
Train Weighted F1: 0.7965, Val Weighted F1: 0.7883


Training 6: 100%|██████████| 125/125 [01:44<00:00,  1.19it/s]
Validation 6: 100%|██████████| 32/32 [00:08<00:00,  3.65it/s]


Epoch 6:
Train Loss: 0.1427, Val Loss: 0.1494
Train Accuracy: 0.3722, Val Accuracy: 0.3504
Train Macro F1: 0.6357, Val Macro F1: 0.6280
Train Weighted F1: 0.8024, Val Weighted F1: 0.7917


Training 7: 100%|██████████| 125/125 [01:44<00:00,  1.20it/s]
Validation 7: 100%|██████████| 32/32 [00:08<00:00,  3.67it/s]


Epoch 7:
Train Loss: 0.1375, Val Loss: 0.1545
Train Accuracy: 0.3807, Val Accuracy: 0.3473
Train Macro F1: 0.6488, Val Macro F1: 0.6346
Train Weighted F1: 0.8099, Val Weighted F1: 0.7890


Training 8: 100%|██████████| 125/125 [01:49<00:00,  1.14it/s]
Validation 8: 100%|██████████| 32/32 [00:09<00:00,  3.52it/s]


Epoch 8:
Train Loss: 0.1322, Val Loss: 0.1510
Train Accuracy: 0.4015, Val Accuracy: 0.3504
Train Macro F1: 0.6600, Val Macro F1: 0.6554
Train Weighted F1: 0.8178, Val Weighted F1: 0.7949


Training 9: 100%|██████████| 125/125 [01:51<00:00,  1.12it/s]
Validation 9: 100%|██████████| 32/32 [00:09<00:00,  3.39it/s]


Epoch 9:
Train Loss: 0.1275, Val Loss: 0.1507
Train Accuracy: 0.4075, Val Accuracy: 0.3584
Train Macro F1: 0.6762, Val Macro F1: 0.6478
Train Weighted F1: 0.8268, Val Weighted F1: 0.7942


Training 10: 100%|██████████| 125/125 [01:53<00:00,  1.10it/s]
Validation 10: 100%|██████████| 32/32 [00:09<00:00,  3.37it/s]

Epoch 10:
Train Loss: 0.1218, Val Loss: 0.1522
Train Accuracy: 0.4162, Val Accuracy: 0.3443
Train Macro F1: 0.6920, Val Macro F1: 0.6544
Train Weighted F1: 0.8351, Val Weighted F1: 0.7962





In [45]:
# torch.save(model) pickles the whole module object rather than a state_dict, so
# loading it back needs the CrossAttentionModel class to be defined under the same name.
torch.save(model, "Cross Attention Model New/CrossAttentionModel.pt")
# The context manager flushes and closes the metrics file deterministically
# instead of leaving the handle open.
with open("Cross Attention Model New/CrossAttentionModelMetrics.pkl", "wb") as metrics_file:
    pickle.dump({
        "train_losses": train_losses,
        "val_losses": val_losses,
        "train_macro_f1s": train_macro_f1s,
        "val_macro_f1s": val_macro_f1s,
        "train_weighted_f1s": train_weighted_f1s,
        "val_weighted_f1s": val_weighted_f1s,
        "train_accs": train_accs,
        "val_accs": val_accs
    }, metrics_file)

#### Simple Model

In [33]:
class Simple(nn.Module):
    """Baseline classifier: mean-pool the second input and feed it to a small MLP.

    The first input is accepted so the call signature matches the attention
    models, but it is not used.
    """

    def __init__(self, embed_dim, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.embed_dim = embed_dim

        # Same layer order as before: Linear -> ReLU -> Dropout -> Linear.
        hidden_layer = nn.Linear(embed_dim, 512)
        output_layer = nn.Linear(512, num_labels)
        self.classifier = nn.Sequential(
            hidden_layer,
            nn.ReLU(),
            nn.Dropout(0.1),
            output_layer,
        )

    def forward(self, x1, x2):
        """Return raw label logits computed from ``x2`` only (``x1`` is ignored)."""
        # Average over dim 1 before classification.
        pooled = torch.mean(x2, dim=1)
        return self.classifier(pooled)

In [34]:
simple_model = Simple(embed_dim=768, num_labels=21)
# Multi-label objective: BCEWithLogitsLoss applies the sigmoid itself, so the
# model is fed to it as raw logits.
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(simple_model.parameters(), lr=1e-4)

# Per-epoch metric history (one entry appended per epoch).
train_losses = []
val_losses = []
train_macro_f1s = []
val_macro_f1s = []
train_weighted_f1s = []
val_weighted_f1s = []
train_accs = []
val_accs = []

for epoch in range(10):
    # ---- training pass ----
    simple_model.train()
    train_loss = 0
    train_preds = []
    train_labels = []
    # The batch_* names keep this loop from overwriting the notebook-level
    # `adapter_embeds`, `nmt_embeds` and `labels` loaded in earlier cells.
    for batch_adapter, batch_nmt, batch_labels in tqdm(train_dataloader, desc=f'Training {epoch + 1}'):
        optimizer.zero_grad()
        logits = simple_model(batch_adapter, batch_nmt)
        loss = loss_function(logits, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid on other devices.
        train_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        train_labels.extend(batch_labels.detach().cpu().numpy())
    # Mean of the per-batch mean losses.
    train_loss /= len(train_dataloader)
    train_losses.append(train_loss)
    # A label counts as predicted when its sigmoid probability exceeds 0.5.
    train_preds = np.array(train_preds) > 0.5
    train_labels = np.array(train_labels)
    train_macro_f1 = f1_score(train_labels, train_preds, average='macro')
    train_weighted_f1 = f1_score(train_labels, train_preds, average='weighted')
    # For multi-label targets accuracy_score is subset accuracy (every label of a
    # sample must match), which is why it sits far below the F1 scores.
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_macro_f1s.append(train_macro_f1)
    train_weighted_f1s.append(train_weighted_f1)
    train_accs.append(train_accuracy)

    # ---- validation pass ----
    simple_model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch_adapter, batch_nmt, batch_labels in tqdm(val_dataloader, desc=f'Validation {epoch + 1}'):
            logits = simple_model(batch_adapter, batch_nmt)
            loss = loss_function(logits, batch_labels)
            val_loss += loss.item()
            val_preds.extend(torch.sigmoid(logits).cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())
    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)
    val_preds = np.array(val_preds) > 0.5
    val_labels = np.array(val_labels)
    val_macro_f1 = f1_score(val_labels, val_preds, average='macro')
    val_weighted_f1 = f1_score(val_labels, val_preds, average='weighted')
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_macro_f1s.append(val_macro_f1)
    val_weighted_f1s.append(val_weighted_f1)
    val_accs.append(val_accuracy)

    print(f"Epoch {epoch + 1}:")
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}")
    print(f"Train Macro F1: {train_macro_f1:.4f}, Val Macro F1: {val_macro_f1:.4f}")
    print(f"Train Weighted F1: {train_weighted_f1:.4f}, Val Weighted F1: {val_weighted_f1:.4f}")

Training 1:   0%|          | 0/125 [00:00<?, ?it/s]

Training 1: 100%|██████████| 125/125 [00:01<00:00, 95.27it/s]
Validation 1: 100%|██████████| 32/32 [00:00<00:00, 152.86it/s]


Epoch 1:
Train Loss: 0.3340, Val Loss: 0.2167
Train Accuracy: 0.1282, Val Accuracy: 0.2112
Train Macro F1: 0.2223, Val Macro F1: 0.3172
Train Weighted F1: 0.4988, Val Weighted F1: 0.6239


Training 2: 100%|██████████| 125/125 [00:01<00:00, 100.08it/s]
Validation 2: 100%|██████████| 32/32 [00:00<00:00, 174.27it/s]


Epoch 2:
Train Loss: 0.1985, Val Loss: 0.1853
Train Accuracy: 0.2454, Val Accuracy: 0.2863
Train Macro F1: 0.3910, Val Macro F1: 0.4495
Train Weighted F1: 0.6729, Val Weighted F1: 0.7104


Training 3: 100%|██████████| 125/125 [00:01<00:00, 101.35it/s]
Validation 3: 100%|██████████| 32/32 [00:00<00:00, 159.93it/s]


Epoch 3:
Train Loss: 0.1824, Val Loss: 0.1766
Train Accuracy: 0.2785, Val Accuracy: 0.2953
Train Macro F1: 0.4631, Val Macro F1: 0.4835
Train Weighted F1: 0.7179, Val Weighted F1: 0.7218


Training 4: 100%|██████████| 125/125 [00:01<00:00, 102.44it/s]
Validation 4: 100%|██████████| 32/32 [00:00<00:00, 144.91it/s]


Epoch 4:
Train Loss: 0.1754, Val Loss: 0.1726
Train Accuracy: 0.3013, Val Accuracy: 0.3183
Train Macro F1: 0.4956, Val Macro F1: 0.5122
Train Weighted F1: 0.7347, Val Weighted F1: 0.7431


Training 5: 100%|██████████| 125/125 [00:01<00:00, 101.29it/s]
Validation 5: 100%|██████████| 32/32 [00:00<00:00, 165.33it/s]


Epoch 5:
Train Loss: 0.1717, Val Loss: 0.1690
Train Accuracy: 0.3183, Val Accuracy: 0.3203
Train Macro F1: 0.5172, Val Macro F1: 0.5107
Train Weighted F1: 0.7478, Val Weighted F1: 0.7454


Training 6: 100%|██████████| 125/125 [00:01<00:00, 94.49it/s] 
Validation 6: 100%|██████████| 32/32 [00:00<00:00, 161.59it/s]


Epoch 6:
Train Loss: 0.1686, Val Loss: 0.1675
Train Accuracy: 0.3216, Val Accuracy: 0.3313
Train Macro F1: 0.5292, Val Macro F1: 0.5200
Train Weighted F1: 0.7527, Val Weighted F1: 0.7480


Training 7:  45%|████▍     | 56/125 [00:00<00:00, 102.90it/s]


KeyboardInterrupt: 