This notebook demonstrates steps of creating a pipeline of model training and testing

<!-- %pip install -q transformers datasets evaluate -->

In [None]:
!git clone https://github.com/OopsWrongCode/nlp-project.git

In [None]:
%cd nlp-project/

In [None]:
%pip install -q transformers

In [None]:
import random
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
# from datasets import Dataset

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [None]:
train = pd.read_csv('/kaggle/working/nlp-project/data/train.csv')
test = pd.read_csv('/kaggle/working/nlp-project/data/test.csv')
validation = pd.read_csv('/kaggle/working/nlp-project/data/valid.csv')

In [None]:
train['token_count'] = [len(sentence.split()) for sentence in train['text']]
train['text_length'] = [len(seq) for seq in train['text']]

In [None]:
train.head()

In [None]:
validation.head()

In [None]:
test.head()

In [None]:
mbti_types = train['label'].unique().tolist()

label_encoder = LabelEncoder()
label_encoder.fit(mbti_types)

train['label'] = label_encoder.transform(train['label'])
validation['label'] = label_encoder.transform(validation['label'])
test['label'] = label_encoder.transform(test['label'])


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class MBTIDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.texts[index])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        if self.labels is not None:
            label = self.labels[index]
            return {
                'input_ids': encoding['input_ids'].squeeze(0),  # (max_len)
                'attention_mask': encoding['attention_mask'].squeeze(0),  # (max_len)
                'labels': torch.tensor(label, dtype=torch.long)
            }
        else:
            return {
                'input_ids': encoding['input_ids'].squeeze(0),  # (max_len)
                'attention_mask': encoding['attention_mask'].squeeze(0),  # (max_len)
            }

    def __len__(self):
        return len(self.texts)

In [None]:
print(f"Max len = {np.max(train['token_count'])}\nMin len = {np.min(train['token_count'])}\nAvg len = {np.round(np.mean(train['token_count']), 2)}")

In [None]:
BATCH_SIZE = 64
MAX_LEN = np.max(train['token_count'])
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = MBTIDataset(texts=train['text'].tolist(), labels=train['label'].tolist(), tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = MBTIDataset(texts=test['text'].tolist(),labels=test['label'].tolist(), tokenizer=tokenizer, max_len=MAX_LEN)
validation_dataset = MBTIDataset(texts=validation['text'].tolist(), labels=validation['label'].tolist(), tokenizer=tokenizer, max_len=MAX_LEN)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

For regularization, we employ two commonly used
techniques: dropout (Hinton et al., 2012) and L2
weight regularization. We apply dropout to prevent co-adaptation. In our model, we either apply
dropout to word vectors before feeding the sequence
of words into the convolutional layer or to the output
of LSTM before the softmax layer. The L2 regularization is applied to the weight of the softmax layer. (https://arxiv.org/pdf/1511.08630)

In [None]:
class MyLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim=6, num_layers=1, bidirectional=False, dropout=0.3, fc_dropout=0.3, input_dropout=0.2):
        super(MyLSTM, self).__init__()

        
        self.input_dropout = nn.Dropout(input_dropout) # 

        self.lstm = nn.LSTM(
            input_size=embedding_dim, # 768
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        # LAYER 2: Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(
            hidden_dim * (2 if bidirectional else 1),
            output_dim
        )

    def forward(self, bert_embeddings):  # [batch_size, seq_len, embedding_dim]

        # Dropout on BERT-embeddings
        x = self.input_dropout(bert_embeddings)

        lstm_output, (h_n, c_n) = self.lstm(x)

        if self.lstm.bidirectional:
            h_final = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_final = h_n[-1]

        h_final = self.fc_dropout(h_final)
        out = self.fc(h_final)
        return out

In [None]:
class MyGRU(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim=6, num_layers=1, bidirectional=False, dropout=0.3, fc_dropout=0.1):
        super(MyGRU, self).__init__()

        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

    def forward(self, x): 
        gru_output, h_n = self.gru(x)

        if self.gru.bidirectional:
            h_final = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_final = h_n[-1]

        h_final = self.fc_dropout(h_final)
        out = self.fc(h_final)
        return out

In [None]:
# C-LSTM: https://arxiv.org/pdf/1511.08630

class HybridNN(nn.Module):
    def __init__(self, embedding_dim=768, conv_out_channels=256, kernel_size=3, 
                 hidden_dim=256, output_dim=6, lstm_layers=2, bidirectional=True, 
                 dropout=0.4, fc_dropout=0.2, input_dropout=0.1):
        super(HybridNN, self).__init__()

        self.input_dropout = nn.Dropout(input_dropout)

        # CNN block
        self.conv1 = nn.Conv1d(in_channels=embedding_dim,
                               out_channels=conv_out_channels,
                               kernel_size=kernel_size,
                               padding=1)
        self.conv2 = nn.Conv1d(in_channels=conv_out_channels,
                               out_channels=conv_out_channels,
                               kernel_size=kernel_size,
                               padding=1)

        self.spatial_dropout = nn.Dropout2d(0.3)
        self.relu = nn.ReLU()

        # LSTM
        self.lstm = nn.LSTM(input_size=conv_out_channels,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if lstm_layers > 1 else 0,
                            batch_first=True)

        self.attn_linear = nn.Linear(hidden_dim * (2 if bidirectional else 1), 1)

        # Output
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

    def forward(self, embeddings):
        # Dropout input
        x = self.input_dropout(embeddings)  # [batch_size, seq_len, embedding_dim]
        # Conv1D
        x = x.permute(0, 2, 1)                 # [B, seq, emb]
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.permute(0, 2, 1)                 # LSTM
        x = self.spatial_dropout(x) 


        # LSTM
        lstm_out, _ = self.lstm(x)  # [batch_size, seq_len, hidden_dim*2]

        #attention mechanism
        attn_weights = torch.softmax(self.attn_linear(lstm_out), dim=1)  # [batch_size, seq_len, 1]
        attn_output = torch.sum(lstm_out * attn_weights, dim=1)  # [batch_size, hidden_dim*2]

        # FC
        out = self.fc_dropout(attn_output)
        out = self.fc(out)

        return out

In [None]:
from transformers import AutoModel

bert = AutoModel.from_pretrained("bert-base-uncased").to(DEVICE)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(train.label), y=train.label)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

In [None]:
class_weights

In [None]:
# class_weights[5] = class_weights[5] * 1.15

In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

loss_fn = nn.CrossEntropyLoss(weight=class_weights)
# VOCAB_SIZE = len(tokenizer)

gru_model = MyGRU(embedding_dim=768, hidden_dim=128, output_dim=6, num_layers=5, dropout=0.3,fc_dropout=0.2, bidirectional=True).to(DEVICE)

gru_optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
gru_scheduler = ReduceLROnPlateau(gru_optimizer, patience=3, factor=0.1)

In [None]:
# class_weights[5] = class_weights[5] * 1.1

In [None]:
lstm_model = MyLSTM(embedding_dim=768, hidden_dim=128, output_dim=6, num_layers=2, dropout=0.5, bidirectional=True).to(DEVICE)

lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001, weight_decay=0.0001)
# lstm_scheduler = ReduceLROnPlateau(lstm_optimizer, patience=3, factor=1e-2)
lstm_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(lstm_optimizer, T_0=5, T_mult=2)


In [None]:
hybrid_model = HybridNN(embedding_dim=768, conv_out_channels=128, kernel_size=4, 
                 hidden_dim=128, output_dim=6, lstm_layers=2, bidirectional=True, 
                 dropout=0.4, fc_dropout=0.2, input_dropout=0.1).to(DEVICE)

hybrid_optimizer = optim.Adam(hybrid_model.parameters(), lr=0.001, weight_decay=0.0001)
hybrid_scheduler = ReduceLROnPlateau(hybrid_optimizer, patience=3, factor=1e-2)

In [None]:
import copy

class EarlyStopper:
    def __init__(self, model, patience=3, min_delta=0.01, restore_best_weights=True, save_weights=False):
        self.model = model
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')
        self.best_weights = None
        self.restore_best_weights = restore_best_weights
        self.save_weights = save_weights

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss - self.min_delta:
            self.min_validation_loss = validation_loss
            self.counter = 0
            if self.restore_best_weights:
                self.best_weights = copy.deepcopy(self.model.state_dict())

        else:
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights and self.best_weights is not None:
                    self.model.load_state_dict(self.best_weights)
                    if self.save_weights:
                        torch.save(self.best_weights, f'{self.model.__class__.__name__}_best_weights.pt')
                        print(f"Weights has been saved")
                return True

        return False

# source: https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BertLSTMConvHybrid(nn.Module):
    def __init__(self, embedding_dim=768, lstm_hidden_dim=128, conv_out_dim=128,
                 num_classes=6, lstm_layers=1, dropout=0.5):
        super(BertLSTMConvHybrid, self).__init__()
        
        self.lstm1 = nn.LSTM(embedding_dim, lstm_hidden_dim, num_layers=lstm_layers,
                             dropout=dropout if lstm_layers > 1 else 0,
                             bidirectional=True, batch_first=True)
        
        self.conv1 = nn.Conv1d(in_channels=2 * lstm_hidden_dim, out_channels=conv_out_dim,
                               kernel_size=10, stride=3)

        self.dropout1 = nn.Dropout(dropout)
        
        self.lstm2 = nn.LSTM(conv_out_dim, lstm_hidden_dim, num_layers=lstm_layers,
                             dropout=dropout if lstm_layers > 1 else 0,
                             bidirectional=True, batch_first=True)
        
        self.conv2 = nn.Conv1d(in_channels=2 * lstm_hidden_dim, out_channels=conv_out_dim,
                               kernel_size=10, stride=3)

        self.dropout2 = nn.Dropout(dropout)

        self.global_max_pool = nn.AdaptiveMaxPool1d(1)

        self.fc = nn.Linear(conv_out_dim, num_classes)

    def forward(self, x):  # x = BERT embeddings [batch, seq_len, 768]
        x, _ = self.lstm1(x)  # → [batch, seq_len, 2*hidden]
        x = self.dropout1(x)

        x = x.permute(0, 2, 1)  # → [batch, channels, seq_len]
        x = F.relu(self.conv1(x))  # → conv1
        x = x.permute(0, 2, 1)  # → back to [batch, seq_len, channels]

        x, _ = self.lstm2(x)
        x = self.dropout2(x)

        x = x.permute(0, 2, 1)
        x = F.relu(self.conv2(x))  # → conv2

        x = self.global_max_pool(x)  # → [batch, conv_out_dim, 1]
        x = x.squeeze(-1)

        x = self.fc(x)
        return x

In [65]:
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from sklearn.metrics import classification_report, accuracy_score

# === UNFREEZE ===
for name, param in bert.named_parameters():
    if "encoder.layer.10" in name or "encoder.layer.11" in name or "pooler" in name:
        param.requires_grad = True

# === FOCAL LOSS ===
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)
        loss = ((1 - pt) ** self.gamma) * ce_loss
        return loss.mean()

loss_fn = FocalLoss(alpha=class_weights, gamma=2.0)

# === SETUP ===
EPOCHS = 25
lstm_model = MyLSTM(embedding_dim=768, hidden_dim=128, output_dim=6, num_layers=2, dropout=0.5, bidirectional=True, fc_dropout=0.3, input_dropout=0.2).to(DEVICE)
MODEL = BertLSTMConvHybrid(
    embedding_dim=768,
    lstm_hidden_dim=128,
    conv_out_dim=128,
    num_classes=6,
    lstm_layers=1,
    dropout=0.5
).to(DEVICE)

OPTIMIZER = AdamW(list(MODEL.parameters()) + list(bert.parameters()), lr=2e-5, weight_decay=1e-4)
SCHEDULER = CosineAnnealingWarmRestarts(OPTIMIZER, T_0=5, T_mult=2)
early_stopper = EarlyStopper(patience=4, model=MODEL, min_delta=0.01, save_weights=False)

lr_history = []

# === TRAINING LOOP ===
for epoch in range(EPOCHS):
    MODEL.train()
    bert.train()
    running_loss = 0.0

    for batch in train_loader:
        OPTIMIZER.zero_grad()

        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # BERT embedding
        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_output.last_hidden_state

        outputs = MODEL(embeddings)
        loss = loss_fn(outputs, labels)

        loss.backward()
        OPTIMIZER.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)

    # === VALIDATION ===
    MODEL.eval()
    bert.eval()
    val_preds = []
    val_labels = []
    val_loss_total = 0.0

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state

            outputs = MODEL(embeddings)
            loss = loss_fn(outputs, labels)

            val_loss_total += loss.item()
            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss_total / len(validation_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    SCHEDULER.step(epoch)  # cosine uses epoch, not val_loss
    current_lr = OPTIMIZER.param_groups[0]['lr']
    lr_history.append(current_lr)

    if early_stopper.early_stop(avg_val_loss):
        print(f"🚨 Early stopping at epoch {epoch+1}")
        break

    print(f"🌀 Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

# === FINAL REPORT ===
class_report = classification_report(val_labels, val_preds, target_names=[str(i) for i in range(6)])
print(f"\n📊 Classification Report:\n{class_report}")
print("✅ Train finished")

🌀 Epoch 1/25 | Train Loss: 0.7673 | Val Loss: 0.3724 | Val Acc: 0.7462
🌀 Epoch 2/25 | Train Loss: 0.3147 | Val Loss: 0.3488 | Val Acc: 0.7738
🌀 Epoch 3/25 | Train Loss: 0.2432 | Val Loss: 0.3327 | Val Acc: 0.7903
🌀 Epoch 4/25 | Train Loss: 0.1745 | Val Loss: 0.3679 | Val Acc: 0.8023
🌀 Epoch 5/25 | Train Loss: 0.1239 | Val Loss: 0.3915 | Val Acc: 0.8108
🌀 Epoch 6/25 | Train Loss: 0.0955 | Val Loss: 0.4035 | Val Acc: 0.8103
🚨 Early stopping at epoch 7

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       274
           1       0.68      0.76      0.72       212
           2       0.91      0.80      0.85       703
           3       0.53      0.80      0.64       178
           4       0.94      0.80      0.86       550
           5       0.58      0.88      0.70        81

    accuracy                           0.80      1998
   macro avg       0.73      0.81      0.76      1998
weighted avg       0.83      0.8

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

EPOCHS = 25
MODEL = lstm_model
OPTIMIZER = lstm_optimizer
SCHEDULER = lstm_scheduler
lr_history = []
early_stopper = EarlyStopper(patience=4, model=MODEL, min_delta=0.01, save_weights=False)

for epoch in range(EPOCHS):
    MODEL.train()
    running_loss = 0.0

    for batch in train_loader:
        OPTIMIZER.zero_grad() # replace

        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        with torch.no_grad():  # BERT embed
            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state  # [batch_size, seq_len, 768]

        outputs = MODEL(embeddings) #replace
        loss = loss_fn(outputs, labels)

        loss.backward()
        OPTIMIZER.step() #replace

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)

    # === VALIDATION ===
    MODEL.eval() #replace
    val_preds = []
    val_labels = []
    val_loss_total = 0.0

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state

            outputs = MODEL(embeddings) #replace
            loss = loss_fn(outputs, labels)

            val_loss_total += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss_total / len(validation_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    SCHEDULER.step(avg_val_loss) # replace 
    current_lr = OPTIMIZER.param_groups[0]['lr'] #replace
    lr_history.append(current_lr)

    if early_stopper.early_stop(avg_val_loss):             
        print(f"Early stopping at epoch {epoch+1}")
        break


    print(f"Epoch {epoch+1}/{EPOCHS} — Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

class_report = classification_report(val_labels, val_preds, target_names=[str(i) for i in range(6)])
print(f"Classification Report:\n{class_report}")

print("Train finished")

In [None]:
# from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score

# EPOCHS = 25
# MODEL = hybrid_model
# OPTIMIZER = hybrid_optimizer
# SCHEDULER = hybrid_scheduler
# lr_history = []
# early_stopper = EarlyStopper(patience=4, model=MODEL, min_delta=0.01, save_weights=False)

# for epoch in range(EPOCHS):
#     MODEL.train()
#     bert.train()
#     running_loss = 0.0

#     for batch in train_loader:
#         OPTIMIZER.zero_grad() # replace

#         input_ids = batch['input_ids'].to(DEVICE)
#         attention_mask = batch['attention_mask'].to(DEVICE)
#         labels = batch['labels'].to(DEVICE)

#         bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
#         embeddings = bert_output.last_hidden_state

#         outputs = MODEL(embeddings) #replace
#         loss = loss_fn(outputs, labels)

#         loss.backward()
#         OPTIMIZER.step() #replace

#         running_loss += loss.item()

#     avg_train_loss = running_loss / len(train_loader)

#     # === VALIDATION ===
#     MODEL.eval() #replace
#     val_preds = []
#     val_labels = []
#     val_loss_total = 0.0

#     with torch.no_grad():
#         for batch in validation_loader:
#             input_ids = batch['input_ids'].to(DEVICE)
#             attention_mask = batch['attention_mask'].to(DEVICE)
#             labels = batch['labels'].to(DEVICE)

#             bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
#             embeddings = bert_output.last_hidden_state

#             outputs = MODEL(embeddings) #replace
#             loss = loss_fn(outputs, labels)

#             val_loss_total += loss.item()

#             preds = torch.argmax(outputs, dim=1)
#             val_preds.extend(preds.cpu().numpy())
#             val_labels.extend(labels.cpu().numpy())

#     avg_val_loss = val_loss_total / len(validation_loader)
#     val_acc = accuracy_score(val_labels, val_preds)

#     SCHEDULER.step(avg_val_loss) # replace 
#     current_lr = OPTIMIZER.param_groups[0]['lr'] #replace
#     lr_history.append(current_lr)

#     if early_stopper.early_stop(avg_val_loss):             
#         print(f"Early stopping at epoch {epoch+1}")
#         break


#     print(f"Epoch {epoch+1}/{EPOCHS} — Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

# class_report = classification_report(val_labels, val_preds, target_names=[str(i) for i in range(6)])
# print(f"Classification Report:\n{class_report}")

# print("Train finished")

In [66]:
if early_stopper.best_weights is not None:
    MODEL.load_state_dict(early_stopper.best_weights)
    print("Best weights loaded after training")
else:
    print("No best weights were saved")

Best weights loaded after training


In [67]:
MODEL.load_state_dict(early_stopper.best_weights)
MODEL.eval()

test_preds = []
test_labels = []
test_loss_total = 0.0

with torch.no_grad():

    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_output.last_hidden_state

        outputs = MODEL(embeddings)
        loss = loss_fn(outputs, labels)
        test_loss_total += loss.item()


        preds = torch.argmax(outputs, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())



avg_test_loss = test_loss_total / len(test_loader)
test_acc = accuracy_score(test_labels, test_preds)

print(f"Validation Loss: {avg_test_loss:.4f} | Validation Accuracy: {test_acc:.4f}")

Validation Loss: 0.4016 | Validation Accuracy: 0.7800


In [None]:
plt.plot(range(1, len(lr_history) + 1), lr_history)
plt.xlabel("Epoch")
plt.ylabel("Learning Rate")
plt.title("Learning Rate Schedule")
plt.grid(True)
plt.show()