## Use Case Focus

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import numpy as np
from datetime import datetime
import itertools
import os

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Constants and model list
MODEL_NAMES = [
    "albert/albert-base-v2",
    "albert/albert-large-v2",
    "microsoft/deberta-v3-large",
    # "microsoft/deberta-v2-xlarge",
    "FacebookAI/roberta-large",
    "google-bert/bert-base-uncased",
    "ProsusAI/finbert",
    "PHILIPPUNI/distilbert-amazon-software-reviews-finetuned",
    "justinlamlamlam/softwareengineering",
    "answerdotai/ModernBERT-large",
    "milyiyo/distilbert-base-uncased-finetuned-amazon-review"
]
LABEL_COLUMN = "label"

# Hyperparameter grid
EPOCHS_LIST = [1, 3, 4, 9, 12]
BATCH_SIZES = [8]
N_SPLITS_LIST = [5, 10]

# Load data
data = pd.read_csv("type_val_single.csv")
label_encoder = LabelEncoder()
data[LABEL_COLUMN] = label_encoder.fit_transform(data[LABEL_COLUMN])
texts = data["sentence"].tolist()
labels = data[LABEL_COLUMN].tolist()

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Training loop
def train_model(model, train_loader, optimizer, criterion, epochs, fold):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)
            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

# Evaluation
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)
            preds = model(**inputs).logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

# Generate folds once and reuse them
def generate_kfold_splits(texts, labels, n_splits):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    return list(skf.split(texts, labels))

# Save best result to file
def save_best_result(log_path, header, result, best_combo_summary):
    with open(log_path, "w") as f:
        f.write(header + "\n\n")
        f.write(result + "\n")
        f.write("\n===== BEST COMBINATION =====\n")
        f.write(best_combo_summary + "\n")

# Begin experiment
for n_splits in N_SPLITS_LIST:
    folds = generate_kfold_splits(texts, labels, n_splits)

    for model_name in MODEL_NAMES:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        safe_model_name = model_name.replace('/', '-')
        log_file = f"{safe_model_name}_{LABEL_COLUMN}.txt"

        best_f1 = 0
        best_result = ""
        best_combo = ""

        for epochs, batch_size in itertools.product(EPOCHS_LIST, BATCH_SIZES):
            print(f"\nModel: {model_name} | EPOCHS={epochs}, BATCH_SIZE={batch_size}, K-FOLD={n_splits}")
            all_accuracies, all_precisions, all_recalls, all_f1s = [], [], [], []

            for fold, (train_idx, val_idx) in enumerate(folds):
                train_texts = [texts[i] for i in train_idx]
                val_texts = [texts[i] for i in val_idx]
                train_labels = [labels[i] for i in train_idx]
                val_labels = [labels[i] for i in val_idx]

                train_dataset = TextDataset(train_texts, train_labels, tokenizer)
                val_dataset = TextDataset(val_texts, val_labels, tokenizer)

                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=len(set(labels)),
                    ignore_mismatched_sizes=True
                ).to(device)

                optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
                criterion = torch.nn.CrossEntropyLoss()

                train_model(model, train_loader, optimizer, criterion, epochs, fold)
                y_true, y_pred = evaluate_model(model, val_loader)

                acc = accuracy_score(y_true, y_pred)
                prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

                all_accuracies.append(acc)
                all_precisions.append(prec)
                all_recalls.append(rec)
                all_f1s.append(f1)

            avg_accuracy = np.mean(all_accuracies)
            avg_precision = np.mean(all_precisions)
            avg_recall = np.mean(all_recalls)
            avg_f1 = np.mean(all_f1s)

            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_combo = f"Model: {model_name}\nLabel Column: {LABEL_COLUMN}\nEpochs: {epochs}, Batch Size: {batch_size}, K-Fold: {n_splits}"
                best_result = (
                    f"Accuracy: {avg_accuracy:.4f}\n"
                    f"Precision: {avg_precision:.4f}\n"
                    f"Recall: {avg_recall:.4f}\n"
                    f"F1-Score: {avg_f1:.4f}"
                )

        # Save best result to file
        header = f"Best Hyperparameter Combination for {model_name} on {LABEL_COLUMN}"
        save_best_result(log_file, header, best_result, best_combo)
        print(f"\nBest combination saved to {log_file}")



Model: albert/albert-base-v2 | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:02<00:00,  7.45it/s, loss=1.15]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.99it/s, loss=1.25]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/a


Model: albert/albert-base-v2 | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.92it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  9.07it/s, loss=0.965]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  9.03it/s, loss=0.817]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.95it/s, loss=1.07]
Fold 2 Epoch 2: 100%|██████


Model: albert/albert-base-v2 | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  9.02it/s, loss=1.07]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  9.03it/s, loss=0.931]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  9.03it/s, loss=0.798]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.96it/s, loss=0.649]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|█████


Model: albert/albert-base-v2 | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  9.07it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  9.01it/s, loss=0.955]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.99it/s, loss=0.726]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  9.10it/s, loss=0.555]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  9.00it/s, loss=0.44] 
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  9.01it/s, loss=0.326]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:01<00:00,  9.00it/s, loss=0.275]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:01<00:00,  8.99it/s, loss=0.179]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:01<00:00,  9.00it/s, loss=0.143]
  _warn_prf(averag


Model: albert/albert-base-v2 | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.94it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  9.02it/s, loss=1.03] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.98it/s, loss=1.32]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.97it/s, loss=1.14]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.96it/s, loss=1.03] 
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  9.04it/s, loss=0.985]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:01<00:00,  8.98it/s, loss=0.928]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:01<00:00,  9.00it/s, loss=0.722]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:01<00:00,  9.00it/s, loss=0.594]
Fold 1 Epoch 10: 100


Best combination saved to albert-albert-base-v2_label.txt

Model: albert/albert-large-v2 | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.26it/s, loss=1.11]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert


Model: albert/albert-large-v2 | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.27it/s, loss=1.13]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.943]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.954]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.26it/s, loss=1.22]
Fold 2 Epoch 2: 100%|████


Model: albert/albert-large-v2 | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=1.1] 
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.26it/s, loss=0.991]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.894]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.732]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|███


Model: albert/albert-large-v2 | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.27it/s, loss=0.979]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.83] 
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.645]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.497]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:04<00:00,  3.17it/s, loss=0.409]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:05<00:00,  2.76it/s, loss=0.355]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:05<00:00,  2.65it/s, loss=0.402]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:04<00:00,  3.26it/s, loss=0.302]
  _warn_prf(avera


Model: albert/albert-large-v2 | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=0.99] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.997]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.92] 
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=0.82] 
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:04<00:00,  3.23it/s, loss=0.681]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.575]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:04<00:00,  3.25it/s, loss=0.463]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:04<00:00,  3.24it/s, loss=0.447]
Fold 1 Epoch 10: 


Best combination saved to albert-albert-large-v2_label.txt





Model: microsoft/deberta-v3-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:07<00:00,  2.10it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:06<00:00,  2.20it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
So


Model: microsoft/deberta-v3-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:07<00:00,  2.11it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:06<00:00,  2.19it/s, loss=1.03]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:06<00:00,  2.21it/s, loss=0.928]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: microsoft/deberta-v3-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:06<00:00,  2.16it/s, loss=1.4] 
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:06<00:00,  2.17it/s, loss=1.06]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:06<00:00,  2.17it/s, loss=0.956]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:06<00:00,  2.17it/s, loss=0.643]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this mod


Model: microsoft/deberta-v3-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:07<00:00,  2.05it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:07<00:00,  2.02it/s, loss=1.01] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:07<00:00,  2.03it/s, loss=0.936]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:07<00:00,  2.02it/s, loss=0.673]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:07<00:00,  2.03it/s, loss=0.516]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:07<00:00,  2.02it/s, loss=0.301]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:07<00:00,  2.03it/s, loss=0.178]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:07<00:00,  2.01it/s, loss=0.137]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00


Model: microsoft/deberta-v3-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s, loss=1.26]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:06<00:00,  2.14it/s, loss=1.06] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s, loss=0.954]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s, loss=0.634]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:07<00:00,  2.13it/s, loss=0.462]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s, loss=0.287]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:06<00:00,  2.15it/s, loss=0.196]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:07<00:00,  2.14it/s, loss=0.171]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00


Best combination saved to microsoft-deberta-v3-large_label.txt

Model: FacebookAI/roberta-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.83it/s, loss=1.28]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.85it/s, loss=1.35]
  _warn_prf(average, modifier, 


Model: FacebookAI/roberta-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.91it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.90it/s, loss=1.08] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.96it/s, loss=1.06]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be a


Model: FacebookAI/roberta-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.08it/s, loss=1.13]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.10it/s, loss=1.04]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:04<00:00,  3.07it/s, loss=0.962]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.


Model: FacebookAI/roberta-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.22]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.08]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.04]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.1] 
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:04<00:00,  3.10it/s, loss=1.08]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:04<00:00,  3.10it/s, loss=1.15]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:04<00:00,  3.09it/s, loss=1.05]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:04<00:00,  3.10it/s, loss=1.04]
Fold 1 Epoch 9: 100%|██████


Model: FacebookAI/roberta-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.87it/s, loss=1.19]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.85it/s, loss=1.07]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.86it/s, loss=0.986]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:05<00:00,  2.86it/s, loss=0.951]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:05<00:00,  2.86it/s, loss=0.798]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:05<00:00,  2.84it/s, loss=0.699]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:05<00:00,  2.85it/s, loss=0.483]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:05<00:00,  2.86it/s, loss=0.58] 
Fold 1 Epoch 9: 100%|


Best combination saved to FacebookAI-roberta-large_label.txt

Model: google-bert/bert-base-uncased | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.37it/s, loss=1.23]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.39it/s, loss=1.22]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a


Model: google-bert/bert-base-uncased | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.42it/s, loss=1.25]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.36it/s, loss=1.02] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.42it/s, loss=0.84] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.41it/s, loss=1.18]
Fold 2 Epoch 2:


Model: google-bert/bert-base-uncased | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.60it/s, loss=1.27]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.66it/s, loss=1.1] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.60it/s, loss=0.963]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.15it/s, loss=0.887]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1:


Model: google-bert/bert-base-uncased | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.45it/s, loss=1.2] 
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.57it/s, loss=1.07]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.60it/s, loss=0.941]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.59it/s, loss=0.796]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.51it/s, loss=0.61] 
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  8.57it/s, loss=0.489]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:01<00:00,  8.62it/s, loss=0.374]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:01<00:00,  8.61it/s, loss=0.285]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:01<00:00,  8.56it/s, loss=0.202]
  _warn_prf(a


Model: google-bert/bert-base-uncased | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.44it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.53it/s, loss=1.01]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.45it/s, loss=0.898]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.44it/s, loss=0.73] 
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.51it/s, loss=0.572]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  8.53it/s, loss=0.418]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:01<00:00,  8.49it/s, loss=0.285]
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:01<00:00,  8.63it/s, loss=0.194]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:01<00:00,  8.46it/s, loss=0.151]
Fold 1 Epoch 


Best combination saved to google-bert-bert-base-uncased_label.txt

Model: ProsusAI/finbert | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.64it/s, loss=1.22]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.S


Model: ProsusAI/finbert | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.45it/s, loss=1.26]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.60it/s, loss=0.966]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.64it/s, loss=0.798]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classif


Model: ProsusAI/finbert | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.43it/s, loss=1.12]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.43it/s, loss=0.987]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.52it/s, loss=0.775]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.51it/s, loss=0.56] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/f


Model: ProsusAI/finbert | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.33it/s, loss=1.26]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.47it/s, loss=0.946]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.53it/s, loss=0.759]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.44it/s, loss=0.566]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.49it/s, loss=0.411]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  8.57it/s, loss=0.343]
Fold 1 Epoch 7: 100%|██████


Model: ProsusAI/finbert | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.45it/s, loss=1.08]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.51it/s, loss=0.956]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.46it/s, loss=0.828]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.58it/s, loss=0.632]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.53it/s, loss=0.431]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  8.50it/s, loss=0.31] 
Fold 1 Epoch 7: 100%|██████


Best combination saved to ProsusAI-finbert_label.txt

Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.51it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and to


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.80it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 16.92it/s, loss=0.953]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 16.72it/s, loss=0.801]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-softwa


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.93it/s, loss=1.3] 
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 16.63it/s, loss=1]    
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 16.91it/s, loss=0.855]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:00<00:00, 16.89it/s, loss=0.729]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not 


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 15.44it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 15.23it/s, loss=0.935]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 15.38it/s, loss=0.838]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:00<00:00, 15.38it/s, loss=0.644]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:00<00:00, 15.50it/s, loss=0.552]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:00<00:00, 15.38


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00, 14.89it/s, loss=1.27]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 15.15it/s, loss=0.97]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00, 14.95it/s, loss=0.802]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00, 14.92it/s, loss=0.68] 
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:00<00:00, 15.24it/s, loss=0.548]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:00<00:00, 15.02i


Best combination saved to PHILIPPUNI-distilbert-amazon-software-reviews-finetuned_label.txt

Model: justinlamlamlam/softwareengineering | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.22it/s, loss=1.3] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier


Model: justinlamlamlam/softwareengineering | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.65it/s, loss=1.26]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.68it/s, loss=0.995]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.57it/s, loss=0.928]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized becau


Model: justinlamlamlam/softwareengineering | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.57it/s, loss=1.33]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.55it/s, loss=1.02]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.61it/s, loss=0.888]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.56it/s, loss=0.7]  
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkp


Model: justinlamlamlam/softwareengineering | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  8.60it/s, loss=1.36]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  8.52it/s, loss=1.06]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  8.53it/s, loss=0.969]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  8.55it/s, loss=0.867]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  8.44it/s, loss=0.702]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  8.44it/s, loss=0.522]
Fold 1 Ep


Model: justinlamlamlam/softwareengineering | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:01<00:00,  9.40it/s, loss=1.28]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:01<00:00,  9.28it/s, loss=1.04] 
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:01<00:00,  9.44it/s, loss=0.985]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:01<00:00,  9.32it/s, loss=0.727]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:01<00:00,  9.30it/s, loss=0.525]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:01<00:00,  9.43it/s, loss=0.405]
Fold 1 E


Best combination saved to justinlamlamlam-softwareengineering_label.txt

Model: answerdotai/ModernBERT-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:08<00:00,  1.70it/s, loss=1.06]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.72it/s, loss=1.21]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the mo


Model: answerdotai/ModernBERT-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.73it/s, loss=1.2] 
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.62it/s, loss=0.621]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.65it/s, loss=0.269]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.70it/s, loss=1.13]
Fold 


Model: answerdotai/ModernBERT-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.71it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.72it/s, loss=0.715]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.72it/s, loss=0.353]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:05<00:00,  2.71it/s, loss=0.108]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold


Model: answerdotai/ModernBERT-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.88it/s, loss=1.08]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.569]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.213]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:05<00:00,  2.88it/s, loss=0.172]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.0554]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.0285]
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:05<00:00,  2.90it/s, loss=0.0249] 
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.0274] 
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=0.0175] 


Model: answerdotai/ModernBERT-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.89it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.90it/s, loss=0.521]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.91it/s, loss=0.171]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:05<00:00,  2.90it/s, loss=0.0653]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:05<00:00,  2.92it/s, loss=0.0243]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:05<00:00,  2.91it/s, loss=0.0375] 
Fold 1 Epoch 7: 100%|██████████| 15/15 [00:05<00:00,  2.91it/s, loss=0.0303] 
Fold 1 Epoch 8: 100%|██████████| 15/15 [00:05<00:00,  2.90it/s, loss=0.0199]
Fold 1 Epoch 9: 100%|██████████| 15/15 [00:05<00:00,  2.91it/s, loss=0.02]  


Best combination saved to answerdotai-ModernBERT-large_label.txt

Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=1, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.52it/s, loss=1.3] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.69it/s, loss=1.25]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 16.79it/s, loss=0.975]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 17.10it/s, loss=0.787]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-fin


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=4, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.75it/s, loss=1.23]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 16.93it/s, loss=1.01]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 16.93it/s, loss=0.842]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:00<00:00, 17.10it/s, loss=0.74] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not i


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=9, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.92it/s, loss=1.23]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 16.97it/s, loss=0.976]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 16.82it/s, loss=0.845]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:00<00:00, 17.01it/s, loss=0.648]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:00<00:00, 17.00it/s, loss=0.474]
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:00<00:00, 17.06


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=12, BATCH_SIZE=8, K-FOLD=5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 15/15 [00:00<00:00, 16.88it/s, loss=1.21]
Fold 1 Epoch 2: 100%|██████████| 15/15 [00:00<00:00, 17.23it/s, loss=0.952]
Fold 1 Epoch 3: 100%|██████████| 15/15 [00:00<00:00, 16.95it/s, loss=0.821]
Fold 1 Epoch 4: 100%|██████████| 15/15 [00:00<00:00, 17.08it/s, loss=0.671]
Fold 1 Epoch 5: 100%|██████████| 15/15 [00:00<00:00, 16.87it/s, loss=0.53] 
Fold 1 Epoch 6: 100%|██████████| 15/15 [00:00<00:00, 17.35


Best combination saved to milyiyo-distilbert-base-uncased-finetuned-amazon-review_label.txt

Model: albert/albert-base-v2 | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.88it/s, loss=1.08]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.95it/s, loss=1.11]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/a


Model: albert/albert-base-v2 | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.98it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  8.92it/s, loss=1.09]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.03it/s, loss=1.03]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.95it/s, loss=1.1] 
Fold 2 Epoch 2: 100%|████████


Model: albert/albert-base-v2 | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.90it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  8.95it/s, loss=0.926]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  8.98it/s, loss=0.807]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.01it/s, loss=0.61] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|█████


Model: albert/albert-base-v2 | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  8.97it/s, loss=1.2] 
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.00it/s, loss=1.09]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  8.93it/s, loss=1.03]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.02it/s, loss=1.08]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  8.92it/s, loss=0.975]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  8.91it/s, loss=1.06] 
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:01<00:00,  8.94it/s, loss=0.981]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:01<00:00,  9.02it/s, loss=1]    
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:01<00:00,  8.89it/s, loss=0.991]
  _warn_prf(average, 


Model: albert/albert-base-v2 | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:02<00:00,  5.69it/s, loss=1.12]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:02<00:00,  5.67it/s, loss=0.899]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:02<00:00,  5.67it/s, loss=0.812]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:02<00:00,  5.70it/s, loss=0.722]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:02<00:00,  5.70it/s, loss=0.608]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:02<00:00,  5.73it/s, loss=0.513]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:02<00:00,  5.81it/s, loss=0.378]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:02<00:00,  5.75it/s, loss=0.3]  
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:02<00:00,  5.68it/s, loss=0.23] 
Fold 1 Epoch 10: 1


Best combination saved to albert-albert-base-v2_label.txt

Model: albert/albert-large-v2 | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.18it/s, loss=1.21]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.17it/s, loss=1.09]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert


Model: albert/albert-large-v2 | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.17it/s, loss=1.19]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:05<00:00,  3.19it/s, loss=1.01] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:05<00:00,  3.18it/s, loss=0.956]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.18it/s, loss=1.15]
Fold 2 Epoch 2: 100%|████


Model: albert/albert-large-v2 | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:07<00:00,  2.25it/s, loss=1.21]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:07<00:00,  2.14it/s, loss=0.94] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=1.21]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:07<00:00,  2.22it/s, loss=1.16]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|█████


Model: albert/albert-large-v2 | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:08<00:00,  2.11it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:08<00:00,  2.11it/s, loss=1.12]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:08<00:00,  2.10it/s, loss=1.01]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:07<00:00,  2.13it/s, loss=1.06]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:08<00:00,  2.09it/s, loss=0.999]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:08<00:00,  2.10it/s, loss=0.977]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:08<00:00,  2.10it/s, loss=0.901]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:08<00:00,  2.10it/s, loss=0.88] 
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:08<00:00,  2.12it/s, loss=0.799]
  _warn_prf(average,


Model: albert/albert-large-v2 | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:08<00:00,  2.08it/s, loss=1.18]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:08<00:00,  2.12it/s, loss=0.992]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:08<00:00,  2.10it/s, loss=0.788]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:08<00:00,  2.11it/s, loss=0.713]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:07<00:00,  2.13it/s, loss=0.674]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:08<00:00,  2.11it/s, loss=0.554]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:08<00:00,  2.08it/s, loss=0.44] 
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:08<00:00,  2.07it/s, loss=0.352]
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:08<00:00,  2.06it/s, loss=0.33] 
Fold 1 Epoch 10: 


Best combination saved to albert-albert-large-v2_label.txt





Model: microsoft/deberta-v3-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=1.1] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=1.16]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
So


Model: microsoft/deberta-v3-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.50it/s, loss=1.25]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:09<00:00,  1.76it/s, loss=1]    
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:08<00:00,  2.11it/s, loss=0.984]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference


Model: microsoft/deberta-v3-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.47it/s, loss=1.14]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:11<00:00,  1.48it/s, loss=0.944]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:11<00:00,  1.48it/s, loss=0.77] 
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:11<00:00,  1.46it/s, loss=0.648]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this mo


Model: microsoft/deberta-v3-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.50it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:11<00:00,  1.50it/s, loss=0.996]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=0.814]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=0.613]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=0.454]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:11<00:00,  1.51it/s, loss=0.29] 
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:11<00:00,  1.50it/s, loss=0.177]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:11<00:00,  1.50it/s, loss=0.137]
Fold 1 Epoch 9: 100%|██████████| 17/17 [00


Model: microsoft/deberta-v3-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:11<00:00,  1.42it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:11<00:00,  1.44it/s, loss=1.05] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:11<00:00,  1.43it/s, loss=0.946]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:11<00:00,  1.44it/s, loss=0.736]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:11<00:00,  1.43it/s, loss=0.56] 
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:11<00:00,  1.43it/s, loss=0.28] 
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:11<00:00,  1.43it/s, loss=0.252]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:11<00:00,  1.44it/s, loss=0.116]
Fold 1 Epoch 9: 100%|██████████| 17/17 [00


Best combination saved to microsoft-deberta-v3-large_label.txt

Model: FacebookAI/roberta-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:08<00:00,  2.02it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:08<00:00,  2.01it/s, loss=1.16]
  _warn_prf(average, modifier, 


Model: FacebookAI/roberta-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:07<00:00,  2.16it/s, loss=1.31]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:07<00:00,  2.21it/s, loss=1.13]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.09]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be ab


Model: FacebookAI/roberta-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:07<00:00,  2.16it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=1.12]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:07<00:00,  2.16it/s, loss=0.936]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=0.861]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj


Model: FacebookAI/roberta-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.32]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.12]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.06]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:07<00:00,  2.16it/s, loss=1.03] 
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:07<00:00,  2.19it/s, loss=0.879]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:07<00:00,  2.15it/s, loss=0.818]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=0.619]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=0.546]
Fold 1 Epoch 9: 100%|█


Model: FacebookAI/roberta-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:07<00:00,  2.15it/s, loss=1.22]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.2] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:07<00:00,  2.16it/s, loss=1.19]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=1.06]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=1.11] 
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:07<00:00,  2.20it/s, loss=0.975]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:07<00:00,  2.17it/s, loss=0.898]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:07<00:00,  2.18it/s, loss=0.853]
Fold 1 Epoch 9: 100%|██


Best combination saved to FacebookAI-roberta-large_label.txt

Model: google-bert/bert-base-uncased | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.43it/s, loss=1.29]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.46it/s, loss=1.13]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a


Model: google-bert/bert-base-uncased | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.35it/s, loss=1.24]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.45it/s, loss=0.961]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.45it/s, loss=0.779]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.42it/s, loss=1.25]
Fold 2 Epoch 2:


Model: google-bert/bert-base-uncased | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.43it/s, loss=1.16]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.934]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.40it/s, loss=0.822]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.52it/s, loss=0.643]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1


Model: google-bert/bert-base-uncased | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.46it/s, loss=1.18]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.56it/s, loss=0.967]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.45it/s, loss=0.806]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.58it/s, loss=0.631]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.48it/s, loss=0.523]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=0.418]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:01<00:00,  9.43it/s, loss=0.334]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:01<00:00,  9.55it/s, loss=0.263]
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:01<00:00,  9.48it/s, loss=0.224]
  _warn_prf(


Model: google-bert/bert-base-uncased | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.54it/s, loss=1.23]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=1.04]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.56it/s, loss=0.903]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=0.778]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.68] 
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.68it/s, loss=0.575]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:01<00:00,  9.54it/s, loss=0.475]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:01<00:00,  9.68it/s, loss=0.419]
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.4]  
Fold 1 Epoch 


Best combination saved to google-bert-bert-base-uncased_label.txt

Model: ProsusAI/finbert | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.62it/s, loss=1.25]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.S


Model: ProsusAI/finbert | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.51it/s, loss=1.15]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.69it/s, loss=0.9]  
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.55it/s, loss=0.728]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classif


Model: ProsusAI/finbert | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.61it/s, loss=1.02]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.68it/s, loss=0.928]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=0.76] 
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.72it/s, loss=0.637]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/f


Model: ProsusAI/finbert | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.48it/s, loss=1.3] 
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.974]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=0.783]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=0.655]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.67it/s, loss=0.492]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=0.411]
Fold 1 Epoch 7: 100%|██████


Model: ProsusAI/finbert | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.56it/s, loss=1.09]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.66it/s, loss=0.897]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.717]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.57it/s, loss=0.576]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=0.452]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.60it/s, loss=0.445]
Fold 1 Epoch 7: 100%|██████


Best combination saved to ProsusAI-finbert_label.txt

Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:00<00:00, 17.05it/s, loss=1.13]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and to


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:00<00:00, 17.21it/s, loss=1.2] 
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:00<00:00, 17.08it/s, loss=0.951]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:00<00:00, 17.54it/s, loss=0.798]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-softwa


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:00<00:00, 17.19it/s, loss=1.14]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:00<00:00, 17.65it/s, loss=0.964]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:00<00:00, 17.55it/s, loss=0.795]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:00<00:00, 17.22it/s, loss=0.631]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not 


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:00<00:00, 17.33it/s, loss=1.11]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00, 16.88it/s, loss=0.92] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:00<00:00, 17.32it/s, loss=0.788]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:00<00:00, 17.41it/s, loss=0.615]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:00<00:00, 17.24it/s, loss=0.512]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:00<00:00, 17.30


Model: PHILIPPUNI/distilbert-amazon-software-reviews-finetuned | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at PHILIPPUNI/distilbert-amazon-software-reviews-finetuned and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:00<00:00, 17.14it/s, loss=1.22]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:00<00:00, 17.37it/s, loss=0.912]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:00<00:00, 17.30it/s, loss=0.776]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:00<00:00, 17.34it/s, loss=0.62] 
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:00<00:00, 17.37it/s, loss=0.487]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:00<00:00, 17.15


Best combination saved to PHILIPPUNI-distilbert-amazon-software-reviews-finetuned_label.txt

Model: justinlamlamlam/softwareengineering | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.53it/s, loss=1.19]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier


Model: justinlamlamlam/softwareengineering | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=1.25]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.70it/s, loss=0.936]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.67it/s, loss=0.771]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized becau


Model: justinlamlamlam/softwareengineering | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.49it/s, loss=1.33]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=1.01]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.62it/s, loss=0.869]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.62it/s, loss=0.628]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of BertForSequenceClassification were not initialized from the model checkp


Model: justinlamlamlam/softwareengineering | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.69it/s, loss=1.31]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.65it/s, loss=0.994]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.67it/s, loss=0.855]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.69it/s, loss=0.633]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.67it/s, loss=0.452]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.65it/s, loss=0.301]
Fold 1 E


Model: justinlamlamlam/softwareengineering | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at justinlamlamlam/softwareengineering and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00,  9.52it/s, loss=1.41]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00,  9.56it/s, loss=1.01]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00,  9.62it/s, loss=0.911]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00,  9.59it/s, loss=0.676]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:01<00:00,  9.68it/s, loss=0.472]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00,  9.67it/s, loss=0.318]
Fold 1 Ep


Best combination saved to justinlamlamlam-softwareengineering_label.txt

Model: answerdotai/ModernBERT-large | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=1.02]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=1.14]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the mo


Model: answerdotai/ModernBERT-large | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.06it/s, loss=1.04]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:05<00:00,  3.07it/s, loss=0.438]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:05<00:00,  3.05it/s, loss=0.203]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  3.03it/s, loss=1.04]
Fold 


Model: answerdotai/ModernBERT-large | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  2.96it/s, loss=1.14]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:05<00:00,  3.12it/s, loss=0.594]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=0.371]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=0.135]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold


Model: answerdotai/ModernBERT-large | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  2.98it/s, loss=1.12]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=0.603]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=0.282]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:05<00:00,  2.98it/s, loss=0.0663]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:05<00:00,  2.98it/s, loss=0.0334]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:05<00:00,  2.98it/s, loss=0.029]  
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:05<00:00,  2.97it/s, loss=0.0254] 
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:05<00:00,  2.99it/s, loss=0.0293] 
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:05<00:00,  2.98it/s, loss=0.0794


Model: answerdotai/ModernBERT-large | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:05<00:00,  2.94it/s, loss=1.02] 
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:05<00:00,  3.04it/s, loss=0.52] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:05<00:00,  2.87it/s, loss=0.289]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.119]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.172]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.0594]
Fold 1 Epoch 7: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.0387]
Fold 1 Epoch 8: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.0327] 
Fold 1 Epoch 9: 100%|██████████| 17/17 [00:05<00:00,  2.92it/s, loss=0.0308]
F


Best combination saved to answerdotai-ModernBERT-large_label.txt

Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=1, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00, 16.70it/s, loss=1.12]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=3, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00, 16.53it/s, loss=1.17]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00, 16.81it/s, loss=0.965]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00, 16.73it/s, loss=0.822]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-fin


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=4, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00, 16.66it/s, loss=1.29]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00, 16.75it/s, loss=1.01] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00, 16.65it/s, loss=0.746]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:01<00:00, 16.70it/s, loss=0.609]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not 


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=9, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00, 16.93it/s, loss=1.14]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00, 16.97it/s, loss=0.88] 
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:01<00:00, 16.80it/s, loss=0.752]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:00<00:00, 17.16it/s, loss=0.59] 
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:00<00:00, 17.20it/s, loss=0.454]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:00<00:00, 17.19


Model: milyiyo/distilbert-base-uncased-finetuned-amazon-review | EPOCHS=12, BATCH_SIZE=8, K-FOLD=10


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at milyiyo/distilbert-base-uncased-finetuned-amazon-review and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 17/17 [00:01<00:00, 16.74it/s, loss=1.23]
Fold 1 Epoch 2: 100%|██████████| 17/17 [00:01<00:00, 16.95it/s, loss=0.966]
Fold 1 Epoch 3: 100%|██████████| 17/17 [00:00<00:00, 17.14it/s, loss=0.822]
Fold 1 Epoch 4: 100%|██████████| 17/17 [00:00<00:00, 17.09it/s, loss=0.637]
Fold 1 Epoch 5: 100%|██████████| 17/17 [00:00<00:00, 17.02it/s, loss=0.481]
Fold 1 Epoch 6: 100%|██████████| 17/17 [00:01<00:00, 16.96


Best combination saved to milyiyo-distilbert-base-uncased-finetuned-amazon-review_label.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Structure Focus

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import numpy as np
from datetime import datetime
import itertools
import os

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Constants and model list
MODEL_NAMES = [
    # "albert/albert-base-v2",
    # "albert/albert-large-v2",
    # "microsoft/deberta-v3-large",
    # "microsoft/deberta-v2-xlarge",
    # "FacebookAI/roberta-large",
    # "google-bert/bert-base-uncased",
    # "ProsusAI/finbert",
    # "PHILIPPUNI/distilbert-amazon-software-reviews-finetuned",
    # "justinlamlamlam/softwareengineering",
    # "answerdotai/ModernBERT-large",
    "milyiyo/distilbert-base-uncased-finetuned-amazon-review"
]
LABEL_COLUMN = "structure_focus"

# Hyperparameter grid
EPOCHS_LIST = [1, 3, 4, 9, 12]
BATCH_SIZES = [8]
N_SPLITS_LIST = [5, 10]

# Load data
data = pd.read_csv("type_classification-validation.csv")
label_encoder = LabelEncoder()
data[LABEL_COLUMN] = label_encoder.fit_transform(data[LABEL_COLUMN])
texts = data["sentence"].tolist()
labels = data[LABEL_COLUMN].tolist()

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Training loop
def train_model(model, train_loader, optimizer, criterion, epochs, fold):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)
            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

# Evaluation
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)
            preds = model(**inputs).logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

# Generate folds once and reuse them
def generate_kfold_splits(texts, labels, n_splits):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    return list(skf.split(texts, labels))

# Save best result to file
def save_best_result(log_path, header, result, best_combo_summary):
    with open(log_path, "w") as f:
        f.write(header + "\n\n")
        f.write(result + "\n")
        f.write("\n===== BEST COMBINATION =====\n")
        f.write(best_combo_summary + "\n")

# Begin experiment
for n_splits in N_SPLITS_LIST:
    folds = generate_kfold_splits(texts, labels, n_splits)

    for model_name in MODEL_NAMES:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        safe_model_name = model_name.replace('/', '-')
        log_file = f"{safe_model_name}_{LABEL_COLUMN}.txt"

        best_f1 = 0
        best_result = ""
        best_combo = ""

        for epochs, batch_size in itertools.product(EPOCHS_LIST, BATCH_SIZES):
            print(f"\nModel: {model_name} | EPOCHS={epochs}, BATCH_SIZE={batch_size}, K-FOLD={n_splits}")
            all_accuracies, all_precisions, all_recalls, all_f1s = [], [], [], []

            for fold, (train_idx, val_idx) in enumerate(folds):
                train_texts = [texts[i] for i in train_idx]
                val_texts = [texts[i] for i in val_idx]
                train_labels = [labels[i] for i in train_idx]
                val_labels = [labels[i] for i in val_idx]

                train_dataset = TextDataset(train_texts, train_labels, tokenizer)
                val_dataset = TextDataset(val_texts, val_labels, tokenizer)

                train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=len(set(labels)),
                    ignore_mismatched_sizes=True
                ).to(device)

                optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
                criterion = torch.nn.CrossEntropyLoss()

                train_model(model, train_loader, optimizer, criterion, epochs, fold)
                y_true, y_pred = evaluate_model(model, val_loader)

                acc = accuracy_score(y_true, y_pred)
                prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

                all_accuracies.append(acc)
                all_precisions.append(prec)
                all_recalls.append(rec)
                all_f1s.append(f1)

            avg_accuracy = np.mean(all_accuracies)
            avg_precision = np.mean(all_precisions)
            avg_recall = np.mean(all_recalls)
            avg_f1 = np.mean(all_f1s)

            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_combo = f"Model: {model_name}\nLabel Column: {LABEL_COLUMN}\nEpochs: {epochs}, Batch Size: {batch_size}, K-Fold: {n_splits}"
                best_result = (
                    f"Accuracy: {avg_accuracy:.4f}\n"
                    f"Precision: {avg_precision:.4f}\n"
                    f"Recall: {avg_recall:.4f}\n"
                    f"F1-Score: {avg_f1:.4f}"
                )

        # Save best result to file
        header = f"Best Hyperparameter Combination for {model_name} on {LABEL_COLUMN}"
        save_best_result(log_file, header, best_result, best_combo)
        print(f"\nBest combination saved to {log_file}")


In [None]:
print("finish")