## Use Case Focus

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import numpy as np
from datetime import datetime
import itertools

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model name
MODEL_NAME = "microsoft/deberta-v3-large"
LABEL_COLUMN = "label"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_file = f"{MODEL_NAME.replace('/', '-')}_{LABEL_COLUMN}_{timestamp}.txt"

# Define hyperparameter grid
EPOCHS_LIST = [3, 4, 9, 12]
BATCH_SIZES = [8]
N_SPLITS_LIST = [5, 10]

# Load data
data = pd.read_csv('type_val_single.csv')
label_encoder = LabelEncoder()
data[LABEL_COLUMN] = label_encoder.fit_transform(data[LABEL_COLUMN])
texts = data['sentence'].tolist()
labels = data[LABEL_COLUMN].tolist()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def save_best_model(model, model_name, label_column, epochs, k_folds, batch_size):
    save_path = os.path.join(model_save_dir, f"{model_name}_{label_column}_epochs{epochs}_kfold{k_folds}_batch{batch_size}.bin")
    torch.save(model.state_dict(), save_path)
    print(f"Best model saved at {save_path}")

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Training loop
def train_model(model, train_loader, optimizer, criterion, epochs, fold):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

# Evaluation
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            preds = model(**inputs).logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

# Logging helper
def log_result(log_path, text):
    with open(log_path, "a") as f:
        f.write(text + "\n")

# Start hyperparameter tuning
best_f1 = 0
best_combo = ""
best_result = ""

for epochs, batch_size, n_splits in itertools.product(EPOCHS_LIST, BATCH_SIZES, N_SPLITS_LIST):
    print(f"\nTuning Combination: EPOCHS={epochs}, BATCH_SIZE={batch_size}, K-FOLD={n_splits}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    all_accuracies, all_precisions, all_recalls, all_f1s = [], [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
        train_texts = [texts[i] for i in train_idx]
        val_texts = [texts[i] for i in val_idx]
        train_labels = [labels[i] for i in train_idx]
        val_labels = [labels[i] for i in val_idx]

        train_dataset = TextDataset(train_texts, train_labels, tokenizer)
        val_dataset = TextDataset(val_texts, val_labels, tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(set(labels)), ignore_mismatched_sizes=True)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
        criterion = torch.nn.BCEWithLogitsLoss()

        train_model(model, train_loader, optimizer, criterion, epochs, fold)
        y_true, y_pred = evaluate_model(model, val_loader)

        acc = accuracy_score(y_true, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

        all_accuracies.append(acc)
        all_precisions.append(prec)
        all_recalls.append(rec)
        all_f1s.append(f1)

    avg_accuracy = np.mean(all_accuracies)
    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_f1 = np.mean(all_f1s)

    combo_string = f"Combination: EPOCHS={epochs}, BATCH_SIZE={batch_size}, K-FOLD={n_splits}"
    result_string = f"{combo_string}\naccuracy: {avg_accuracy:.4f}, precision: {avg_precision:.4f}, recall: {avg_recall:.4f}, f1-score: {avg_f1:.4f}\n"
    print(result_string)
    log_result(log_file, result_string)

    if avg_f1 > best_f1:
        best_f1 = avg_f1
        best_combo = combo_string
        best_result = result_string
        best_model = model
        best_epoch = epochs
        best_batch = batch_size
        best_fold = n_splits

# Log best combination at the end
footer = f"\nBest Combination:\n{best_result}"
log_result(log_file, footer)
print(footer)





Tuning Combination: EPOCHS=3, BATCH_SIZE=8, K-FOLD=5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 4]))

In [None]:
import os

# Directory to save the best models
model_save_dir = "all_model"
os.makedirs(model_save_dir, exist_ok=True)

if best_model is not None:
    save_best_model(best_model, MODEL_NAME.replace('/', '-'), LABEL_COLUMN, best_epoch, best_fold, best_batch)

In [None]:
data