# Deep Learning for Automated Essay Scoring

In [1]:
import os
import math
import time
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, AutoModel


In [2]:

def normalize_scores(scores, set_id, min_scores, max_scores):
    mi = min_scores[set_id-1]
    ma = max_scores[set_id-1]
    return (scores - mi) / (ma - mi)

def denormalize_scores(scores_norm, set_id, min_scores, max_scores):
    mi = min_scores[set_id-1]
    ma = max_scores[set_id-1]
    return scores_norm * (ma - mi) + mi

In [32]:
def get_encoder(model_name, unfreeze_last_n=0):
    if model_name == "roberta":
        name = "roberta-base"
    elif model_name == "bge":
        name = "BAAI/bge-base-en"
    elif model_name == "e5-base":
        name = "intfloat/e5-base-v2"
    elif model_name == "e5-large":
        name = "intfloat/e5-large"
    elif model_name == "qwen":
        name = "Qwen/Qwen3-Embedding-0.6B"   # qwen 0.6B close checkpoint
    elif model_name == "deberta":
        name = "microsoft/deberta-v3-base"
    elif model_name=="contrastive1":
        name="/kaggle/working/contrastive_encoders/prompt_1"
    elif model_name=="contrastive2":
        name="/kaggle/working/contrastive_encoders/prompt_2"
    elif model_name=="contrastive3":
        name="/kaggle/working/contrastive_encoders/prompt_3"
    elif model_name=="contrastive4":
        name="/kaggle/working/contrastive_encoders/prompt_4"
    elif model_name=="contrastive5":
        name="/kaggle/working/contrastive_encoders/prompt_5"
    elif model_name=="contrastive6":
        name="/kaggle/working/contrastive_encoders/prompt_6"
    elif model_name=="contrastive7":
        name="/kaggle/working/contrastive_encoders/prompt_7"
    elif model_name=="contrastive8":
        name="/kaggle/working/contrastive_encoders/prompt_8"
    elif model_name=="contrastive":
        name="/kaggle/working/contrastive_encoders/final_encoder"
        
    else:
        raise ValueError("Unknown model")
    
    tokenizer = AutoTokenizer.from_pretrained(name)
    encoder = AutoModel.from_pretrained(name)
    hidden_size = encoder.config.hidden_size

    # Freeze all params
    #for param in encoder.parameters():
        #param.requires_grad = False

    # Unfreeze last `unfreeze_last_n` layers if requested
    #if hasattr(encoder, "encoder"):  # works for RoBERTa/E5/BGE
    layers = encoder.encoder.layer
    for layer in layers[-unfreeze_last_n:]:
        for param in layer.parameters():
            param.requires_grad = True
    #elif hasattr(encoder, "model"):  # some Qwen variants wrap transformer under .model
        #if hasattr(encoder.model, "layers"):
            #layers = encoder.model.layers
            #for layer in layers[-unfreeze_last_n:]:
                #for param in layer.parameters():
                   # param.requires_grad = True

    return encoder, tokenizer, hidden_size


In [6]:
file_path = '/kaggle/input/asapaes/training_set_rel3_cleaned.tsv'
columns = ['essay_id', 'essay_set', 'essay', 'domain1_score']
asap = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1', usecols=columns)
min_scores = [int(asap[asap["essay_set"] == s]["domain1_score"].min()) for s in range(1, 9)]
max_scores = [int(asap[asap["essay_set"] == s]["domain1_score"].max()) for s in range(1, 9)]
asap.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [7]:
import pandas as pd
import numpy as np

def make_pairs(df, set_id, min_scores, max_scores, threshold=0.1, n_pairs=5000, seed=42):
    np.random.seed(seed)
    essays = df[df["essay_set"] == set_id].reset_index(drop=True)

    # normalize scores
    essays["norm_score"] = normalize_scores(
        essays["domain1_score"].values, set_id, min_scores, max_scores
    )

    pairs = []
    while len(pairs) < n_pairs:
        i, j = np.random.choice(len(essays), 2, replace=False)
        s1, s2 = essays.loc[i, "norm_score"], essays.loc[j, "norm_score"]

        if abs(s1 - s2) <= threshold:
            pairs.append({
                "essay1": essays.loc[i, "essay"],
                "essay2": essays.loc[j, "essay"],
                "score1": s1,
                "score2": s2
            })

    return pd.DataFrame(pairs)




In [8]:
from torch.utils.data import Dataset, DataLoader
import torch

class EssayPairDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_len=256):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]

        enc1 = self.tokenizer(
            row["essay1"], truncation=True, padding="max_length",
            max_length=self.max_len, return_tensors="pt"
        )
        enc2 = self.tokenizer(
            row["essay2"], truncation=True, padding="max_length",
            max_length=self.max_len, return_tensors="pt"
        )

        return {
            "input_ids1": enc1["input_ids"].squeeze(0),
            "attention_mask1": enc1["attention_mask"].squeeze(0),
            "input_ids2": enc2["input_ids"].squeeze(0),
            "attention_mask2": enc2["attention_mask"].squeeze(0),
            "score1": torch.tensor(row["score1"], dtype=torch.float),
            "score2": torch.tensor(row["score2"], dtype=torch.float)
        }



def get_pair_dataloader(pairs, tokenizer, batch_size=16, max_len=256, shuffle=True):
    dataset = EssayPairDataset(pairs, tokenizer, max_len=max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)



In [9]:


class ContrastiveModel(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder  # pretrained LM

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS]
        return cls_embedding


def info_nce_loss(emb1, emb2, labels, temperature=0.07, threshold=0.1):
    """
    Supervised InfoNCE loss for AES.
    
    emb1, emb2: [batch, hidden_dim]
    labels: [batch] normalized scores (0-1 per essay)
    threshold: max difference in scores to consider essays as 'similar'
    """
    # normalize embeddings
    emb1 = F.normalize(emb1, dim=-1)
    emb2 = F.normalize(emb2, dim=-1)

    # similarity matrix (cosine)
    logits = torch.matmul(emb1, emb2.T) / temperature  # [batch, batch]

    # expand labels to pairwise matrix
    diff = torch.abs(labels.unsqueeze(1) - labels.unsqueeze(0))  # [batch, batch]

    # positive mask: 1 if within threshold, 0 otherwise
    positive_mask = (diff <= threshold).float()
    # remove self-similarity
    positive_mask.fill_diagonal_(0)

    # log-softmax over rows
    log_probs = F.log_softmax(logits, dim=1)

    # supervised contrastive loss: average log-prob over positives
    numerator = (positive_mask * log_probs).sum(dim=1)
    denominator = positive_mask.sum(dim=1).clamp(min=1)  # avoid divide by zero
    loss = -(numerator / denominator).mean()

    return loss




In [10]:
from tqdm import tqdm

def train_contrastive(encoder, pair_loader, optimizer, device, num_epochs=3):
    encoder.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in tqdm(pair_loader, desc=f"Contrastive Epoch {epoch+1}"):
            input1 = {
                "input_ids": batch["input_ids1"].to(device),
                "attention_mask": batch["attention_mask1"].to(device)
            }
            input2 = {
                "input_ids": batch["input_ids2"].to(device),
                "attention_mask": batch["attention_mask2"].to(device)
            }
            labels = torch.stack([batch["score1"], batch["score2"]], dim=1).to(device)

            emb1 = encoder(**input1).last_hidden_state[:, 0, :]
            emb2 = encoder(**input2).last_hidden_state[:, 0, :]

            # Use score1 as "labels" (since each emb1 corresponds to essay1)
            loss = info_nce_loss(emb1, emb2, batch["score1"].to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(pair_loader)
        print(f"Epoch {epoch+1}, Avg Loss={avg_loss:.4f}")



In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder = AutoModel.from_pretrained("roberta-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
optimizer = torch.optim.AdamW(encoder.parameters(), lr=2e-5)



save_dir = "contrastive_encoders"
os.makedirs(save_dir, exist_ok=True)

for set_id in range(1, 9):
    print(f"\n=== Training on Prompt {set_id} ===")

    # make pairs for this prompt
    pairs_df = make_pairs(
        df=asap,
        set_id=set_id,
        min_scores=min_scores,
        max_scores=max_scores,
        threshold=0.1,
        n_pairs=3000
    )

    # dataloader
    pair_loader = get_pair_dataloader(
        pairs=pairs_df,
        tokenizer=tokenizer,
        batch_size=16,
        max_len=256
    )

    # train
    train_contrastive(
        encoder=encoder,
        pair_loader=pair_loader,
        optimizer=optimizer,
        device=device,
        num_epochs=3
    )

    # save encoder after each prompt
    encoder.save_pretrained(f"{save_dir}/prompt_{set_id}")
    tokenizer.save_pretrained(f"{save_dir}/prompt_{set_id}")

# final save (after all prompts)
encoder.save_pretrained(f"{save_dir}/final_encoder")
tokenizer.save_pretrained(f"{save_dir}/final_encoder")
print("\n✅ Encoder saved successfully!")



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

2025-09-12 20:45:46.953094: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757709947.333079      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757709947.440803      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


=== Training on Prompt 1 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:55<00:00,  1.57s/it]


Epoch 1, Avg Loss=2.6636


Contrastive Epoch 2: 100%|██████████| 188/188 [05:08<00:00,  1.64s/it]


Epoch 2, Avg Loss=2.6516


Contrastive Epoch 3: 100%|██████████| 188/188 [05:08<00:00,  1.64s/it]


Epoch 3, Avg Loss=2.6399

=== Training on Prompt 2 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [05:10<00:00,  1.65s/it]


Epoch 1, Avg Loss=2.6356


Contrastive Epoch 2: 100%|██████████| 188/188 [05:10<00:00,  1.65s/it]


Epoch 2, Avg Loss=2.3179


Contrastive Epoch 3: 100%|██████████| 188/188 [05:09<00:00,  1.65s/it]


Epoch 3, Avg Loss=2.1268

=== Training on Prompt 3 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:48<00:00,  1.54s/it]


Epoch 1, Avg Loss=2.6275


Contrastive Epoch 2: 100%|██████████| 188/188 [04:49<00:00,  1.54s/it]


Epoch 2, Avg Loss=2.4496


Contrastive Epoch 3: 100%|██████████| 188/188 [04:48<00:00,  1.54s/it]


Epoch 3, Avg Loss=2.1484

=== Training on Prompt 4 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:46<00:00,  1.52s/it]


Epoch 1, Avg Loss=2.3878


Contrastive Epoch 2: 100%|██████████| 188/188 [04:46<00:00,  1.53s/it]


Epoch 2, Avg Loss=2.1308


Contrastive Epoch 3: 100%|██████████| 188/188 [04:47<00:00,  1.53s/it]


Epoch 3, Avg Loss=2.0092

=== Training on Prompt 5 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:51<00:00,  1.55s/it]


Epoch 1, Avg Loss=2.4524


Contrastive Epoch 2: 100%|██████████| 188/188 [04:50<00:00,  1.55s/it]


Epoch 2, Avg Loss=2.1837


Contrastive Epoch 3: 100%|██████████| 188/188 [04:50<00:00,  1.54s/it]


Epoch 3, Avg Loss=1.9856

=== Training on Prompt 6 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:55<00:00,  1.57s/it]


Epoch 1, Avg Loss=2.5972


Contrastive Epoch 2: 100%|██████████| 188/188 [04:56<00:00,  1.58s/it]


Epoch 2, Avg Loss=2.4686


Contrastive Epoch 3: 100%|██████████| 188/188 [04:55<00:00,  1.57s/it]


Epoch 3, Avg Loss=2.3506

=== Training on Prompt 7 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [04:56<00:00,  1.58s/it]


Epoch 1, Avg Loss=2.6201


Contrastive Epoch 2: 100%|██████████| 188/188 [04:56<00:00,  1.58s/it]


Epoch 2, Avg Loss=2.5234


Contrastive Epoch 3: 100%|██████████| 188/188 [04:57<00:00,  1.58s/it]


Epoch 3, Avg Loss=2.4249

=== Training on Prompt 8 ===


Contrastive Epoch 1: 100%|██████████| 188/188 [05:15<00:00,  1.68s/it]


Epoch 1, Avg Loss=2.7366


Contrastive Epoch 2: 100%|██████████| 188/188 [05:15<00:00,  1.68s/it]


Epoch 2, Avg Loss=2.6920


Contrastive Epoch 3: 100%|██████████| 188/188 [05:15<00:00,  1.68s/it]


Epoch 3, Avg Loss=2.6666

✅ Encoder saved successfully!


In [19]:
class EssayDataset(Dataset):
    def __init__(self, texts, scores, tokenizer, max_len=512, embedder_name=None):
        self.texts = texts
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.embedder_name = embedder_name  # keep track of which model we use

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        score = self.scores[idx]

        # If using E5, prepend "passage: " to the essay
        if self.embedder_name and "e5" in self.embedder_name.lower():
            text = "passage: " + text

        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "score": torch.tensor(score, dtype=torch.float)
        }


In [20]:
class EssayRegressor(nn.Module):
    def __init__(self, encoder, hidden_size):
        super().__init__()
        self.encoder = encoder
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return self.mlp(cls_embedding)


In [21]:
from sklearn.model_selection import train_test_split

def split_dataset(prompt, test_size=0.2, val_size=0.2, seed=42):
    df_prompt = asap[asap["essay_set"] == prompt].copy()

    train_val, test = train_test_split(df_prompt, test_size=test_size, random_state=seed)
    train, val = train_test_split(train_val, test_size=val_size/(1-test_size), random_state=seed)

    return train, val, test

from torch.utils.data import DataLoader

def get_dataloaders(train_df, val_df, test_df, tokenizer, prompt,
                    batch_size=16, max_len=512):
    """
    Build PyTorch dataloaders for a prompt (expects pandas DataFrames).
    Scores are normalized with normalize_scores before being passed to the Dataset.
    """
    # Normalize arrays (vectorized)
    train_scores_norm = normalize_scores(train_df["domain1_score"].values, prompt, min_scores, max_scores)
    val_scores_norm   = normalize_scores(val_df["domain1_score"].values,   prompt, min_scores, max_scores)
    test_scores_norm  = normalize_scores(test_df["domain1_score"].values,  prompt, min_scores, max_scores)

    train_dataset = EssayDataset(train_df["essay"].values, train_scores_norm, tokenizer, max_len=max_len)
    val_dataset   = EssayDataset(val_df["essay"].values,   val_scores_norm,   tokenizer, max_len=max_len)
    test_dataset  = EssayDataset(test_df["essay"].values,  test_scores_norm,  tokenizer, max_len=max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader





In [22]:
import torch
import torch.optim as optim
from tqdm import tqdm

def train_all_prompts(embedder,
                      prompts=range(1,9),
                      num_epochs=10,
                      batch_size=16,
                      lr=2e-5,
                      patience=3,
                      max_len=512,
                      device=None):
    """
    Loop over essay sets (prompts) and train separately for each.
    Saves results to results.csv (train_and_evaluate does that).
    Returns: pandas DataFrame summarizing returns from results.csv for the embedder run.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    summary = []

    for prompt in prompts:
        print("\n" + "="*30)
        print(f" Training Essay Set {prompt} with embedder '{embedder}'")
        print("="*30)

        # 1) get encoder & tokenizer for this embedder
        encoder, tokenizer, hidden_size = get_encoder(embedder)

        # 2) split dataset for this prompt
        train_df, val_df, test_df = split_dataset(prompt, test_size=0.2, val_size=0.2, seed=42)

        # 3) dataloaders (scores normalized inside)
        train_loader, val_loader, test_loader = get_dataloaders(train_df, val_df, test_df,
                                                                tokenizer, prompt,
                                                                batch_size=batch_size, max_len=max_len)

        # 4) model, optimizer, criterion
        model = EssayRegressor(encoder, hidden_size)
        optimizer = optim.AdamW(model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # 5) train/evaluate for this prompt
        train_losses, val_losses, val_qwks = train_and_evaluate(
            model, train_loader, val_loader, test_loader,
            optimizer, criterion, device, prompt, embedder,
            num_epochs, patience
        )

        # 6) read the last row for this prompt from results.csv (optional) and append to summary
        try:
            df_results = pd.read_csv("results.csv")
            df_prompt = df_results[(df_results["embedder"]==embedder) & (df_results["prompt"]==prompt)]
            if not df_prompt.empty:
                row = df_prompt.iloc[-1].to_dict()
            else:
                row = {"embedder": embedder, "prompt": prompt, "best_val_qwk": None, "best_val_mse": None, "test_qwk": None, "test_mse": None}
        except FileNotFoundError:
            row = {"embedder": embedder, "prompt": prompt, "best_val_qwk": None, "best_val_mse": None, "test_qwk": None, "test_mse": None}

        row["train_losses"] = train_losses
        row["val_losses"] = val_losses
        row["val_qwks"] = val_qwks
        summary.append(row)

    summary_df = pd.DataFrame(summary)
    return summary_df


In [23]:
import os
import pandas as pd
from tqdm import tqdm

def train_and_evaluate(model, train_loader, val_loader, test_loader, 
                       optimizer, criterion, device, prompt, embedder, 
                       num_epochs=10, patience=3):

    model = model.to(device)
    best_val_qwk = -1.0
    best_val_mse = float("inf")
    patience_counter = 0

    train_losses, val_losses, val_qwks = [], [], []

    for epoch in range(1, num_epochs+1):
        model.train()
        running_loss = 0.0

        # Training loop with progress bar
        progress_bar = tqdm(train_loader, desc=f"Prompt {prompt} Epoch {epoch}", leave=False)
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = batch["score"].to(device).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, scores)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = running_loss / len(train_loader)

        # Validation
        val_qwk, val_mse = evaluate(model, val_loader, criterion, prompt, device)
        train_losses.append(avg_train_loss)
        val_losses.append(val_mse)
        val_qwks.append(val_qwk)

        print(f"Prompt {prompt}, Epoch {epoch}: "
              f"Train Loss={avg_train_loss:.4f}, "
              f"Val QWK={val_qwk:.4f}, Val MSE={val_mse:.4f}")

        # Early stopping
        if val_qwk > best_val_qwk:
            best_val_qwk = val_qwk
            best_val_mse = val_mse
            patience_counter = 0
            torch.save(model.state_dict(), f"best_model_{embedder}_prompt{prompt}.pt")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    # Load best model before testing
    model.load_state_dict(torch.load(f"best_model_{embedder}_prompt{prompt}.pt"))

    # Final test evaluation
    test_qwk, test_mse = evaluate(model, test_loader, criterion, prompt, device)
    print(f"✅ Prompt {prompt} | Test QWK={test_qwk:.4f}, Test MSE={test_mse:.4f}")

    # Save results to CSV
    results_df = pd.DataFrame([{
        "embedder": embedder,
        "prompt": prompt,
        "best_val_qwk": best_val_qwk,
        "best_val_mse": best_val_mse,
        "test_qwk": test_qwk,
        "test_mse": test_mse
    }])

    results_df.to_csv("results.csv", mode="a", header=not os.path.exists("results.csv"), index=False)

    return train_losses, val_losses, val_qwks


In [24]:
from sklearn.metrics import cohen_kappa_score, mean_squared_error
import numpy as np

def evaluate(model, loader, criterion, prompt, device):
    """
    Evaluate model on loader.
    Returns: (qwk_on_raw_scale, mse_on_raw_scale)
    Note: loader should supply normalized scores (as used during training).
    """
    model.eval()
    all_preds_norm, all_labels_norm = [], []
    total_loss = 0.0
    n_batches = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = batch["score"].to(device).unsqueeze(1)    # normalized scores

            outputs = model(input_ids, attention_mask)        # normalized-pred outputs
            loss = criterion(outputs, scores)
            total_loss += loss.item()
            n_batches += 1

            # Collect as numpy arrays
            all_preds_norm.extend(outputs.squeeze(1).cpu().numpy())
            all_labels_norm.extend(scores.squeeze(1).cpu().numpy())

    if len(all_preds_norm) == 0:
        return 0.0, 0.0  # safe fallback

    # Convert to numpy arrays
    all_preds_norm = np.array(all_preds_norm, dtype=float).flatten()
    all_labels_norm = np.array(all_labels_norm, dtype=float).flatten()

    # Denormalize both (raw score scale)
    preds_raw = denormalize_scores(all_preds_norm, prompt, min_scores, max_scores)
    labels_raw = denormalize_scores(all_labels_norm, prompt, min_scores, max_scores)

    # Round predictions to nearest integer and clip to valid range for QWK
    low, high = min_scores[prompt-1], max_scores[prompt-1]
    preds_rounded = np.clip(np.rint(preds_raw), low, high).astype(int)
    labels_int = labels_raw.astype(int)

    # QWK (on integer original-score scale) and raw-scale MSE
    try:
        qwk = cohen_kappa_score(labels_int, preds_rounded, weights="quadratic")
    except Exception:
        qwk = 0.0

    mse_raw = mean_squared_error(labels_raw, preds_raw)

    avg_loss = total_loss / n_batches if n_batches > 0 else 0.0
    return qwk, mse_raw


In [None]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_bge= train_all_prompts(
    embedder="bge",
    prompts=range(1,9),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_bge[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


In [None]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_e5base = train_all_prompts(
    embedder="e5-base",
    prompts=range(1,9),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_e5base[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


In [None]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="deberta",
    prompts=range(1,9),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])

In [25]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive",
    prompts=range(1,9),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 1 with embedder 'contrastive'


                                                                                 

Prompt 1, Epoch 1: Train Loss=0.0725, Val QWK=0.5919, Val MSE=1.2899


                                                                                 

Prompt 1, Epoch 2: Train Loss=0.0320, Val QWK=0.6287, Val MSE=1.3046


                                                                                

Prompt 1, Epoch 3: Train Loss=0.0335, Val QWK=0.6099, Val MSE=1.2648


                                                                                 

Prompt 1, Epoch 4: Train Loss=0.0333, Val QWK=0.5970, Val MSE=1.3672


                                                                                 

Prompt 1, Epoch 5: Train Loss=0.0311, Val QWK=0.6166, Val MSE=1.4653
Early stopping triggered.
✅ Prompt 1 | Test QWK=0.6035, Test MSE=1.2566

 Training Essay Set 2 with embedder 'contrastive'


                                                                                 

Prompt 2, Epoch 1: Train Loss=0.0609, Val QWK=0.7411, Val MSE=0.2400


                                                                                 

Prompt 2, Epoch 2: Train Loss=0.0271, Val QWK=0.7286, Val MSE=0.2378


                                                                                 

Prompt 2, Epoch 3: Train Loss=0.0240, Val QWK=0.7393, Val MSE=0.2691


                                                                                 

Prompt 2, Epoch 4: Train Loss=0.0225, Val QWK=0.7488, Val MSE=0.2451


                                                                                 

Prompt 2, Epoch 5: Train Loss=0.0220, Val QWK=0.7066, Val MSE=0.2996


                                                                                 

Prompt 2, Epoch 6: Train Loss=0.0204, Val QWK=0.7437, Val MSE=0.2561


                                                                                 

Prompt 2, Epoch 7: Train Loss=0.0208, Val QWK=0.7427, Val MSE=0.2340
Early stopping triggered.
✅ Prompt 2 | Test QWK=0.6824, Test MSE=0.2704

 Training Essay Set 3 with embedder 'contrastive'


                                                                                

Prompt 3, Epoch 1: Train Loss=0.1089, Val QWK=0.6744, Val MSE=0.2719


                                                                                 

Prompt 3, Epoch 2: Train Loss=0.0567, Val QWK=0.7165, Val MSE=0.2557


                                                                                 

Prompt 3, Epoch 3: Train Loss=0.0552, Val QWK=0.7044, Val MSE=0.2587


                                                                                

Prompt 3, Epoch 4: Train Loss=0.0594, Val QWK=0.7167, Val MSE=0.2494


                                                                                

Prompt 3, Epoch 5: Train Loss=0.0533, Val QWK=0.7048, Val MSE=0.2541


                                                                                

Prompt 3, Epoch 6: Train Loss=0.0494, Val QWK=0.7237, Val MSE=0.2440


                                                                                

Prompt 3, Epoch 7: Train Loss=0.0490, Val QWK=0.7330, Val MSE=0.2418


                                                                                

Prompt 3, Epoch 8: Train Loss=0.0524, Val QWK=0.7342, Val MSE=0.2412


                                                                                

Prompt 3, Epoch 9: Train Loss=0.0453, Val QWK=0.7439, Val MSE=0.2371


                                                                                 

Prompt 3, Epoch 10: Train Loss=0.0463, Val QWK=0.7196, Val MSE=0.2353
✅ Prompt 3 | Test QWK=0.7080, Test MSE=0.2781

 Training Essay Set 4 with embedder 'contrastive'


                                                                                

Prompt 4, Epoch 1: Train Loss=0.1211, Val QWK=0.6828, Val MSE=0.3418


                                                                                 

Prompt 4, Epoch 2: Train Loss=0.0535, Val QWK=0.7762, Val MSE=0.3057


                                                                                

Prompt 4, Epoch 3: Train Loss=0.0508, Val QWK=0.7652, Val MSE=0.3067


                                                                                

Prompt 4, Epoch 4: Train Loss=0.0484, Val QWK=0.7829, Val MSE=0.2913


                                                                                

Prompt 4, Epoch 5: Train Loss=0.0455, Val QWK=0.7851, Val MSE=0.2846


                                                                                

Prompt 4, Epoch 6: Train Loss=0.0456, Val QWK=0.7852, Val MSE=0.2825


                                                                                 

Prompt 4, Epoch 7: Train Loss=0.0461, Val QWK=0.7781, Val MSE=0.2900


                                                                                

Prompt 4, Epoch 8: Train Loss=0.0449, Val QWK=0.7764, Val MSE=0.2882


                                                                                

Prompt 4, Epoch 9: Train Loss=0.0460, Val QWK=0.7741, Val MSE=0.2663
Early stopping triggered.
✅ Prompt 4 | Test QWK=0.7746, Test MSE=0.3080

 Training Essay Set 5 with embedder 'contrastive'


                                                                                

Prompt 5, Epoch 1: Train Loss=0.0974, Val QWK=0.6632, Val MSE=0.4102


                                                                                

Prompt 5, Epoch 2: Train Loss=0.0395, Val QWK=0.6841, Val MSE=0.4638


                                                                                

Prompt 5, Epoch 3: Train Loss=0.0359, Val QWK=0.7012, Val MSE=0.4218


                                                                                 

Prompt 5, Epoch 4: Train Loss=0.0344, Val QWK=0.7405, Val MSE=0.3886


                                                                                 

Prompt 5, Epoch 5: Train Loss=0.0325, Val QWK=0.7360, Val MSE=0.3439


                                                                                 

Prompt 5, Epoch 6: Train Loss=0.0337, Val QWK=0.7521, Val MSE=0.3579


                                                                                 

Prompt 5, Epoch 7: Train Loss=0.0334, Val QWK=0.7534, Val MSE=0.3285


                                                                                

Prompt 5, Epoch 8: Train Loss=0.0309, Val QWK=0.7697, Val MSE=0.3136


                                                                                 

Prompt 5, Epoch 9: Train Loss=0.0290, Val QWK=0.7833, Val MSE=0.3065


                                                                                  

Prompt 5, Epoch 10: Train Loss=0.0295, Val QWK=0.7775, Val MSE=0.3179
✅ Prompt 5 | Test QWK=0.7734, Test MSE=0.2902

 Training Essay Set 6 with embedder 'contrastive'


                                                                                 

Prompt 6, Epoch 1: Train Loss=0.0722, Val QWK=0.6450, Val MSE=0.4465


                                                                                

Prompt 6, Epoch 2: Train Loss=0.0436, Val QWK=0.7190, Val MSE=0.4492


                                                                                

Prompt 6, Epoch 3: Train Loss=0.0463, Val QWK=0.7039, Val MSE=0.4562


                                                                                 

Prompt 6, Epoch 4: Train Loss=0.0407, Val QWK=0.7256, Val MSE=0.4607


                                                                                 

Prompt 6, Epoch 5: Train Loss=0.0388, Val QWK=0.7055, Val MSE=0.4501


                                                                                

Prompt 6, Epoch 6: Train Loss=0.0404, Val QWK=0.7256, Val MSE=0.4098


                                                                                

Prompt 6, Epoch 7: Train Loss=0.0379, Val QWK=0.7287, Val MSE=0.4267


                                                                                

Prompt 6, Epoch 8: Train Loss=0.0409, Val QWK=0.7354, Val MSE=0.3975


                                                                                 

Prompt 6, Epoch 9: Train Loss=0.0363, Val QWK=0.7176, Val MSE=0.4219


                                                                                  

Prompt 6, Epoch 10: Train Loss=0.0348, Val QWK=0.7098, Val MSE=0.4221
✅ Prompt 6 | Test QWK=0.7672, Test MSE=0.3254

 Training Essay Set 7 with embedder 'contrastive'


                                                                                

Prompt 7, Epoch 1: Train Loss=0.1105, Val QWK=0.7788, Val MSE=6.1867


                                                                                 

Prompt 7, Epoch 2: Train Loss=0.0310, Val QWK=0.8170, Val MSE=5.9922


                                                                                 

Prompt 7, Epoch 3: Train Loss=0.0294, Val QWK=0.8245, Val MSE=5.8221


                                                                                 

Prompt 7, Epoch 4: Train Loss=0.0275, Val QWK=0.8284, Val MSE=6.2256


                                                                                 

Prompt 7, Epoch 5: Train Loss=0.0283, Val QWK=0.8244, Val MSE=5.8759


                                                                                 

Prompt 7, Epoch 6: Train Loss=0.0278, Val QWK=0.8347, Val MSE=5.6759


                                                                                 

Prompt 7, Epoch 7: Train Loss=0.0257, Val QWK=0.8307, Val MSE=5.7506


                                                                                 

Prompt 7, Epoch 8: Train Loss=0.0234, Val QWK=0.8376, Val MSE=5.4858


                                                                                 

Prompt 7, Epoch 9: Train Loss=0.0264, Val QWK=0.8324, Val MSE=5.9653


                                                                                  

Prompt 7, Epoch 10: Train Loss=0.0220, Val QWK=0.8388, Val MSE=5.4701
✅ Prompt 7 | Test QWK=0.8916, Test MSE=4.4838

 Training Essay Set 8 with embedder 'contrastive'


                                                                               

Prompt 8, Epoch 1: Train Loss=0.0596, Val QWK=0.7813, Val MSE=13.2335


                                                                               

Prompt 8, Epoch 2: Train Loss=0.0228, Val QWK=0.8552, Val MSE=12.1525


                                                                                

Prompt 8, Epoch 3: Train Loss=0.0195, Val QWK=0.8626, Val MSE=11.3201


                                                                               

Prompt 8, Epoch 4: Train Loss=0.0172, Val QWK=0.8466, Val MSE=11.1525


                                                                               

Prompt 8, Epoch 5: Train Loss=0.0177, Val QWK=0.8702, Val MSE=11.2592


                                                                               

Prompt 8, Epoch 6: Train Loss=0.0197, Val QWK=0.8736, Val MSE=11.4380


                                                                                

Prompt 8, Epoch 7: Train Loss=0.0165, Val QWK=0.8775, Val MSE=11.8520


                                                                               

Prompt 8, Epoch 8: Train Loss=0.0168, Val QWK=0.8823, Val MSE=10.1371


                                                                               

Prompt 8, Epoch 9: Train Loss=0.0160, Val QWK=0.8769, Val MSE=9.7714


                                                                                

Prompt 8, Epoch 10: Train Loss=0.0151, Val QWK=0.8875, Val MSE=9.9836
✅ Prompt 8 | Test QWK=0.8834, Test MSE=6.6007
      embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive       1      0.628725  0.603490  1.256645
1  contrastive       2      0.748780  0.682426  0.270378
2  contrastive       3      0.743886  0.707992  0.278055
3  contrastive       4      0.785219  0.774605  0.308026
4  contrastive       5      0.783263  0.773410  0.290208
5  contrastive       6      0.735418  0.767184  0.325438
6  contrastive       7      0.838842  0.891631  4.483780
7  contrastive       8      0.887484  0.883411  6.600673


In [33]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive1",
    prompts=range(1,2),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 1 with embedder 'contrastive1'


                                                                                 

Prompt 1, Epoch 1: Train Loss=0.0369, Val QWK=0.6480, Val MSE=1.0220


                                                                                 

Prompt 1, Epoch 2: Train Loss=0.0182, Val QWK=0.6922, Val MSE=0.9805


                                                                                 

Prompt 1, Epoch 3: Train Loss=0.0128, Val QWK=0.7172, Val MSE=0.9133


                                                                                 

Prompt 1, Epoch 4: Train Loss=0.0102, Val QWK=0.7998, Val MSE=0.8141


                                                                                 

Prompt 1, Epoch 5: Train Loss=0.0083, Val QWK=0.6203, Val MSE=1.6266


                                                                                 

Prompt 1, Epoch 6: Train Loss=0.0080, Val QWK=0.6820, Val MSE=1.0902


                                                                                 

Prompt 1, Epoch 7: Train Loss=0.0080, Val QWK=0.6565, Val MSE=1.2488
Early stopping triggered.
✅ Prompt 1 | Test QWK=0.7849, Test MSE=0.7644
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive1       1      0.799799  0.784902  0.764367


In [34]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive2",
    prompts=range(2,3),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 2 with embedder 'contrastive2'


                                                                                 

Prompt 2, Epoch 1: Train Loss=0.0219, Val QWK=0.8249, Val MSE=0.2077


                                                                                 

Prompt 2, Epoch 2: Train Loss=0.0118, Val QWK=0.8636, Val MSE=0.1664


                                                                                 

Prompt 2, Epoch 3: Train Loss=0.0080, Val QWK=0.8426, Val MSE=0.1849


                                                                                 

Prompt 2, Epoch 4: Train Loss=0.0074, Val QWK=0.8740, Val MSE=0.1576


                                                                                  

Prompt 2, Epoch 5: Train Loss=0.0070, Val QWK=0.8770, Val MSE=0.1388


                                                                                 

Prompt 2, Epoch 6: Train Loss=0.0057, Val QWK=0.8374, Val MSE=0.1884


                                                                                 

Prompt 2, Epoch 7: Train Loss=0.0048, Val QWK=0.8697, Val MSE=0.1878


                                                                                  

Prompt 2, Epoch 8: Train Loss=0.0044, Val QWK=0.8732, Val MSE=0.1491
Early stopping triggered.
✅ Prompt 2 | Test QWK=0.8522, Test MSE=0.1447
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive2       2      0.876996  0.852208  0.144653


In [35]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive3",
    prompts=range(3,4),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 3 with embedder 'contrastive3'


                                                                                 

Prompt 3, Epoch 1: Train Loss=0.0444, Val QWK=0.9024, Val MSE=0.1264


                                                                                 

Prompt 3, Epoch 2: Train Loss=0.0222, Val QWK=0.9134, Val MSE=0.1019


                                                                                 

Prompt 3, Epoch 3: Train Loss=0.0160, Val QWK=0.9121, Val MSE=0.1043


                                                                                 

Prompt 3, Epoch 4: Train Loss=0.0112, Val QWK=0.9123, Val MSE=0.1031


                                                                                 

Prompt 3, Epoch 5: Train Loss=0.0099, Val QWK=0.8993, Val MSE=0.1349
Early stopping triggered.
✅ Prompt 3 | Test QWK=0.9342, Test MSE=0.0868
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive3       3      0.913356  0.934164  0.086796


In [36]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive4",
    prompts=range(4,5),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 4 with embedder 'contrastive4'


                                                                                 

Prompt 4, Epoch 1: Train Loss=0.0275, Val QWK=0.9341, Val MSE=0.1044


                                                                                 

Prompt 4, Epoch 2: Train Loss=0.0160, Val QWK=0.9128, Val MSE=0.1258


                                                                                 

Prompt 4, Epoch 3: Train Loss=0.0119, Val QWK=0.9215, Val MSE=0.1384


                                                                                  

Prompt 4, Epoch 4: Train Loss=0.0083, Val QWK=0.9116, Val MSE=0.1196
Early stopping triggered.
✅ Prompt 4 | Test QWK=0.9021, Test MSE=0.1398
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive4       4      0.934116  0.902087  0.139788


In [38]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive5",
    prompts=range(5,6),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 5 with embedder 'contrastive5'


                                                                                 

Prompt 5, Epoch 1: Train Loss=0.0322, Val QWK=0.9133, Val MSE=0.1596


                                                                                 

Prompt 5, Epoch 2: Train Loss=0.0118, Val QWK=0.9396, Val MSE=0.1022


                                                                                 

Prompt 5, Epoch 3: Train Loss=0.0091, Val QWK=0.9237, Val MSE=0.1328


                                                                                 

Prompt 5, Epoch 4: Train Loss=0.0073, Val QWK=0.8541, Val MSE=0.2532


                                                                                 

Prompt 5, Epoch 5: Train Loss=0.0072, Val QWK=0.8956, Val MSE=0.2073
Early stopping triggered.
✅ Prompt 5 | Test QWK=0.9268, Test MSE=0.0992
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive5       5      0.939646  0.926842  0.099158


In [39]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive6",
    prompts=range(6,7),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 6 with embedder 'contrastive6'


                                                                                 

Prompt 6, Epoch 1: Train Loss=0.0464, Val QWK=0.8469, Val MSE=0.2192


                                                                                 

Prompt 6, Epoch 2: Train Loss=0.0208, Val QWK=0.7457, Val MSE=0.3672


                                                                                 

Prompt 6, Epoch 3: Train Loss=0.0154, Val QWK=0.8627, Val MSE=0.2222


                                                                                 

Prompt 6, Epoch 4: Train Loss=0.0128, Val QWK=0.8643, Val MSE=0.2216


                                                                                 

Prompt 6, Epoch 5: Train Loss=0.0100, Val QWK=0.8724, Val MSE=0.1930


                                                                                 

Prompt 6, Epoch 6: Train Loss=0.0100, Val QWK=0.8729, Val MSE=0.2042


                                                                                 

Prompt 6, Epoch 7: Train Loss=0.0079, Val QWK=0.8670, Val MSE=0.3158


                                                                                 

Prompt 6, Epoch 8: Train Loss=0.0076, Val QWK=0.8808, Val MSE=0.1984


                                                                                  

Prompt 6, Epoch 9: Train Loss=0.0062, Val QWK=0.8625, Val MSE=0.2452


                                                                                  

Prompt 6, Epoch 10: Train Loss=0.0071, Val QWK=0.8748, Val MSE=0.1834
✅ Prompt 6 | Test QWK=0.8739, Test MSE=0.1863
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive6       6      0.880808  0.873887   0.18626


In [40]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive7",
    prompts=range(7,8),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 7 with embedder 'contrastive7'


                                                                                 

Prompt 7, Epoch 1: Train Loss=0.0377, Val QWK=0.7709, Val MSE=11.8826


                                                                                 

Prompt 7, Epoch 2: Train Loss=0.0138, Val QWK=0.8895, Val MSE=4.6976


                                                                                 

Prompt 7, Epoch 3: Train Loss=0.0110, Val QWK=0.8181, Val MSE=5.0987


                                                                                  

Prompt 7, Epoch 4: Train Loss=0.0090, Val QWK=0.8961, Val MSE=3.1969


                                                                                 

Prompt 7, Epoch 5: Train Loss=0.0072, Val QWK=0.9027, Val MSE=2.8713


                                                                                 

Prompt 7, Epoch 6: Train Loss=0.0058, Val QWK=0.9027, Val MSE=3.6051


                                                                                 

Prompt 7, Epoch 7: Train Loss=0.0063, Val QWK=0.9027, Val MSE=3.4349


                                                                                 

Prompt 7, Epoch 8: Train Loss=0.0060, Val QWK=0.9085, Val MSE=3.1186


                                                                                 

Prompt 7, Epoch 9: Train Loss=0.0051, Val QWK=0.8803, Val MSE=3.4439


                                                                                  

Prompt 7, Epoch 10: Train Loss=0.0053, Val QWK=0.9045, Val MSE=3.3454
✅ Prompt 7 | Test QWK=0.9234, Test MSE=2.9050
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive7       7      0.908535  0.923419  2.904962


In [41]:
# choose device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Example: train Roberta per-prompt
summary_deberta = train_all_prompts(
    embedder="contrastive8",
    prompts=range(8,9),
    num_epochs=10,
    batch_size=8,
    lr=2e-5,
    patience=3,
    max_len=256,
    device=device
)

# See summary
print(summary_deberta[["embedder","prompt","best_val_qwk","test_qwk","test_mse"]])


 Training Essay Set 8 with embedder 'contrastive8'


                                                                               

Prompt 8, Epoch 1: Train Loss=0.0640, Val QWK=0.0543, Val MSE=41.9242


                                                                               

Prompt 8, Epoch 2: Train Loss=0.0185, Val QWK=0.5270, Val MSE=28.7059


                                                                               

Prompt 8, Epoch 3: Train Loss=0.0121, Val QWK=0.7096, Val MSE=23.5106


                                                                                

Prompt 8, Epoch 4: Train Loss=0.0097, Val QWK=0.8823, Val MSE=8.1306


                                                                               

Prompt 8, Epoch 5: Train Loss=0.0065, Val QWK=0.6627, Val MSE=19.8974


                                                                                

Prompt 8, Epoch 6: Train Loss=0.0066, Val QWK=0.7511, Val MSE=15.7626


                                                                               

Prompt 8, Epoch 7: Train Loss=0.0057, Val QWK=0.8338, Val MSE=10.1755
Early stopping triggered.
✅ Prompt 8 | Test QWK=0.8200, Test MSE=8.3288
       embedder  prompt  best_val_qwk  test_qwk  test_mse
0  contrastive8       8      0.882287  0.820044  8.328772
