In [None]:
from itertools import combinations

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaModel, XLMRobertaTokenizer, Trainer, TrainingArguments, PreTrainedModel

import transformers
transformers.logging.set_verbosity_error()

# Read the full dataset from the CSV file.
file_path = "cleaned_full_grouped_sqp.csv"
df = pd.read_csv(file_path)

# Split on group_id values rather than on rows: 20% of the distinct group ids
# go to the test side, and every row follows the side its group_id landed on.
unique_ids = df["group_id"].unique()
train_ids, test_ids = train_test_split(unique_ids, random_state=42, test_size=0.2)
train_df = df.loc[df["group_id"].isin(train_ids)].copy()
test_df = df.loc[df["group_id"].isin(test_ids)].copy()

# Preprocess text
def preprocess_text(row):
    """Build the model input text from a row's introduction and request texts.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'Introduction text' and 'Request for answer text'.

    Returns:
        str: ``"<introduction> <request>"`` when the introduction is longer
        than 10 characters, otherwise the request prefixed with the
        ``"</no introduction text/> "`` marker.

    Missing cells (None / NaN) are treated as empty text and non-string cells
    are converted with ``str``, so the concatenation can no longer raise a
    ``TypeError``.
    """
    def _as_text(value):
        # pd.isna covers None, float NaN and pd.NA alike.
        return "" if pd.isna(value) else str(value)

    intro = _as_text(row['Introduction text'])
    request = _as_text(row['Request for answer text'])

    # Introductions of 10 characters or fewer are replaced by the marker.
    if len(intro) > 10:
        return intro + " " + request
    return "</no introduction text/> " + request

# Add the combined text as a new column on both splits; preprocess_text is
# applied once per row (axis=1). The datasets below read this column.
train_df['Processed Request for answer text'] = train_df.apply(preprocess_text, axis=1)
test_df['Processed Request for answer text'] = test_df.apply(preprocess_text, axis=1)

# ========== Dataset ==========

class JointQualityDataset(Dataset):
    """Dataset serving single items (regression) or item pairs (pairwise).

    Both modes tokenize the 'Processed Request for answer text' column
    together with the 'Answer options text' column, padded/truncated to
    ``max_length`` tokens.

    * ``task="regression"``: one item per row; ``labels`` is the row's
      'quality(q^2)' value.
    * ``task="pairwise"``: one item per unordered pair of rows that share the
      same ('Study', 'Country', 'ItemConcept'); ``labels`` is 1.0 when the
      first row's 'quality(q^2)' is strictly greater than the second's,
      otherwise 0.0.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, text_pair,
            truncation=True, padding="max_length", max_length=...,
            return_tensors="pt")`` and returning a mapping of tensors with a
            leading batch dimension of 1.
        df: DataFrame holding the columns named above. Its index is reset, so
            items are addressed by row position.
        task: ``"regression"`` or ``"pairwise"``.
        max_length: Token length passed to the tokenizer (default 128).
        skip_ties: When True, pairs with equal quality are left out instead of
            being labelled 0. Defaults to False (ties are kept).

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """

    _TASKS = ("regression", "pairwise")

    def __init__(self, tokenizer, df, task="regression", max_length=128, skip_ties=False):
        # Fail here rather than later in __len__, where an unknown task would
        # otherwise surface as a missing `pairs` attribute.
        if task not in self._TASKS:
            raise ValueError(f"task must be one of {self._TASKS}, got {task!r}")
        self.task = task
        self.tokenizer = tokenizer
        self.df = df.reset_index(drop=True)
        self.max_length = max_length
        self.skip_ties = skip_ties
        if task == "pairwise":
            self.pairs = self._make_pairs()

    def _make_pairs(self):
        """Return ``(position_i, position_j, label)`` for every within-group pair.

        Positions refer to rows of ``self.df``; storing them instead of row
        copies keeps memory proportional to the number of pairs only.
        """
        pairs = []
        for _, group in self.df.groupby(["Study", "Country", "ItemConcept"]):
            if len(group) < 2:
                continue
            # self.df has a fresh 0..n-1 index, so index labels are positions.
            positions = group.index.to_list()
            qualities = group["quality(q^2)"].to_list()
            for a, b in combinations(range(len(positions)), 2):
                if self.skip_ties and qualities[a] == qualities[b]:
                    continue
                label = 1 if qualities[a] > qualities[b] else 0
                pairs.append((positions[a], positions[b], label))
        return pairs

    def _encode(self, row):
        """Tokenize one row's request text together with its answer options."""
        return self.tokenizer(
            row["Processed Request for answer text"],
            row["Answer options text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __len__(self):
        return len(self.df) if self.task == "regression" else len(self.pairs)

    def __getitem__(self, idx):
        if self.task == "regression":
            row = self.df.iloc[idx]
            # Drop the batch dimension the tokenizer adds with return_tensors="pt".
            item = {k: v.squeeze(0) for k, v in self._encode(row).items()}
            item['labels'] = torch.tensor(row["quality(q^2)"], dtype=torch.float)
            return item

        pos_i, pos_j, label = self.pairs[idx]
        enc_i = self._encode(self.df.iloc[pos_i])
        enc_j = self._encode(self.df.iloc[pos_j])
        return {
            'input_ids_i': enc_i['input_ids'].squeeze(0),
            'attention_mask_i': enc_i['attention_mask'].squeeze(0),
            'input_ids_j': enc_j['input_ids'].squeeze(0),
            'attention_mask_j': enc_j['attention_mask'].squeeze(0),
            # Float label, as the pairwise loss is computed on float targets.
            'labels': torch.tensor(label, dtype=torch.float),
        }




In [None]:
# ========== Model ==========

class JointRobertaModel(nn.Module):
    """Shared encoder with a regression head and a pairwise-comparison head.

    Both heads are single linear layers applied to the encoder's hidden state
    at sequence position 0.

    Args:
        model_name: Name or path passed to ``XLMRobertaModel.from_pretrained``.
    """

    def __init__(self, model_name="xlm-roberta-base"):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        self.reg_head = nn.Linear(hidden_size, 1)
        self.pair_head = nn.Linear(hidden_size, 1)

    def _encode_first_token(self, input_ids, attention_mask):
        """Run the encoder and return the hidden state at position 0."""
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        return out.last_hidden_state[:, 0, :]

    def forward(self, input_ids=None, attention_mask=None,
                input_ids_i=None, attention_mask_i=None,
                input_ids_j=None, attention_mask_j=None,
                labels=None, task="regression"):
        """Compute the output of the head selected by ``task``.

        Args:
            input_ids, attention_mask: Inputs for ``task="regression"``.
            input_ids_i, attention_mask_i, input_ids_j, attention_mask_j:
                Inputs for the two sides of ``task="pairwise"``.
            labels: Optional targets. Regression uses them with MSE; pairwise
                uses them with binary cross-entropy on the logit.
            task: ``"regression"`` or ``"pairwise"``.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (one value per batch element).

        Raises:
            ValueError: If ``task`` is not a supported value. Previously this
                fell through and returned ``None``.
        """
        if task == "regression":
            cls_output = self._encode_first_token(input_ids, attention_mask)
            score = self.reg_head(cls_output).squeeze(-1)
            loss = None
            if labels is not None:
                loss = nn.MSELoss()(score, labels)
            return {"loss": loss, "logits": score}

        if task == "pairwise":
            out_i = self._encode_first_token(input_ids_i, attention_mask_i)
            out_j = self._encode_first_token(input_ids_j, attention_mask_j)
            # The head scores the difference of the two representations.
            diff = self.pair_head(out_i - out_j).squeeze(-1)
            loss = None
            if labels is not None:
                loss = nn.BCEWithLogitsLoss()(diff, labels)
            return {"loss": loss, "logits": diff}

        raise ValueError(f"task must be 'regression' or 'pairwise', got {task!r}")




In [None]:

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# One single-item (regression) and one pairwise dataset per split.
train_reg_dataset = JointQualityDataset(tokenizer, train_df, task="regression")
test_reg_dataset = JointQualityDataset(tokenizer, test_df, task="regression")

train_pair_dataset = JointQualityDataset(tokenizer, train_df, task="pairwise")
test_pair_dataset = JointQualityDataset(tokenizer, test_df, task="pairwise")

# Shuffled training loaders; DataLoader comes from the notebook's import cell.
train_reg_loader = DataLoader(train_reg_dataset, batch_size=16, shuffle=True)
train_pair_loader = DataLoader(train_pair_dataset, batch_size=16, shuffle=True)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = JointRobertaModel()
model.to(device)

# AdamW over all model parameters; the scheduler is built with T_0=500 and is
# stepped by the training loop.
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.005, lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=500)


In [None]:
import os
from itertools import cycle
from sklearn.metrics import mean_squared_error, accuracy_score

# Joint training: every step adds the regression loss on one single-item batch
# to the pairwise loss on one pair batch and takes one optimizer step.
# Every `eval_interval` steps a checkpoint is written and both test sets are
# scored.
save_dir = "./saved_models_joint"
os.makedirs(save_dir, exist_ok=True)

global_step = 0
eval_interval = 300

# Evaluation loaders are built once instead of on every evaluation pass.
test_reg_loader = DataLoader(test_reg_dataset, batch_size=32)
test_pair_loader = DataLoader(test_pair_dataset, batch_size=32)

for epoch in range(30):  # num_train_epochs
    model.train()
    # An epoch is one full pass over the regression loader; pair batches are
    # drawn alongside it.
    reg_iter = iter(train_reg_loader)
    pair_iter = iter(train_pair_loader)

    for step, reg_batch in enumerate(reg_iter):
        global_step += 1

        # === Regression step ===
        for k in ['input_ids', 'attention_mask', 'labels']:
            reg_batch[k] = reg_batch[k].to(device)

        out_reg = model(
            input_ids=reg_batch['input_ids'],
            attention_mask=reg_batch['attention_mask'],
            labels=reg_batch['labels'],
            task="regression"
        )
        loss_reg = out_reg['loss']

        # === Pairwise step ===
        # Restart the pair loader when it runs out. itertools.cycle is not
        # used because it keeps every batch it has yielded (including the
        # tensors moved to `device` below) and replays those saved batches
        # instead of drawing a fresh shuffle.
        try:
            pair_batch = next(pair_iter)
        except StopIteration:
            pair_iter = iter(train_pair_loader)
            pair_batch = next(pair_iter)
        for k in pair_batch:
            pair_batch[k] = pair_batch[k].to(device)

        out_pair = model(
            input_ids_i=pair_batch['input_ids_i'],
            attention_mask_i=pair_batch['attention_mask_i'],
            input_ids_j=pair_batch['input_ids_j'],
            attention_mask_j=pair_batch['attention_mask_j'],
            labels=pair_batch['labels'],
            task="pairwise"
        )
        loss_pair = out_pair['loss']

        # === Total loss + update ===
        total_loss = loss_reg + loss_pair
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        scheduler.step()  # the scheduler advances once per optimizer step

        if global_step % eval_interval == 0:
            print(f"\n>>> Epoch {epoch+1} | Step {global_step} | Loss_reg: {loss_reg.item():.4f} | Loss_pair: {loss_pair.item():.4f}")

            # === Save model ===
            save_path = os.path.join(save_dir, f"checkpoint_step{global_step}.pt")
            torch.save(model.state_dict(), save_path)
            print(f"[Saved] Model saved to {save_path}")

            # === Eval on test_df ===
            model.eval()
            # -- Regression eval --
            all_preds, all_labels = [], []
            with torch.no_grad():
                for batch in test_reg_loader:
                    out = model(
                        input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        task="regression"
                    )
                    all_preds.extend(out['logits'].cpu().numpy())
                    # Labels are only collected for the metric, so they stay on the CPU.
                    all_labels.extend(batch['labels'].numpy())
            mse = mean_squared_error(all_labels, all_preds)

            # -- Pairwise eval --
            correct, total = 0, 0
            with torch.no_grad():
                for batch in test_pair_loader:
                    for k in batch:
                        batch[k] = batch[k].to(device)
                    out = model(
                        input_ids_i=batch["input_ids_i"],
                        attention_mask_i=batch["attention_mask_i"],
                        input_ids_j=batch["input_ids_j"],
                        attention_mask_j=batch["attention_mask_j"],
                        task="pairwise"
                    )
                    probs = torch.sigmoid(out["logits"])
                    preds = (probs > 0.5).long()
                    labels = batch["labels"].long()
                    correct += (preds == labels).sum().item()
                    total += len(labels)
            acc = correct / total if total > 0 else 0.0

            print(f"[Eval @ step {global_step}] MSE: {mse:.4f} | Pairwise Acc: {acc:.4f}\n")

            # Back to training mode: without this the remaining steps of the
            # epoch would run with the model still in eval mode.
            model.train()


In [None]:
import os
from itertools import cycle
from sklearn.metrics import mean_squared_error, accuracy_score

# Joint training with the pairwise loss weighted by 0.5: every step adds the
# regression loss on one single-item batch to half the pairwise loss on one
# pair batch and takes one optimizer step.
# NOTE: this cell uses the module-level `model`, `optimizer` and `scheduler`
# in whatever state they are currently in, and restarts `global_step` at 0, so
# checkpoints are written under the same file names as any earlier run that
# used this `save_dir`.
save_dir = "./saved_models_joint"
os.makedirs(save_dir, exist_ok=True)

global_step = 0
eval_interval = 300

# Evaluation loaders are built once instead of on every evaluation pass.
test_reg_loader = DataLoader(test_reg_dataset, batch_size=32)
test_pair_loader = DataLoader(test_pair_dataset, batch_size=32)

for epoch in range(30):  # num_train_epochs
    model.train()
    # An epoch is one full pass over the regression loader; pair batches are
    # drawn alongside it.
    reg_iter = iter(train_reg_loader)
    pair_iter = iter(train_pair_loader)

    for step, reg_batch in enumerate(reg_iter):
        global_step += 1

        # === Regression step ===
        for k in ['input_ids', 'attention_mask', 'labels']:
            reg_batch[k] = reg_batch[k].to(device)

        out_reg = model(
            input_ids=reg_batch['input_ids'],
            attention_mask=reg_batch['attention_mask'],
            labels=reg_batch['labels'],
            task="regression"
        )
        loss_reg = out_reg['loss']

        # === Pairwise step ===
        # Restart the pair loader when it runs out. itertools.cycle is not
        # used because it keeps every batch it has yielded (including the
        # tensors moved to `device` below) and replays those saved batches
        # instead of drawing a fresh shuffle.
        try:
            pair_batch = next(pair_iter)
        except StopIteration:
            pair_iter = iter(train_pair_loader)
            pair_batch = next(pair_iter)
        for k in pair_batch:
            pair_batch[k] = pair_batch[k].to(device)

        out_pair = model(
            input_ids_i=pair_batch['input_ids_i'],
            attention_mask_i=pair_batch['attention_mask_i'],
            input_ids_j=pair_batch['input_ids_j'],
            attention_mask_j=pair_batch['attention_mask_j'],
            labels=pair_batch['labels'],
            task="pairwise"
        )
        loss_pair = out_pair['loss']

        # === Total loss + update ===
        total_loss = loss_reg + 0.5 * loss_pair
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        scheduler.step()  # the scheduler advances once per optimizer step

        if global_step % eval_interval == 0:
            print(f"\n>>> Epoch {epoch+1} | Step {global_step} | Loss_reg: {loss_reg.item():.4f} | Loss_pair: {loss_pair.item():.4f}")

            # === Save model ===
            save_path = os.path.join(save_dir, f"checkpoint_step{global_step}.pt")
            torch.save(model.state_dict(), save_path)
            print(f"[Saved] Model saved to {save_path}")

            # === Eval on test_df ===
            model.eval()
            # -- Regression eval --
            all_preds, all_labels = [], []
            with torch.no_grad():
                for batch in test_reg_loader:
                    out = model(
                        input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        task="regression"
                    )
                    all_preds.extend(out['logits'].cpu().numpy())
                    # Labels are only collected for the metric, so they stay on the CPU.
                    all_labels.extend(batch['labels'].numpy())
            mse = mean_squared_error(all_labels, all_preds)

            # -- Pairwise eval --
            correct, total = 0, 0
            with torch.no_grad():
                for batch in test_pair_loader:
                    for k in batch:
                        batch[k] = batch[k].to(device)
                    out = model(
                        input_ids_i=batch["input_ids_i"],
                        attention_mask_i=batch["attention_mask_i"],
                        input_ids_j=batch["input_ids_j"],
                        attention_mask_j=batch["attention_mask_j"],
                        task="pairwise"
                    )
                    probs = torch.sigmoid(out["logits"])
                    preds = (probs > 0.5).long()
                    labels = batch["labels"].long()
                    correct += (preds == labels).sum().item()
                    total += len(labels)
            acc = correct / total if total > 0 else 0.0

            print(f"[Eval @ step {global_step}] MSE: {mse:.4f} | Pairwise Acc: {acc:.4f}\n")

            # Back to training mode: without this the remaining steps of the
            # epoch would run with the model still in eval mode.
            model.train()


In [None]:
# ========== Combined Training Loop ==========

def train_joint_model(train_df, tokenizer, model, num_epochs=5,
                      batch_size=8, lr=2e-5, pair_weight=0.5):
    """Train ``model`` jointly on the regression and pairwise objectives.

    Each step combines the regression loss on one single-item batch with
    ``pair_weight`` times the pairwise loss on one pair batch and takes one
    AdamW step. An epoch is one full pass over the regression loader; the pair
    loader is restarted whenever it runs out, so a short pair loader no longer
    cuts the epoch short.

    Args:
        train_df: DataFrame used to build both ``JointQualityDataset`` views.
        tokenizer: Tokenizer handed to ``JointQualityDataset``.
        model: Module called with ``task="regression"`` / ``task="pairwise"``
            and returning a dict with a ``"loss"`` entry. Trained in place.
        num_epochs: Number of passes over the regression loader.
        batch_size: Batch size of both loaders (default 8).
        lr: AdamW learning rate (default 2e-5).
        pair_weight: Weight (lambda) of the pairwise loss in the total loss
            (default 0.5).

    Returns:
        None. ``model`` is moved to the selected device and updated in place.

    Raises:
        ValueError: If ``train_df`` yields no regression items or no pairs.
    """
    reg_dataset = JointQualityDataset(tokenizer, train_df, task="regression")
    pair_dataset = JointQualityDataset(tokenizer, train_df, task="pairwise")
    if len(reg_dataset) == 0 or len(pair_dataset) == 0:
        raise ValueError(
            "train_joint_model needs at least one regression item and one pair; "
            f"got {len(reg_dataset)} items and {len(pair_dataset)} pairs"
        )
    reg_loader = torch.utils.data.DataLoader(reg_dataset, batch_size=batch_size, shuffle=True)
    pair_loader = torch.utils.data.DataLoader(pair_dataset, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        pair_iter = iter(pair_loader)
        for reg_batch in reg_loader:
            # Regression step
            for k in ['input_ids', 'attention_mask', 'labels']:
                reg_batch[k] = reg_batch[k].to(device)
            out_reg = model(input_ids=reg_batch['input_ids'],
                            attention_mask=reg_batch['attention_mask'],
                            labels=reg_batch['labels'],
                            task="regression")
            loss_reg = out_reg['loss']

            # Pairwise step: restart the pair loader once it is exhausted.
            try:
                pair_batch = next(pair_iter)
            except StopIteration:
                pair_iter = iter(pair_loader)
                pair_batch = next(pair_iter)
            for k in pair_batch:
                pair_batch[k] = pair_batch[k].to(device)
            out_pair = model(input_ids_i=pair_batch['input_ids_i'],
                             attention_mask_i=pair_batch['attention_mask_i'],
                             input_ids_j=pair_batch['input_ids_j'],
                             attention_mask_j=pair_batch['attention_mask_j'],
                             labels=pair_batch['labels'],
                             task="pairwise")
            loss_pair = out_pair['loss']

            # pair_weight is the lambda balancing the two objectives.
            total_loss = loss_reg + pair_weight * loss_pair
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        # The reported losses are those of the epoch's last step.
        print(f"Epoch {epoch+1} done. Regression loss: {loss_reg.item():.4f}, Pairwise loss: {loss_pair.item():.4f}")