In [31]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

from transformers import BertTokenizer, BertModel

In [32]:
def split_in_sets(data):
    essay_sets = []
    min_scores = []
    max_scores = []

    for s in range(1, 9):
        # Filter one essay set
        essay_set = data[data["essay_set"] == s].copy()

        # Keep only the necessary columns
        cols_to_keep = ["essay_id", "essay_set", "domain1_score"]
        if "essay_clean" in essay_set.columns:
            cols_to_keep.append("essay_clean")  # use cleaned essay text

        essay_set = essay_set[cols_to_keep]
        essay_set = essay_set.rename(columns={"essay_clean": "essay"})  # rename for consistency

        # Stats
        n, d = essay_set.shape
        set_scores = essay_set["domain1_score"]
        print(f"Set {s}: Essays = {n}, Attributes = {d}")

        # Track score ranges
        min_scores.append(set_scores.min())
        max_scores.append(set_scores.max())

        essay_sets.append(essay_set)

    return essay_sets, min_scores, max_scores


In [33]:
from bs4 import BeautifulSoup
def bert_friendly_clean(text, normalize_ner=True):
    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Normalize spaces
    text = re.sub(r"\s+", " ", text).strip()

    # 3. Handle contractions
    contractions = {
        "can't": "cannot", "won't": "will not", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"
    }
    for contr, expand in contractions.items():
        text = re.sub(contr, expand, text)

    # 4. Handle NER tokens (e.g. @PERSON1 → [ENTITY])
    if normalize_ner:
        text = re.sub(r"@\w+\d*", "[ENTITY]", text)

    return text

In [34]:
# Load dataset
dataset_path = "training_set_rel3.tsv"
data = pd.read_csv(dataset_path, sep="\t", encoding="ISO-8859-1")

# Clean all essays upfront
print("Cleaning essays for BERT...")
data["essay_clean"] = data["essay"].apply(lambda x: bert_friendly_clean(str(x)))

# Save cleaned dataset to disk
cleaned_path = "training_set_rel3_cleaned.tsv"
data.to_csv(cleaned_path, sep="\t", index=False, encoding="utf-8")
print(f"✅ Cleaned dataset saved to {cleaned_path}")

# Split into sets and get score ranges
essay_sets, min_scores, max_scores = split_in_sets(data)

# Unpack sets
set1, set2, set3, set4, set5, set6, set7, set8 = tuple(essay_sets)
sets = [set1, set2, set3, set4, set5, set6, set7, set8]

print("Score ranges:", list(zip(min_scores, max_scores)))

# Quick check: raw vs clean
print("\nOriginal essay sample:")
print(data.loc[0, "essay"][:300])

print("\nCleaned essay sample:")
print(data.loc[0, "essay_clean"][:300])

Cleaning essays for BERT...
✅ Cleaned dataset saved to training_set_rel3_cleaned.tsv
Set 1: Essays = 1783, Attributes = 4
Set 2: Essays = 1800, Attributes = 4
Set 3: Essays = 1726, Attributes = 4
Set 4: Essays = 1770, Attributes = 4
Set 5: Essays = 1805, Attributes = 4
Set 6: Essays = 1800, Attributes = 4
Set 7: Essays = 1569, Attributes = 4
Set 8: Essays = 723, Attributes = 4
Score ranges: [(2, 12), (1, 6), (0, 3), (0, 3), (0, 4), (0, 4), (2, 24), (10, 60)]

Original essay sample:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is alw

Cleaned essay sample:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(as

In [35]:

class EssayDataset():
    def __init__(self, texts, scores, tokenizer, max_len=256):
        self.texts = texts
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        score = self.scores[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "score": torch.tensor(score, dtype=torch.float32)
        }

In [36]:
from transformers import AutoModel


class BertRegressor(nn.Module):
    def __init__(self, model_name="distilroberta-base"):
        super(BertRegressor, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # CLS token
        pooled = self.dropout(pooled)
        return self.fc(pooled)

In [37]:
def train_fold(model, train_loader, val_loader, epochs, lr, device):
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = batch["score"].to(device).unsqueeze(1)

            optimizer.zero_grad()
            preds = model(input_ids, attention_mask)
            loss = criterion(preds, scores)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}: Train Loss = {total_loss/len(train_loader):.4f}")
        # ---- Evaluation ----
    model.eval()
    preds_all, y_all = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = batch["score"]

            preds = model(input_ids, attention_mask).cpu().numpy()
            preds_all.extend(preds.flatten())
            y_all.extend(scores.numpy())

    preds_all = np.rint(np.nan_to_num(preds_all))  # round to nearest int
    qwk = cohen_kappa_score(y_all, preds_all, weights="quadratic")
    return qwk

In [38]:
from sklearn.model_selection import KFold


def train_kfold(texts, scores, tokenizer, model_name="distilroberta-base",
                k=5, batch_size=16, epochs=3, lr=2e-5, device="cpu"):
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(texts), 1):
        print(f"\n-------- Fold {fold} --------\n")

        train_dataset = EssayDataset(texts[train_idx], scores[train_idx], tokenizer)
        val_dataset = EssayDataset(texts[val_idx], scores[val_idx], tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = BertRegressor(model_name=model_name).to(device)
        qwk = train_fold(model, train_loader, val_loader, epochs, lr, device)
        print(f"Fold {fold} QWK: {qwk:.4f}")
        results.append(qwk)

    return results

In [39]:
from transformers import AutoTokenizer


def train_each_set(data, model_name="distilroberta-base",
                   k=5, batch_size=16, epochs=3, lr=2e-5, device="cpu"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    results_per_set = {}

    for set_id in sorted(data["essay_set"].unique()):
        print(f"\n==============================")
        print(f" Training Essay Set {set_id} ")
        print(f"==============================")

        X_set = data.loc[data["essay_set"] == set_id, "essay_clean"].values
        y_set = data.loc[data["essay_set"] == set_id, "domain1_score"].values

        res = train_kfold(
            X_set, y_set,
            tokenizer=tokenizer,
            model_name=model_name,
            k=k,
            batch_size=batch_size,
            epochs=epochs,
            lr=lr,
            device=device
        )

        print(f"\nEssay Set {set_id} - Avg QWK: {np.mean(res):.4f}\n")
        results_per_set[set_id] = res

    return results_per_set

In [123]:
if __name__ == "__main__":
    model_name = "distilroberta-base"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    results = train_each_set(
        data,  # your cleaned DataFrame
        model_name=model_name,
        k=5,
        batch_size=4,
        epochs=3,
        lr=2e-5,
        device=device
    )

    print("\nFinal Results per Essay Set:")
    for set_id, res in results.items():
        print(f"Set {set_id}: {np.mean(res):.4f}")


 Training Essay Set 1 

-------- Fold 1 --------



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

                                                            

Epoch 1: Train Loss = 4.7527


                                                            

Epoch 2: Train Loss = 1.3490


                                                            

Epoch 3: Train Loss = 1.1657
Fold 1 QWK: 0.5659

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 4.6278


                                                            

Epoch 2: Train Loss = 1.3759


                                                            

Epoch 3: Train Loss = 1.1239
Fold 2 QWK: 0.5903

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 5.0677


                                                            

Epoch 2: Train Loss = 1.4534


                                                            

Epoch 3: Train Loss = 1.2233
Fold 3 QWK: 0.7116

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 4.6950


                                                            

Epoch 2: Train Loss = 1.3934


                                                            

Epoch 3: Train Loss = 1.1897
Fold 4 QWK: 0.5794

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 4.3828


                                                            

Epoch 2: Train Loss = 1.3653


                                                            

Epoch 3: Train Loss = 1.1225
Fold 5 QWK: 0.6740

Essay Set 1 - Avg QWK: 0.6242


 Training Essay Set 2 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 0.8396


                                                            

Epoch 2: Train Loss = 0.4148


                                                            

Epoch 3: Train Loss = 0.3426
Fold 1 QWK: 0.4604

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 0.9075


                                                            

Epoch 2: Train Loss = 0.3900


                                                            

Epoch 3: Train Loss = 0.3543
Fold 2 QWK: 0.4269

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 0.8993


                                                            

Epoch 2: Train Loss = 0.4374


                                                            

Epoch 3: Train Loss = 0.3472
Fold 3 QWK: 0.5294

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 1.0219


                                                            

Epoch 2: Train Loss = 0.4143


                                                            

Epoch 3: Train Loss = 0.3475
Fold 4 QWK: 0.5103

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 0.7790


                                                            

Epoch 2: Train Loss = 0.4605


                                                            

Epoch 3: Train Loss = 0.3735
Fold 5 QWK: 0.6134

Essay Set 2 - Avg QWK: 0.5081


 Training Essay Set 3 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 0.5987


                                                            

Epoch 2: Train Loss = 0.4106


                                                            

Epoch 3: Train Loss = 0.3686
Fold 1 QWK: 0.7022

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 0.5019


                                                            

Epoch 2: Train Loss = 0.3610


                                                            

Epoch 3: Train Loss = 0.3239
Fold 2 QWK: 0.6547

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 0.6289


                                                            

Epoch 2: Train Loss = 0.3976


                                                            

Epoch 3: Train Loss = 0.3407
Fold 3 QWK: 0.6899

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 0.6028


                                                            

Epoch 2: Train Loss = 0.4209


                                                            

Epoch 3: Train Loss = 0.3651
Fold 4 QWK: 0.7583

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 0.5379


                                                            

Epoch 2: Train Loss = 0.3909


                                                            

Epoch 3: Train Loss = 0.3223
Fold 5 QWK: 0.7306

Essay Set 3 - Avg QWK: 0.7072


 Training Essay Set 4 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 0.5032


                                                            

Epoch 2: Train Loss = 0.3309


                                                            

Epoch 3: Train Loss = 0.2678
Fold 1 QWK: 0.8029

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 0.4499


                                                            

Epoch 2: Train Loss = 0.2793


                                                            

Epoch 3: Train Loss = 0.2180
Fold 2 QWK: 0.8192

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 0.4376


                                                            

Epoch 2: Train Loss = 0.2636


                                                            

Epoch 3: Train Loss = 0.1977
Fold 3 QWK: 0.8293

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 0.4940


                                                            

Epoch 2: Train Loss = 0.3158


                                                            

Epoch 3: Train Loss = 0.2606
Fold 4 QWK: 0.8376

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 0.4455


                                                            

Epoch 2: Train Loss = 0.2955


                                                            

Epoch 3: Train Loss = 0.2311
Fold 5 QWK: 0.8221

Essay Set 4 - Avg QWK: 0.8222


 Training Essay Set 5 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 0.6424


                                                            

Epoch 2: Train Loss = 0.3137


                                                            

Epoch 3: Train Loss = 0.2576
Fold 1 QWK: 0.8018

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 0.6706


                                                            

Epoch 2: Train Loss = 0.3954


                                                            

Epoch 3: Train Loss = 0.3018
Fold 2 QWK: 0.8041

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 0.7073


                                                            

Epoch 2: Train Loss = 0.3898


                                                            

Epoch 3: Train Loss = 0.3193
Fold 3 QWK: 0.7118

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 0.7335


                                                            

Epoch 2: Train Loss = 0.3921


                                                            

Epoch 3: Train Loss = 0.3036
Fold 4 QWK: 0.7872

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 0.7174


                                                            

Epoch 2: Train Loss = 0.3552


                                                            

Epoch 3: Train Loss = 0.3076
Fold 5 QWK: 0.8394

Essay Set 5 - Avg QWK: 0.7889


 Training Essay Set 6 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 0.7703


                                                            

Epoch 2: Train Loss = 0.3629


                                                            

Epoch 3: Train Loss = 0.2788
Fold 1 QWK: 0.8194

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 0.6799


                                                            

Epoch 2: Train Loss = 0.3853


                                                            

Epoch 3: Train Loss = 0.3214
Fold 2 QWK: 0.8293

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 0.6944


                                                            

Epoch 2: Train Loss = 0.3414


                                                            

Epoch 3: Train Loss = 0.2820
Fold 3 QWK: 0.7844

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 0.7494


                                                            

Epoch 2: Train Loss = 0.3358


                                                            

Epoch 3: Train Loss = 0.2856
Fold 4 QWK: 0.7901

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 0.6350


                                                            

Epoch 2: Train Loss = 0.3108


                                                            

Epoch 3: Train Loss = 0.2489
Fold 5 QWK: 0.8114

Essay Set 6 - Avg QWK: 0.8069


 Training Essay Set 7 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 51.2830


                                                            

Epoch 2: Train Loss = 23.0460


                                                            

Epoch 3: Train Loss = 15.3277
Fold 1 QWK: 0.6542

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 51.4245


                                                            

Epoch 2: Train Loss = 22.8345


                                                            

Epoch 3: Train Loss = 15.1435
Fold 2 QWK: 0.5264

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 53.3605


                                                            

Epoch 2: Train Loss = 24.4927


                                                            

Epoch 3: Train Loss = 16.2061
Fold 3 QWK: 0.6221

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 53.9123


                                                            

Epoch 2: Train Loss = 24.3095


                                                            

Epoch 3: Train Loss = 15.9082
Fold 4 QWK: 0.6058

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 50.8897


                                                            

Epoch 2: Train Loss = 22.7609


                                                            

Epoch 3: Train Loss = 14.7496
Fold 5 QWK: 0.5265

Essay Set 7 - Avg QWK: 0.5870


 Training Essay Set 8 

-------- Fold 1 --------



                                                            

Epoch 1: Train Loss = 764.4016


                                                            

Epoch 2: Train Loss = 595.8368


                                                            

Epoch 3: Train Loss = 526.8772
Fold 1 QWK: 0.0000

-------- Fold 2 --------



                                                            

Epoch 1: Train Loss = 782.5676


                                                            

Epoch 2: Train Loss = 598.4644


                                                            

Epoch 3: Train Loss = 533.6239
Fold 2 QWK: 0.0000

-------- Fold 3 --------



                                                            

Epoch 1: Train Loss = 803.7937


                                                            

Epoch 2: Train Loss = 624.2733


                                                            

Epoch 3: Train Loss = 554.3699
Fold 3 QWK: 0.0000

-------- Fold 4 --------



                                                            

Epoch 1: Train Loss = 768.1297


                                                            

Epoch 2: Train Loss = 588.3778


                                                            

Epoch 3: Train Loss = 522.6014
Fold 4 QWK: 0.0000

-------- Fold 5 --------



                                                            

Epoch 1: Train Loss = 779.9535


                                                            

Epoch 2: Train Loss = 606.4785


                                                            

Epoch 3: Train Loss = 536.3102
Fold 5 QWK: 0.0000

Essay Set 8 - Avg QWK: 0.0000


Final Results per Essay Set:
Set 1: 0.6242
Set 2: 0.5081
Set 3: 0.7072
Set 4: 0.8222
Set 5: 0.7889
Set 6: 0.8069
Set 7: 0.5870
Set 8: 0.0000
