In [93]:
# Core data handling
import pandas as pd  # Load and manipulate TSV dataset (training_set_rel3.tsv)
import numpy as np  # Numerical operations for feature arrays and score normalization

# Text preprocessing
import nltk  # Stopwords, lemmatization, tokenization, and spelling correction
from nltk.corpus import stopwords, wordnet, words  # Resources for stopwords, lemmatization, and spelling
from spellchecker import SpellChecker
from nltk.tokenize import sent_tokenize, word_tokenize  # Sentence and word tokenization
from nltk.metrics.distance import edit_distance  # Edit distance for spelling correction
import re  # Regular expressions for cleaning ASAP tokens (e.g., @SOURCE1) and contractions
import string  # Punctuation removal


# Model-related (PyTorch)
import torch  # PyTorch for model and tensor conversion of preprocessed data
from torch.nn.utils.rnn import pad_sequence  # Pad sequences for LSTM input

# Evaluation and splitting
from sklearn.model_selection import train_test_split  # Stratified train-test splits by essay_set
from sklearn.metrics import cohen_kappa_score  # Quadratic Weighted Kappa for evaluation

# Optional: Visualization
import matplotlib.pyplot as plt  # Plot score distributions to identify imbalances
import seaborn as sns  # Enhanced visualization for score analysis

import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

# Fetch the NLTK resources used by this notebook. The captured output below shows each
# call reporting "already up-to-date" when the package is present locally.
nltk.download('stopwords')  # data for nltk.corpus.stopwords
nltk.download('wordnet')  # data for nltk.corpus.wordnet
nltk.download('omw-1.4')  # presumably companion data for WordNet lemmatization — TODO confirm it is still required
nltk.download('punkt')  # presumably the models behind sent_tokenize/word_tokenize — TODO confirm
nltk.download('words')  # data for nltk.corpus.words (last call: its return value is echoed as the cell output)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [139]:
def split_in_sets(data):
    """Split the essay frame into one DataFrame per essay set (ids 1 through 8).

    For every set the rater columns for domain 1 are removed, the domain-2
    columns are removed for every set except set 2, and the ``*trait*``
    columns are removed for every set except sets 7 and 8.

    When an ``essay_clean`` column is present it replaces the raw ``essay``
    column (and is renamed to ``essay``). When it is absent the raw ``essay``
    column is kept untouched, so the returned frames always carry a text
    column if the input had one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``essay_set`` and ``domain1_score``.

    Returns
    -------
    essay_sets : list of pandas.DataFrame
        Eight frames, index 0 holding set 1. Each is a copy of the input rows.
    min_scores, max_scores : list
        Per-set minimum / maximum of ``domain1_score`` (NaN for an empty set).
    """
    # These do not depend on the set id, so compute them once.
    trait_columns = [col for col in data.columns if "trait" in col]
    has_clean_text = "essay_clean" in data.columns

    essay_sets = []
    min_scores = []
    max_scores = []
    for s in range(1, 9):
        essay_set = data[data["essay_set"] == s].copy()  # Avoid modifying original

        # Drop irrelevant columns (specific to each set)
        columns_to_drop = ["rater1_domain1", "rater2_domain1", "rater3_domain1"]
        if s != 2:
            columns_to_drop.extend(["rater1_domain2", "rater2_domain2", "domain2_score"])
        if s not in (7, 8):
            columns_to_drop.extend(trait_columns)
        essay_set = essay_set.drop(columns=[col for col in columns_to_drop if col in essay_set.columns])

        # Swap in the cleaned text only when it actually exists; otherwise
        # dropping the raw essay would leave the frame without any text.
        if has_clean_text:
            if "essay" in essay_set.columns:
                essay_set = essay_set.drop(columns=["essay"])
            essay_set = essay_set.rename(columns={"essay_clean": "essay"})

        # Stats
        n, d = essay_set.shape
        set_scores = essay_set["domain1_score"]
        print(f"Set {s}: Essays = {n}, Attributes = {d}")

        min_scores.append(set_scores.min())
        max_scores.append(set_scores.max())
        essay_sets.append(essay_set)

    return essay_sets, min_scores, max_scores

In [140]:
from bs4 import BeautifulSoup
# Ordered contraction rules: irregular whole-word forms first, generic suffixes after.
# - IGNORECASE so "Can't" is expanded as a unit instead of falling through to the
#   generic "n't" rule (which would yield "Ca not").
# - The look-behind (?<=\w) requires a word character before the suffix, so a quote
#   that merely opens a quoted word (e.g. 'so) is not mistaken for "'s".
# - The trailing \b stops a suffix rule from firing in the middle of a longer token.
_CONTRACTION_RULES = [
    (re.compile(r"\bcan't\b", re.IGNORECASE), "cannot"),
    (re.compile(r"\bwon't\b", re.IGNORECASE), "will not"),
    (re.compile(r"(?<=\w)n't\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=\w)'re\b", re.IGNORECASE), " are"),
    (re.compile(r"(?<=\w)'s\b", re.IGNORECASE), " is"),
    (re.compile(r"(?<=\w)'d\b", re.IGNORECASE), " would"),
    (re.compile(r"(?<=\w)'ll\b", re.IGNORECASE), " will"),
    (re.compile(r"(?<=\w)'t\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=\w)'ve\b", re.IGNORECASE), " have"),
    (re.compile(r"(?<=\w)'m\b", re.IGNORECASE), " am"),
]


def _expand_contraction(match, expansion):
    """Return ``expansion``, capitalised when the matched text starts with an uppercase letter."""
    return expansion.capitalize() if match.group(0)[:1].isupper() else expansion


def bert_friendly_clean(text, normalize_ner=True):
    """Lightly normalise an essay while keeping its natural-language form.

    Steps, in order: strip HTML markup, collapse whitespace runs to a single
    space, expand common English contractions, and (optionally) replace
    ``@``-prefixed anonymisation tokens such as ``@PERSON1`` with ``[ENTITY]``.

    Parameters
    ----------
    text : str
        Raw essay text.
    normalize_ner : bool, default True
        When True, every ``@`` followed by word characters becomes ``[ENTITY]``.

    Returns
    -------
    str
        The cleaned text.

    Notes
    -----
    ``'s`` is always expanded to `` is``, so possessives ("John's") are
    rewritten as well; this matches the previous behaviour and is a known
    limitation of the rule-based expansion.
    """
    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Normalize spaces
    text = re.sub(r"\s+", " ", text).strip()

    # 3. Handle contractions (rule order matters: specific before generic)
    for pattern, expansion in _CONTRACTION_RULES:
        text = pattern.sub(lambda m, e=expansion: _expand_contraction(m, e), text)

    # 4. Handle NER tokens (e.g. @PERSON1 → [ENTITY])
    if normalize_ner:
        text = re.sub(r"@\w+\d*", "[ENTITY]", text)

    return text

In [141]:
# Input / output locations for the raw and the cleaned TSV
dataset_path = "training_set_rel3.tsv"
cleaned_path = "training_set_rel3_cleaned.tsv"

# Read the raw TSV
data = pd.read_csv(dataset_path, sep="\t", encoding="ISO-8859-1")

# Add a cleaned copy of every essay next to the raw text
print("Cleaning essays for BERT...")
data["essay_clean"] = [bert_friendly_clean(str(raw_essay)) for raw_essay in data["essay"]]

# Persist the frame (raw + cleaned columns) so the cleaning can be reused
data.to_csv(cleaned_path, sep="\t", index=False, encoding="utf-8")
print(f"✅ Cleaned dataset saved to {cleaned_path}")

# Per-set frames plus the observed score range of each set
essay_sets, min_scores, max_scores = split_in_sets(data)

# Individual names for the eight sets, and a list holding the same frames
set1, set2, set3, set4, set5, set6, set7, set8 = essay_sets
sets = list(essay_sets)

print("Score ranges:", [(low, high) for low, high in zip(min_scores, max_scores)])

# Side-by-side look at the first essay before and after cleaning
for heading, column in (
    ("\nOriginal essay sample:", "essay"),
    ("\nCleaned essay sample:", "essay_clean"),
):
    print(heading)
    print(data.loc[0, column][:300])

Cleaning essays for BERT...
✅ Cleaned dataset saved to training_set_rel3_cleaned.tsv
Set 1: Essays = 1783, Attributes = 4
Set 2: Essays = 1800, Attributes = 7
Set 3: Essays = 1726, Attributes = 4
Set 4: Essays = 1770, Attributes = 4
Set 5: Essays = 1805, Attributes = 4
Set 6: Essays = 1800, Attributes = 4
Set 7: Essays = 1569, Attributes = 22
Set 8: Essays = 723, Attributes = 22
Score ranges: [(2, 12), (1, 6), (0, 3), (0, 3), (0, 4), (0, 4), (2, 24), (10, 60)]

Original essay sample:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is alw

Cleaned essay sample:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(

In [136]:
texts = data["essay_clean"].tolist()
labels = data["domain1_score"].values
# Essay-set id of every row. The sets use very different score ranges, so each split
# is stratified on it to keep the per-set proportions identical across splits.
essay_set_ids = data["essay_set"].values

# Train/val/test split (70 / 15 / 15), stratified by essay set.
# The set ids are split alongside the data so the second split can be stratified too
# and so per-set evaluation remains possible later.
X_train, X_temp, y_train, y_temp, set_ids_train, set_ids_temp = train_test_split(
    texts, labels, essay_set_ids,
    test_size=0.3, random_state=42, stratify=essay_set_ids,
)
X_val, X_test, y_val, y_test, set_ids_val, set_ids_test = train_test_split(
    X_temp, y_temp, set_ids_temp,
    test_size=0.5, random_state=42, stratify=set_ids_temp,
)

In [89]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

# Run on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DistilBERT keeps things light; BertTokenizer/BertModel can be swapped in instead
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Feature extraction only (no fine-tuning yet), so switch to inference behaviour
bert_model.eval()

def get_bert_embeddings(texts, max_len=128, batch_size=32):
    """Encode texts with the module-level ``tokenizer`` / ``bert_model`` and return their [CLS] vectors.

    Parameters
    ----------
    texts : sequence of str
        Documents to embed. Any iterable is accepted; it is materialised as a list.
    max_len : int, default 128
        Maximum number of tokens per document. Longer inputs are truncated by the
        tokenizer, so text past this limit does not contribute to the embedding.
    batch_size : int, default 32
        Number of documents encoded per forward pass.

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per input text, holding position 0 of the model's
        last hidden state. An empty input yields a tensor with zero rows.
    """
    texts = list(texts)  # the tokenizer expects a plain list of strings
    if not texts:
        # torch.cat on an empty list raises; return a well-formed empty result instead.
        return torch.empty((0, bert_model.config.hidden_size))

    # Use the device the model actually lives on rather than the notebook-global
    # `device`, which other cells reassign.
    model_device = next(bert_model.parameters()).device

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]

        encodings = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )

        input_ids = encodings["input_ids"].to(model_device)
        attention_mask = encodings["attention_mask"].to(model_device)

        with torch.no_grad():
            outputs = bert_model(input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        # Move each batch off the accelerator right away to keep device memory flat.
        all_embeddings.append(cls_embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)

# Embed the three splits in order: train, validation, test
X_train_emb, X_val_emb, X_test_emb = [
    get_bert_embeddings(split_texts) for split_texts in (X_train, X_val, X_test)
]


100%|██████████| 284/284 [01:39<00:00,  2.84it/s]
100%|██████████| 61/61 [00:22<00:00,  2.73it/s]
100%|██████████| 61/61 [00:22<00:00,  2.73it/s]


In [111]:
# Prefer the GPU when available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Allow TF32 in cuDNN and in CUDA matmul kernels
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# -------------------------
# Model (BiLSTM 400 -> 128)
# -------------------------
class EssayBiLSTM(nn.Module):
    """Two stacked single-layer LSTMs, then dropout and a linear head emitting one scalar per sequence.

    Parameters
    ----------
    input_size : int
        Feature size of each time step.
    hidden_dim1, hidden_dim2 : int
        Hidden sizes of the first and second LSTM.
    dropout : float
        Dropout probability applied to the final hidden state before the head.
    bidirectional : bool
        Whether both LSTMs run in both directions.
    """

    def __init__(self, input_size=768, hidden_dim1=400, hidden_dim2=128, dropout=0.5, bidirectional=True):
        super().__init__()
        self.bidirectional = bidirectional

        # A bidirectional LSTM doubles the width of everything it hands downstream.
        directions = 2 if bidirectional else 1
        shared = dict(num_layers=1, batch_first=True, bidirectional=bidirectional)

        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=hidden_dim1, **shared)
        self.lstm2 = nn.LSTM(input_size=hidden_dim1 * directions, hidden_size=hidden_dim2, **shared)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim2 * directions, 1)  # single regression output

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to a tensor of shape (batch,)."""
        first_pass, _ = self.lstm1(x)
        _, (h_n, _) = self.lstm2(first_pass)

        if not self.bidirectional:
            summary = h_n[-1]
        else:
            # Last two entries of h_n are the final states of the two directions.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # Linear output, left unclamped; the trailing singleton dim is removed.
        return self.fc(self.dropout(summary)).squeeze(1)

In [112]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch

# ======================
# Custom Dataset
# ======================
class EssayDataset(Dataset):
    """Pairs one pre-computed embedding with one score per essay.

    Parameters
    ----------
    embeddings : torch.Tensor or array-like
        One embedding per essay along axis 0. Singleton axes after axis 0 are
        removed, so both (N, D) and (N, 1, D) inputs are stored as (N, D).
    labels : array-like
        One score per essay, in the same order as ``embeddings``.

    Raises
    ------
    ValueError
        If the number of embeddings differs from the number of labels.
    """

    def __init__(self, embeddings, labels):
        if torch.is_tensor(embeddings):
            # Explicit conversion also covers tensors on an accelerator or with grad.
            embeddings = embeddings.detach().cpu().numpy()
        emb = np.asarray(embeddings)

        # Drop singleton axes but never axis 0: an unrestricted squeeze() would
        # collapse the sample axis of a one-essay dataset.
        singleton_axes = tuple(ax for ax in range(1, emb.ndim) if emb.shape[ax] == 1)
        if singleton_axes:
            emb = emb.squeeze(axis=singleton_axes)

        # A plain float array gives positional indexing whatever container was passed in.
        label_arr = np.asarray(labels, dtype=np.float32)

        if emb.shape[0] != label_arr.shape[0]:
            raise ValueError(
                f"embeddings and labels disagree on length: {emb.shape[0]} vs {label_arr.shape[0]}"
            )

        self.embeddings = emb
        self.labels = label_arr

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Each essay embedding -> (D,)
        x = torch.tensor(self.embeddings[idx], dtype=torch.float32)
        y = torch.tensor(self.labels[idx], dtype=torch.float32)

        # Reshape for LSTM: (seq_len=1, input_size=D)
        return x.unsqueeze(0), y

# ======================
# Create datasets
# ======================
# One dataset per split, pairing each embedding matrix with its scores
train_dataset, val_dataset, test_dataset = [
    EssayDataset(split_emb, split_scores)
    for split_emb, split_scores in (
        (X_train_emb, y_train),
        (X_val_emb, y_val),
        (X_test_emb, y_test),
    )
]

# ======================
# DataLoaders
# ======================
# Only the training data is reshuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader, test_loader = [
    DataLoader(held_out, shuffle=False, batch_size=32)
    for held_out in (val_dataset, test_dataset)
]

# Quick shape check on the first training batch
xb, yb = next(iter(train_loader))
for tag, batch in (("xb:", xb), ("yb:", yb)):
    print(tag, batch.shape, batch.dtype)


xb: torch.Size([32, 1, 768]) torch.float32
yb: torch.Size([32]) torch.float32


In [113]:
# 3. Check embeddings for NaNs/Infs
import numpy as np

# Evaluate both sanity flags first, then report them
train_has_nan = np.isnan(X_train_emb).any()
train_has_inf = np.isinf(X_train_emb).any()

print("NaNs in train embeddings:", train_has_nan)
print("Infs in train embeddings:", train_has_inf)


NaNs in train embeddings: False
Infs in train embeddings: False


In [130]:
import torch
import torch.nn as nn
import torch.optim as optim

# ======================
# Training loop
# ======================
def train_model(model, train_loader, val_loader, epochs=5, lr=1e-3, device="cuda"):
    """Train a regression model with MSE loss and RMSprop, reporting losses every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(xb)``; must produce one prediction per sample.
    train_loader, val_loader : iterable
        Yield ``(xb, yb)`` batches.
    epochs : int, default 5
        Number of passes over ``train_loader``.
    lr : float, default 1e-3
        RMSprop learning rate.
    device : str or torch.device, default "cuda"
        Where the model and batches are placed.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object, trained in place.

    Notes
    -----
    The printed losses are per-sample means over the whole split (each batch is
    weighted by its size), so a shorter final batch does not skew the figure.
    A loader that yields no batches is reported as ``nan``.
    """
    model.to(device)
    criterion = nn.MSELoss()   # regression -> MSE
    optimizer = optim.RMSprop(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_sq_err, train_seen = 0.0, 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            # Match the target's shape explicitly. A bare squeeze() would turn a
            # batch of one into a 0-d tensor and make MSELoss broadcast.
            outputs = model(xb).reshape(yb.shape)
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()

            batch_n = yb.size(0)
            train_sq_err += loss.item() * batch_n
            train_seen += batch_n

        avg_loss = train_sq_err / train_seen if train_seen else float("nan")

        # Validation
        model.eval()
        val_sq_err, val_seen = 0.0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                outputs = model(xb).reshape(yb.shape)
                batch_n = yb.size(0)
                val_sq_err += criterion(outputs, yb).item() * batch_n
                val_seen += batch_n
        val_loss = val_sq_err / val_seen if val_seen else float("nan")

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f} | Val Loss: {val_loss:.4f}")

    return model

# ======================
# Evaluation
# ======================
def evaluate_model(model, test_loader, device="cuda"):
    """Run the model over ``test_loader`` and collect predictions alongside targets.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(xb)``; must produce one prediction per sample.
    test_loader : iterable
        Yields ``(xb, yb)`` batches.
    device : str or torch.device, default "cuda"
        Where the model and batches are placed.

    Returns
    -------
    preds, true : numpy.ndarray
        1-D arrays of predictions and targets, in loader order.

    Notes
    -----
    Prints the per-sample mean squared error over the whole loader (``nan``
    when the loader yields nothing).
    """
    model.to(device)
    model.eval()
    preds, true = [], []
    criterion = nn.MSELoss()
    sq_err, seen = 0.0, 0

    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            # Keep predictions the same shape as the targets: a bare squeeze() makes
            # a batch of one 0-d, which cannot be iterated by list.extend below.
            outputs = model(xb).reshape(yb.shape)
            preds.extend(outputs.cpu().numpy().ravel())
            true.extend(yb.cpu().numpy().ravel())
            batch_n = yb.size(0)
            sq_err += criterion(outputs, yb).item() * batch_n
            seen += batch_n

    test_loss = sq_err / seen if seen else float("nan")
    print(f"Test MSE: {test_loss:.4f}")
    return np.array(preds), np.array(true)


In [132]:
import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Default to the CPU and upgrade to the GPU only when one is available
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Fresh network with the default hyper-parameters
model = EssayBiLSTM()

# train_model hands back the very same module it was given
model = train_model(model, train_loader, val_loader, epochs=5, device=device)

# Held-out predictions and their targets
preds, true = evaluate_model(model, test_loader, device=device)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
