In [None]:
import os, math, json, re, csv, random, networkx as nx, argparse
from pathlib import Path
from typing import Dict, List, Tuple
from Vocabulary import Vocabulary

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import cohen_kappa_score

import nltk, spacy
from spellchecker import SpellChecker
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

import statistics

In [None]:
# ---- Global configuration and shared heavyweight objects -------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Name of the spaCy pipeline loaded below.
NLP_MODEL    = "en_core_web_sm"
# Sentence-transformers checkpoint used for the module-level `sbert` encoder.
SBERT_MODEL  = "all-mpnet-base-v2"
# NOTE(review): SENT_OUT is not referenced in any visible cell — confirm it is still needed.
SENT_OUT     = 128
# Batch size of the training DataLoader.
BATCH_SIZE   = 16
# Token sequences are truncated / zero-padded to this length in EssayDataset._encode.
MAX_TOKENS   = 300
# DocEncoder sizes: embedding width, LSTM hidden width, projected output width.
EMB_DIM      = 100
HID_DIM      = 300
OUT_DIM      = 64

# Tokenizer data required by nltk.word_tokenize / nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# NOTE(review): `lp` is not used in any visible cell and looks like a typo for `nlp` — confirm.
lp    = spacy.load(NLP_MODEL, disable=["ner"])
spell = SpellChecker()
# Shared sentence encoder; PROMPT_EMB and PromptSimilarity both call it.
sbert = SentenceTransformer(SBERT_MODEL, device=str(DEVICE))
# Seed the Python, NumPy and torch RNGs.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [None]:
def dataset_spliter(path_to_dataset, train_path, valid_path, test_path,
                    total_rows=None, train_frac=0.6, valid_frac=0.2):
    """Split a CSV file into consecutive train / validation / test CSV files.

    Rows keep their original order: the first ``train_frac`` share goes to
    ``train_path``, the next ``valid_frac`` share to ``valid_path`` and the
    remainder to ``test_path``. Every output file gets the input's header.

    Args:
        path_to_dataset: Path of the CSV file to split.
        train_path: Output path for the training rows.
        valid_path: Output path for the validation rows.
        test_path: Output path for the test rows.
        total_rows: Row count the fractions are applied to. ``None`` (default)
            uses the number of data rows actually present in the file; pass
            ``24728`` to reproduce the previously hard-coded cut-offs.
        train_frac: Share of rows written to the training file.
        valid_frac: Share of rows written to the validation file.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1.
    """
    if train_frac < 0 or valid_frac < 0 or train_frac + valid_frac > 1:
        raise ValueError("train_frac and valid_frac must be >= 0 and sum to at most 1")

    with open(path_to_dataset, newline='') as dataset:
        reader = csv.DictReader(dataset)
        fieldnames = reader.fieldnames
        rows = list(reader)

    if total_rows is None:
        total_rows = len(rows)

    # Float cut-offs on purpose: `num < total_rows * frac` is the same
    # comparison the hard-coded version used.
    train_end = total_rows * train_frac
    valid_end = total_rows * (train_frac + valid_frac)

    with open(train_path, 'w', newline='') as train, \
            open(valid_path, 'w', newline='') as valid, \
            open(test_path, 'w', newline='') as test:
        train_writer = csv.DictWriter(train, fieldnames=fieldnames)
        valid_writer = csv.DictWriter(valid, fieldnames=fieldnames)
        test_writer = csv.DictWriter(test, fieldnames=fieldnames)

        for writer in (train_writer, valid_writer, test_writer):
            writer.writeheader()

        for num, row in enumerate(rows):
            if num < train_end:
                train_writer.writerow(row)
            elif num < valid_end:
                valid_writer.writerow(row)
            else:
                test_writer.writerow(row)


In [None]:
def misSpell_counter(text):
    """Count unknown words in ``text`` and return a spell-corrected copy.

    The text is split on whitespace. Every word the spell checker does not
    know is replaced by its best correction; when the checker has no
    suggestion the original word is kept.

    Args:
        text: Raw essay text.

    Returns:
        ``(n_unknown, corrected)`` where ``n_unknown`` is the number of
        distinct unknown words reported by the checker and ``corrected`` is
        the space-joined corrected text, or ``None`` when ``text`` contains
        no words.
    """
    # Constructing a SpellChecker loads its frequency dictionary, so cache one
    # instance on the function object instead of rebuilding it for each essay.
    checker = getattr(misSpell_counter, "_checker", None)
    if checker is None:
        checker = SpellChecker()
        misSpell_counter._checker = checker

    words = text.split()
    misspelled = checker.unknown(words)

    # unknown() reports its findings lower-cased (default, case-insensitive
    # checker), so the membership test must lower-case too; otherwise
    # capitalised misspellings are counted but never corrected.
    corrected_words = [
        (checker.correction(w) or w) if w.lower() in misspelled else w
        for w in words
    ]
    if corrected_words:
        return len(misspelled), " ".join(corrected_words)
    return len(misspelled), None


In [None]:
def preprocess_essay(text):
    """Spell-correct an essay and compute its shallow (hand-crafted) features.

    Args:
        text: Raw essay text.

    Returns:
        ``(cleaned_text, shallow_features)``. ``cleaned_text`` is the
        spell-corrected text from ``misSpell_counter`` (``None`` when the essay
        has no words). ``shallow_features`` is a dict whose insertion order is
        part of the contract, because callers turn ``.values()`` into a
        feature vector:
        spelling_errors, num_words, num_punctuations, num_sentences,
        num_sentence_length, sentence_variance, num_characters, num_nouns,
        num_verbs, num_adverbs, num_conjunctions, num_adjectives,
        mean_wordLength, distinct_words.
        Empty or whitespace-only essays yield zeros instead of raising.
    """
    error_count, cleaned_text = misSpell_counter(text)
    # misSpell_counter returns None for an essay without words; tokenize "" then.
    tokens = nltk.word_tokenize(cleaned_text or "")

    words = text.split()
    sents = nltk.sent_tokenize(text)
    n_words = len(words)
    n_sents = len(sents)
    sent_lengths = [len(nltk.word_tokenize(sent)) for sent in sents]
    n_chars = sum(len(word) for word in words)

    # Key order must stay exactly as documented above.
    shallow_features = {
        'spelling_errors': error_count,
        'num_words': n_words,
        # tokens come from the corrected text, words from the raw text
        'num_punctuations': len(tokens) - n_words,
        'num_sentences': n_sents,
        'num_sentence_length': n_words / n_sents if n_sents else 0.0,
        'sentence_variance': statistics.variance(sent_lengths) if len(sent_lengths) > 1 else 0.0,
        'num_characters': n_chars,
        'num_nouns': 0,
        'num_verbs': 0,
        'num_adverbs': 0,
        'num_conjunctions': 0,
        'num_adjectives': 0,
        'mean_wordLength': n_chars / n_words if n_words else 0.0,
    }

    # Universal POS tag -> feature counter it increments.
    tag_to_feature = {
        'NOUN': 'num_nouns',
        'VERB': 'num_verbs',
        'ADV': 'num_adverbs',
        'CONJ': 'num_conjunctions',
        'ADJ': 'num_adjectives',
    }
    distinct_words = set()
    tagged = nltk.pos_tag(words, tagset='universal') if words else []
    for word, tag in tagged:
        feature = tag_to_feature.get(tag)
        if feature is not None:
            shallow_features[feature] += 1
        distinct_words.add(word)
    shallow_features["distinct_words"] = len(distinct_words)

    return cleaned_text, shallow_features


In [None]:
class DocEncoder(nn.Module):
    """Token-level document encoder: embedding -> LSTM -> dropout/linear/sigmoid.

    The final LSTM hidden state of each sequence is projected to ``OUT_DIM``
    values squashed by a sigmoid.
    """

    def __init__(self, vocab):
        """Build the layers; ``vocab`` is the number of embedding rows (index 0 is padding)."""
        super().__init__()
        self.emb = nn.Embedding(vocab, EMB_DIM, padding_idx=0)
        self.lstm = nn.LSTM(EMB_DIM, HID_DIM, batch_first=True)
        regulariser = nn.Dropout(0.4)
        to_output = nn.Linear(HID_DIM, OUT_DIM)
        self.proj = nn.Sequential(regulariser, to_output, nn.Sigmoid())

    def forward(self, ids, lens):
        """Encode a batch of token ids.

        ``ids`` is embedded and packed using ``lens`` (moved to the CPU for
        packing); the LSTM's final hidden state is passed through the
        projection head and returned.
        """
        embedded = self.emb(ids)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lens.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.lstm(packed)
        # drop the leading dimension of the final hidden state
        doc_vector = final_hidden.squeeze(0)
        return self.proj(doc_vector)
    

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer


class _SentenceEmbeddingLayer(nn.Module):
    """Wraps a SentenceTransformer and maps a list of sentences to embeddings.

    Blank sentences are not sent to the transformer; their rows in the output
    stay all-zero.
    """

    def __init__(self,
                 pretrained_model="sentence-transformers/all-MiniLM-L6-v2",
                 freeze_bert=True):
        super().__init__()
        self.bert = SentenceTransformer(pretrained_model)
        if freeze_bert:
            # switch off gradients for every transformer parameter
            for weight in self.bert.parameters():
                weight.requires_grad = False
        self.embed_dim = self.bert.get_sentence_embedding_dimension()

    def forward(self, raw_sentences: list[str]) -> torch.Tensor:
        """Return a ``(len(raw_sentences), embed_dim)`` tensor of sentence embeddings."""
        # (output row, sentence) for every sentence with non-whitespace content
        kept = [(row, sentence)
                for row, sentence in enumerate(raw_sentences)
                if sentence.strip()]

        if kept:
            embs = self.bert.encode([sentence for _, sentence in kept],
                                    convert_to_tensor=True)
        else:
            # nothing to encode: a zero placeholder that only fixes the device
            device = next(self.parameters()).device
            embs = torch.zeros((1, self.embed_dim), device=device)

        out = torch.zeros((len(raw_sentences), self.embed_dim),
                          device=embs.device)
        for src_row, (dst_row, _) in enumerate(kept):
            out[dst_row] = embs[src_row]
        return out


class _AttentionPooling(nn.Module):
    """Length-masked additive attention pooling over the sequence dimension."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.att = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out, essay_lens):
        """Pool ``lstm_out`` (batch, steps, hidden) into (batch, hidden).

        Steps at index >= ``essay_lens[b]`` get a score of -1e9 before the
        softmax. Returns ``(pooled, weights)`` with ``weights`` of shape
        (batch, steps).
        """
        raw_scores = self.att(lstm_out).squeeze(-1)

        step_index = torch.arange(raw_scores.size(1), device=raw_scores.device)
        # True where a step lies beyond the essay's real length
        is_padding = step_index.unsqueeze(0) >= essay_lens.unsqueeze(1)
        masked_scores = raw_scores.masked_fill(is_padding, -1e9)

        weights = F.softmax(masked_scores, dim=1)
        weighted = lstm_out * weights.unsqueeze(-1)
        pooled = torch.sum(weighted, dim=1)
        return pooled, weights


class SentenceEncoder(nn.Module):
    """Essay scorer over sentence embeddings: BiLSTM -> attention pooling -> MLP."""

    def __init__(self,
                 pretrained_model="sentence-transformers/all-MiniLM-L6-v2",
                 freeze_bert=True,
                 lstm_hidden_dim=256,
                 dropout=0.5):
        super().__init__()

        self.sentence_embedding = _SentenceEmbeddingLayer(pretrained_model,
                                                          freeze_bert)
        bilstm_width = lstm_hidden_dim * 2  # forward + backward directions

        self.lstm = nn.LSTM(self.sentence_embedding.embed_dim,
                            lstm_hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True,
                            dropout=0.0)

        self.attention_pool = _AttentionPooling(bilstm_width)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(bilstm_width, lstm_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(lstm_hidden_dim, 1)
        )

    def forward(self, flat_sentences, essay_lengths, batch_size, max_sentences):
        """Score a batch of essays given as one flat list of sentences.

        ``flat_sentences`` holds the sentences of all essays back to back and
        ``essay_lengths[b]`` says how many belong to essay ``b``. Returns
        ``(scores, attention_weights)`` with one score per essay.
        """
        sent_emb = self.sentence_embedding(flat_sentences)
        emb_width = sent_emb.size(-1)

        # Scatter the flat embeddings into a zero-padded (batch, sentence, dim) grid.
        padded = sent_emb.new_zeros(batch_size, max_sentences, emb_width)
        cursor = 0
        for row, count in enumerate(essay_lengths.tolist()):
            if count:
                padded[row, :count] = sent_emb[cursor: cursor + count]
            cursor += count

        rnn_utils = nn.utils.rnn
        packed = rnn_utils.pack_padded_sequence(
            padded, essay_lengths.cpu(),
            batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        unpacked, _ = rnn_utils.pad_packed_sequence(
            packed_out, batch_first=True, total_length=max_sentences
        )

        pooled, att_w = self.attention_pool(unpacked, essay_lengths)
        return self.classifier(pooled).squeeze(-1), att_w


def load_sentence_encoder(ckpt_path: str | Path,
                          device: str = "cpu",
                          freeze: bool = True) -> SentenceEncoder:
    """Build a SentenceEncoder and load (non-strictly) weights from a checkpoint.

    Checkpoint keys under ``attention_pool.attention.`` are renamed to
    ``attention_pool.att.`` and second-layer LSTM tensors (``..._l1`` /
    ``..._l1_reverse``) are dropped before loading. Missing and unexpected
    keys are printed, not raised. With ``freeze`` the encoder is put in eval
    mode and all of its parameters stop requiring gradients.

    ``device`` is only used as ``map_location`` for ``torch.load``.
    """
    legacy_prefix = "attention_pool.attention."
    skip_pattern = re.compile(r"lstm\.(weight|bias)_(ih|hh)_l1(_reverse)?")

    enc = SentenceEncoder()
    raw = torch.load(ckpt_path, map_location=device)

    new_state = {}
    for name, tensor in raw.items():
        if name.startswith(legacy_prefix):
            name = name.replace("attention_pool.attention.", "attention_pool.att.")
        if skip_pattern.fullmatch(name) is None:
            new_state[name] = tensor

    missing, unexpected = enc.load_state_dict(new_state, strict=False)
    if missing:
        print("[SentenceEncoder]   • ignored missing keys:", missing)
    if unexpected:
        print("[SentenceEncoder]   • ignored extra keys  :", unexpected)

    if not freeze:
        return enc

    enc.eval()
    for weight in enc.parameters():
        weight.requires_grad_(False)
    return enc



In [None]:
# Load the prompt descriptions and pre-compute one SBERT embedding per prompt.
# NOTE(review): `args` is only created in a later cell (the argparse cell), so on a
# fresh Restart & Run All this line raises NameError — the argparse cell needs to run first.
prompts_df = pd.read_excel(args.prompts_xlsx)

# Fail early when the spreadsheet lacks the two columns read below.
if not {"essay_set", "essay_description"}.issubset(prompts_df.columns):
    raise ValueError("Excel must have columns 'essay_set' and 'essay_description'")

# prompt id (int) -> prompt description text
PROMPT_TEXT = {int(r.essay_set): str(r.essay_description)
               for r in prompts_df.itertuples()}

print("Loaded prompt texts:", PROMPT_TEXT.keys())

# prompt id -> embedding tensor from the module-level `sbert` encoder;
# PromptSimilarity.forward looks prompts up in this dict.
PROMPT_EMB = {pid: sbert.encode(txt, convert_to_tensor=True)
              for pid, txt in PROMPT_TEXT.items()}

In [None]:
class PromptSimilarity(nn.Module):
    """Parameter-free module returning the essay-to-prompt cosine similarity."""

    def __init__(self):
        super().__init__()

    @torch.no_grad()
    def forward(self, essays: List[str], prompt_ids: torch.Tensor) -> torch.Tensor:
        """Return a ``(len(essays), 1)`` tensor of cosine similarities.

        Essays are embedded with the module-level ``sbert`` encoder and each is
        compared with the ``PROMPT_EMB`` entry selected by its prompt id.
        """
        essay_emb = sbert.encode(essays, convert_to_tensor=True)
        prompt_rows = [PROMPT_EMB[int(pid.item())] for pid in prompt_ids]
        # match the essay embeddings' device and dtype
        prompt_emb = torch.stack(prompt_rows).to(essay_emb)
        similarity = F.cosine_similarity(essay_emb, prompt_emb)
        return similarity.unsqueeze(1)


In [None]:
class AESModel(nn.Module):
    """Essay scorer fusing four feature groups in a small MLP head.

    Feature groups concatenated along dim 1:
      * FH – hand-crafted shallow features (``N_SHALLOW_FEATURES`` columns),
      * FD – ``DocEncoder`` output (``OUT_DIM`` columns),
      * FS – frozen ``SentenceEncoder`` score (1 column),
      * FT – essay/prompt cosine similarity (1 column).
    """

    # Number of values produced by preprocess_essay's feature dict.
    N_SHALLOW_FEATURES = 14

    def __init__(self, vocab_size: int, sent_ckpt: Path):
        """Build the sub-encoders and the fusion head.

        Args:
            vocab_size: Number of rows of the DocEncoder embedding table.
            sent_ckpt: Checkpoint path handed to ``load_sentence_encoder``.
        """
        super().__init__()
        self.doc   = DocEncoder(vocab_size)
        self.sent  = load_sentence_encoder(sent_ckpt)
        self.promp = PromptSimilarity()

        # FH + FD + FS (1) + FT (1)
        F_total = self.N_SHALLOW_FEATURES + OUT_DIM + 1 + 1

        self.head = nn.Sequential(
            nn.Linear(F_total, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def train(self, mode: bool = True):
        """Switch train/eval mode but keep the frozen sentence encoder in eval.

        ``load_sentence_encoder`` freezes the encoder and puts it in eval mode;
        the default ``nn.Module.train`` would recursively switch it back and
        re-enable its dropout layers.
        """
        super().train(mode)
        self.sent.eval()
        return self

    def forward(self, FH, ids, lens, prompt_ids, sent_lists=None, essays=None):
        """Predict one (normalised) score per essay.

        Args:
            FH: Shallow feature tensor, one row per essay.
            ids: Padded token ids for the DocEncoder.
            lens: Token lengths matching ``ids``.
            prompt_ids: Prompt id per essay (looked up by PromptSimilarity).
            sent_lists: Per-essay list of sentences; defaults to an empty batch.
            essays: Raw essay strings; defaults to an empty batch.

        Returns:
            1-D tensor with one prediction per essay.
        """
        # None instead of mutable `[]` defaults, which are shared between calls.
        sent_lists = [] if sent_lists is None else sent_lists
        essays = [] if essays is None else essays

        FD = self.doc(ids, lens)

        # Linear-time flatten (sum(list_of_lists, []) copies quadratically).
        flat_sents = [sentence for sentences in sent_lists for sentence in sentences]
        essay_lens = torch.tensor([len(s) for s in sent_lists],
                                  dtype=torch.long, device=ids.device)
        if len(sent_lists) > 0:
            B, M = len(sent_lists), int(essay_lens.max().item())
        else:
            B, M = 0, 0

        FS, _ = self.sent(flat_sents, essay_lens, B, M)
        FS = FS.unsqueeze(1)

        FT = self.promp(essays, prompt_ids)

        fused = torch.cat([FH, FD, FS, FT], dim=1)
        return self.head(fused).squeeze(1)


In [None]:
import os, torch
CKPT_FILE = "checkpoint.pt"


def save_ckpt(epoch, best_qwk, model, optimizer):
    """Write the resumable training state to ``CKPT_FILE``.

    The saved dict holds the epoch number, the best QWK so far and the
    ``state_dict`` of both the model and the optimizer.
    """
    model_state = model.state_dict()
    optim_state = optimizer.state_dict()
    payload = {
        "epoch": epoch,
        "best_qwk": best_qwk,
        "model": model_state,
        "optim": optim_state,
    }
    torch.save(payload, CKPT_FILE)
    print(f"✓ checkpoint saved: {CKPT_FILE} (epoch {epoch})")

def load_ckpt(model, optimizer):
    """Restore model and optimizer state from ``CKPT_FILE``.

    Returns ``(epoch, best_qwk)`` as stored in the checkpoint.
    """
    ckpt = torch.load(CKPT_FILE, map_location=DEVICE)
    for target, key in ((model, "model"), (optimizer, "optim")):
        target.load_state_dict(ckpt[key])
    print(f"↻ resumed from {CKPT_FILE} (epoch {ckpt['epoch']})")
    resume_point = (ckpt["epoch"], ckpt["best_qwk"])
    return resume_point


def train_model(model: nn.Module,
                train_loader: DataLoader,
                val_loader  : DataLoader | None = None,
                epochs: int = 10):
    """Train ``model`` with Adam + MSE, checkpointing after every epoch.

    If ``CKPT_FILE`` exists, training resumes from the epoch after the one
    stored there. When a validation loader is given, the weights with the best
    validation QWK are written to ``best_model.pt``; without one, the final
    weights are written there after the last epoch.

    Args:
        model: Model called as ``model(FH, ids, lens, prompt_ids, sent_lists, essays)``.
        train_loader: Yields 8-tuples as produced by ``essay_collate``.
        val_loader: Optional validation loader passed to ``evaluate``.
        epochs: Index of the last epoch to run (epochs are 1-based).
    """
    model.to(DEVICE)
    opt       = torch.optim.Adam(model.parameters(), lr=2e-4)
    loss_fn   = nn.MSELoss()
    best_qwk  = -1.0
    start_ep  = 1

    if os.path.exists(CKPT_FILE):
        start_ep, best_qwk = load_ckpt(model, opt)
        start_ep += 1

    max_norm = 1.0  # gradient-clipping threshold

    for ep in range(start_ep, epochs + 1):
        model.train()
        running_loss = 0.0
        n_steps = 0

        for step, batch in enumerate(train_loader, start=1):
            FH, ids, ln, lab, _, sent_lists, texts, pids = batch
            FH, ids, ln, lab = FH.to(DEVICE), ids.to(DEVICE), ln.to(DEVICE), lab.to(DEVICE)

            opt.zero_grad()
            # Argument order must match AESModel.forward:
            # (FH, ids, lens, prompt_ids, sent_lists, essays).
            pred  = model(FH, ids, ln, pids, sent_lists, texts)
            loss  = loss_fn(pred, lab)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            opt.step()

            running_loss += loss.item()
            n_steps = step

            # Running mean over the steps seen so far in this epoch.
            print(f"[{ep:02d}/{epochs}] train-MSE {running_loss / step:.4f} step: {step}")

        # Epoch summary; guarded so an empty loader cannot divide by zero.
        epoch_loss = running_loss / max(n_steps, 1)
        msg = f"[{ep:02d}/{epochs}] train-MSE {epoch_loss:.4f}"

        if val_loader is not None:
            # NOTE(review): `evaluate` is not defined in any visible cell — it must
            # be provided before a validation loader is passed.
            qwk_val = evaluate(model, val_loader)
            msg += f" | val-QWK {qwk_val:.4f}"
            if qwk_val > best_qwk:
                best_qwk = qwk_val
                torch.save(model.state_dict(), "best_model.pt")
        print(msg)

        save_ckpt(ep, best_qwk, model, opt)

    if val_loader is None:
        torch.save(model.state_dict(), "best_model.pt")


In [None]:
class EssayDataset(Dataset):
    """Map-style dataset turning DataFrame rows into model inputs.

    Each row must provide ``essay``, ``prompt_id`` and ``max_score``; when
    ``has_labels`` is true it must also provide ``score_norm``.
    """

    def __init__(self, df: pd.DataFrame, vocab: "Vocabulary", has_labels: bool):
        """Store the frame (with a fresh 0..n-1 index), the vocabulary and the label flag.

        ``vocab`` must expose ``text2idx(text)`` returning a sequence of token ids.
        """
        self.df         = df.reset_index(drop=True)
        self.vocab      = vocab
        self.has_labels = has_labels

    def _encode(self, txt: str):
        """Return ``(ids, length)`` for one essay.

        ``ids`` is a LongTensor of exactly ``MAX_TOKENS`` entries, truncated
        and right-padded with 0. ``length`` is the number of real tokens
        before padding, so the LSTM can skip the padding; it is at least 1
        because ``pack_padded_sequence`` rejects zero-length sequences.
        """
        ids = list(self.vocab.text2idx(txt)[:MAX_TOKENS])
        n_real = max(len(ids), 1)
        ids = ids + [0] * (MAX_TOKENS - len(ids))
        return torch.tensor(ids, dtype=torch.long), n_real

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the 8-tuple consumed by ``essay_collate``.

        ``(FH, ids, length, score_norm, max_score, sent_list, text, prompt_id)``;
        ``score_norm`` is 0.0 when the dataset has no labels.
        """
        r        = self.df.iloc[idx]
        txt      = r["essay"]
        pid      = int(r["prompt_id"])
        ids, ln  = self._encode(txt)

        # Shallow features in the dict's insertion order.
        _, feat_dict = preprocess_essay(txt)
        FH = torch.tensor(list(feat_dict.values()), dtype=torch.float32)

        sent_list = nltk.sent_tokenize(txt)

        if self.has_labels:
            score_norm = torch.tensor(r["score_norm"], dtype=torch.float32)
        else:
            score_norm = torch.tensor(0.0, dtype=torch.float32)

        max_score = torch.tensor(r["max_score"], dtype=torch.float32)
        pid_tensor= torch.tensor(pid, dtype=torch.long)

        return (
            FH,
            ids,
            torch.tensor(ln, dtype=torch.long),
            score_norm,
            max_score,
            sent_list,
            txt,
            pid_tensor
        )

In [None]:
from torch.utils.data._utils.collate import default_collate

def essay_collate(batch):
    """Collate dataset samples into one batch.

    Samples are 8-tuples (with a score) or 7-tuples (without). Tensor fields
    are stacked; sentence lists and raw texts stay Python lists. The result is
    always ``(FH, ids, ln, score_norm, max_score, sent_lists, texts, pid)``,
    with ``score_norm`` set to ``None`` for 7-tuple samples.
    """
    if len(batch[0]) == 8:
        FH, ids, ln, scores, max_score, sent_list, texts, pid = zip(*batch)
        score_norm = torch.stack(scores)
    else:
        FH, ids, ln, max_score, sent_list, texts, pid = zip(*batch)
        score_norm = None

    return (
        torch.stack(FH),
        torch.stack(ids),
        torch.stack(ln),
        score_norm,
        torch.stack(max_score),
        list(sent_list),
        list(texts),
        torch.stack(pid),
    )

In [None]:
# Default input locations.
# NOTE(review): these are absolute, user-specific paths; the notebook only runs
# unchanged on a machine where they exist.
train_path = "/nfs/stak/users/mettas/ondemand/AI539_HPC/Datasets/asap-aes/training_set_rel3.tsv"
val_path = "/nfs/stak/users/mettas/ondemand/AI539_HPC/Datasets/asap-aes/valid_set.tsv"
prompt_path = "/nfs/stak/users/mettas/ondemand/AI539_HPC/Datasets/asap-aes/Essay_Set_Descriptions/Essay_Set_Descriptions/essay_set_descriptions.xlsx"
sent_ckpt_path = "/nfs/stak/users/mettas/ondemand/AI539_HPC/Final_Project/best_model_SBERT0607_1712_NbrnTP_SBERT_LSTM.pth"

# The parser only supplies defaults: parse_args([]) is given an empty argument
# list, so `args` always holds the paths above.
parser = argparse.ArgumentParser()
parser.add_argument("--train",       default=train_path)
parser.add_argument("--valid",       default=val_path)
parser.add_argument("--sent-ckpt",   default=sent_ckpt_path)
parser.add_argument("--prompts-xlsx",default=prompt_path)
args = parser.parse_args([])



In [None]:
if __name__ == "__main__":
    # Training entry point: load the ASAP TSVs, normalise scores per prompt,
    # build the vocabulary and datasets, then train the model.
    col_map = {"essay_set": "prompt_id", "domain1_score": "score"}
    read_kwargs = dict(sep="\t", quoting=csv.QUOTE_NONE, engine="python",
                       encoding="ISO-8859-1")

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the renamed one.
    tr_df = (pd.read_csv(args.train, **read_kwargs)
               .rename(columns=col_map)[["prompt_id", "essay", "score"]]
               .copy())
    # Same renaming for the validation file so EssayDataset finds `prompt_id`
    # (rename ignores mapped columns the file does not have).
    va_df = pd.read_csv(args.valid, **read_kwargs).rename(columns=col_map)

    # Highest observed training score per prompt, used to scale scores to [0, 1].
    max_scores = tr_df.groupby("prompt_id")["score"].max().to_dict()
    tr_df["max_score"] = tr_df["prompt_id"].map(max_scores)
    tr_df["score_norm"] = tr_df["score"] / tr_df["max_score"]
    # EssayDataset reads `max_score` for every row, labelled or not.
    va_df["max_score"] = va_df["prompt_id"].map(max_scores)

    vocab = Vocabulary(tr_df["essay"].tolist(), min_freq=1)
    print("Vocab size:", vocab.size)
    train_ds = EssayDataset(tr_df, vocab, True)
    val_ds   = EssayDataset(va_df, vocab, False)
    train_ld = DataLoader(train_ds,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=True,
                          collate_fn=essay_collate)

    # No validation loader is passed, so train_model saves the final weights
    # to best_model.pt after the last epoch.
    model = AESModel(vocab.size, Path(args.sent_ckpt))
    train_model(model, train_ld, epochs=10)

In [None]:
import pandas as pd, numpy as np, torch, csv, json, os
from torch.utils.data import DataLoader
from sklearn.metrics import cohen_kappa_score

def fisher_z(k):
    """Fisher z-transform of ``k`` after clipping it to [-0.999, 0.999]."""
    bounded = np.clip(k, -0.999, 0.999)
    odds = (1 + bounded) / (1 - bounded)
    return .5 * np.log(odds)

def fisher_mean(kappa_dict: dict[int,float]) -> float:
    """Average the dict's kappa values in Fisher-z space and map the mean back with tanh.

    Returns NaN when ``kappa_dict`` is empty.
    """
    if kappa_dict:
        z_values = list(map(fisher_z, kappa_dict.values()))
        return float(np.tanh(np.mean(z_values)))
    return np.nan

def overall_kappa(df: pd.DataFrame) -> float:
    """Fisher-mean of the per-prompt quadratic-weighted kappas.

    ``df`` is grouped by ``prompt_id``; within each group ``score`` is compared
    with ``predicted_score``.
    """
    per_prompt = {}
    for prompt, group in df.groupby("prompt_id"):
        per_prompt[prompt] = cohen_kappa_score(group.score,
                                               group.predicted_score,
                                               weights="quadratic")
    return fisher_mean(per_prompt)

def run_and_score_v2(model: torch.nn.Module,
                     csv_path          : str,
                     vocab             : dict[str,int],
                     train_max_scores  : dict[int,int]|None = None,
                     batch_size        : int = 32,
                     with_labels       : bool = True,
                     out_csv           : str = "submission.csv"):
    """Predict scores for the essays in a CSV file and optionally report QWK.

    The CSV must contain ``essay_id``, ``full_text`` and ``assignment`` (plus
    ``score`` when ``with_labels`` is true). Each distinct ``assignment`` value
    gets a prompt id 1..k in sorted order.
    NOTE(review): these ids are looked up in ``PROMPT_EMB`` by the model, so
    they are assumed to line up with the prompt ids used in training — confirm.

    Args:
        model: Model called as ``model(FH, ids, lens, prompt_ids, sent_lists, essays)``.
        csv_path: Path of the CSV file to score.
        vocab: Vocabulary object handed to ``EssayDataset``.
        train_max_scores: Either a ``{prompt_id: max_score}`` dict or a single
            number applied to every row. With labels and ``None``, the per-prompt
            maximum of ``score`` in this file is used.
        batch_size: Inference batch size.
        with_labels: Whether the file has a ``score`` column to evaluate against.
        out_csv: Where ``essay_id`` / ``predicted_score`` pairs are written.

    Returns:
        The scored DataFrame when ``with_labels`` is true, otherwise ``None``.

    Raises:
        ValueError: On missing columns, missing ``train_max_scores`` for
            unlabelled data, or prompts without an entry in ``train_max_scores``.
    """
    df = pd.read_csv(csv_path)
    df = df.rename(columns={"full_text": "essay", "assignment": "prompt_tag"})

    if not {"essay_id", "essay", "prompt_tag"}.issubset(df.columns):
        raise ValueError("CSV must contain essay_id / full_text / assignment")
    if with_labels and "score" not in df.columns:
        raise ValueError("CSV must contain a 'score' column when with_labels=True")

    # Stable integer id per distinct prompt tag.
    prompt_lookup = {tag: i for i, tag in
                     enumerate(sorted(df["prompt_tag"].unique()), start=1)}
    df["prompt_id"] = df["prompt_tag"].map(prompt_lookup)

    if with_labels and train_max_scores is None:
        train_max_scores = df.groupby("prompt_id")["score"].max().to_dict()
    if train_max_scores is None:
        raise ValueError("train_max_scores must be supplied for test data")

    if isinstance(train_max_scores, dict):
        df["max_score"] = df["prompt_id"].map(train_max_scores)
        if df["max_score"].isna().any():
            unknown = sorted(df.loc[df["max_score"].isna(), "prompt_id"].unique())
            raise ValueError(f"train_max_scores has no entry for prompt ids {unknown}")
    else:
        # A single number applies to every essay.
        df["max_score"] = train_max_scores

    if with_labels:
        df["score_norm"] = df["score"] / df["max_score"]

    ds = EssayDataset(df, vocab, has_labels=with_labels)
    ld = DataLoader(ds, batch_size=batch_size, shuffle=False,
                    num_workers=2, pin_memory=True, collate_fn=essay_collate)

    # Run on whatever device the model's parameters already live on.
    device = next(model.parameters()).device
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in ld:
            FH, ids, ln, _, _, sent_lists, texts, pids = batch
            FH, ids, ln, pids = (FH.to(device),
                                 ids.to(device),
                                 ln.to(device),
                                 pids.to(device))

            out = model(FH, ids, ln, pids, sent_lists, texts)
            preds.append(out.cpu())

    # Undo the [0, 1] normalisation and round to whole scores.
    df["predicted_score"] = np.round(
        torch.cat(preds).numpy() * df["max_score"].values
    ).astype(int)

    if with_labels:
        per_prompt = {p: cohen_kappa_score(g.score, g.predicted_score,
                                           weights="quadratic")
                      for p, g in df.groupby("prompt_id")}
        kappa_overall = fisher_mean(per_prompt)
        print(f"Overall QWK (Fisher mean): {kappa_overall:0.4f}")
        print("Per-prompt kappas:", per_prompt)

    df[["essay_id", "predicted_score"]].to_csv(out_csv, index=False)
    print("Saved predictions ➜", out_csv)

    return df if with_labels else None

# Rebuild the model and load the weights written by train_model.
# `vocab` and `args` come from earlier cells (the training cell and the argparse cell).
model = AESModel(vocab.size, Path(args.sent_ckpt))
model.load_state_dict(torch.load("best_model.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()

# NOTE(review): absolute, user-specific path.
validation_path = "/nfs/stak/users/mettas/ondemand/AI539_HPC/Datasets/ASAP/Valid.csv"
# A single scalar is passed where run_and_score_v2 annotates a {prompt_id: max_score} dict.
train_max = 5
run_and_score_v2(
        model,
        csv_path      = validation_path,
        vocab         = vocab,
        train_max_scores = train_max,
        with_labels   = True,
        out_csv       = "valid_preds.csv")

# NOTE(review): `run_and_score` (with tsv_path / stub_csv parameters) is not defined
# in any visible cell — only run_and_score_v2 is — so this call raises NameError
# unless the function is defined elsewhere.
run_and_score(model,
                tsv_path="/nfs/stak/users/mettas/ondemand/AI539_HPC/Datasets/asap-aes/test_set.tsv",
                with_labels=False,
                vocab=vocab,
                stub_csv=None,
                out_csv="test_submission.csv")
