## Install packages

In [9]:
#%pip install -U pandas numpy scikit-learn scipy matplotlib tqdm nltk
#%pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

## Imports + config + device

In [10]:
import os, re, random, time
from dataclasses import dataclass
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

print("Torch:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

Torch: 2.5.1+cu121
Device: cuda
GPU: NVIDIA GeForce RTX 2050


## Reproducibility + global settings

In [11]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

@dataclass
class CFG:
    seed: int = 42
    data_path: str = "./data/jigsaw/train.csv"
    subsample_n: int | None = 200_000  # set None to use full file (big)
    test_size: float = 0.1
    val_size: float = 0.1  # fraction of remaining train for validation
    label_threshold: float = 0.5

    # TF-IDF
    max_features: int = 50_000
    ngram_range: tuple = (1, 2)
    min_df: int = 2

    # PyTorch training
    batch_size: int = 256
    epochs: int = 3
    lr: float = 1e-3
    weight_decay: float = 1e-4
    optimizer: str = "adam"  # "adam" or "sgd"
    momentum: float = 0.9
    grad_clip_norm: float | None = 1.0

    # Sequence model
    seq_max_len: int = 200
    vocab_size: int = 50_000
    emb_dim: int = 128
    lstm_hidden: int = 128
    lstm_layers: int = 1
    dropout: float = 0.3

cfg = CFG()
seed_everything(cfg.seed)
cfg


CFG(seed=42, data_path='./data/jigsaw/train.csv', subsample_n=200000, test_size=0.1, val_size=0.1, label_threshold=0.5, max_features=50000, ngram_range=(1, 2), min_df=2, batch_size=256, epochs=3, lr=0.001, weight_decay=0.0001, optimizer='adam', momentum=0.9, grad_clip_norm=1.0, seq_max_len=200, vocab_size=50000, emb_dim=128, lstm_hidden=128, lstm_layers=1, dropout=0.3)

## Load Jigsaw train.csv + label + subsample

In [12]:
df = pd.read_csv(cfg.data_path)
print(df.shape)
df = df.dropna(subset=["comment_text", "target"]).reset_index(drop=True)

df["label"] = (df["target"] >= cfg.label_threshold).astype(int)

# Optional: subsample (keep label distribution)
if cfg.subsample_n is not None and len(df) > cfg.subsample_n:
    df, _ = train_test_split(
        df,
        train_size=cfg.subsample_n,
        stratify=df["label"],
        random_state=cfg.seed
    )
    df = df.reset_index(drop=True)

df[["target", "label"]].describe()


(1804874, 45)


Unnamed: 0,target,label
count,200000.0,200000.0
mean,0.103228,0.07997
std,0.197294,0.271247
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.166667,0.0
max,1.0,1.0


## Train/Val/Test split

In [13]:
X = df["comment_text"].astype(str).values
y = df["label"].values

# First: split out test
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=cfg.test_size,
    stratify=y,
    random_state=cfg.seed
)

# Then: split train/val from remaining
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=cfg.val_size,
    stratify=y_trainval,
    random_state=cfg.seed
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))
print("Pos rate train/val/test:", y_train.mean(), y_val.mean(), y_test.mean())


Train: 162000 Val: 18000 Test: 20000
Pos rate train/val/test: 0.07997530864197532 0.07994444444444444 0.07995


## Preprocessing: tokenization + stemming + lemmatization

In [14]:
# NLTK downloads (run once)
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

_url_re = re.compile(r"https?://\S+|www\.\S+")
_html_re = re.compile(r"<.*?>")
_user_re = re.compile(r"@\w+")
_non_ascii_re = re.compile(r"[^\x00-\x7F]+")

_token_re = re.compile(r"[A-Za-z']+")

CLEAN_TEXT = True
REMOVE_STOPWORDS = True

def clean_text(text: str) -> str:
    text = text.lower()
    text = _url_re.sub(" ", text)
    text = _html_re.sub(" ", text)
    text = _user_re.sub(" ", text)
    text = _non_ascii_re.sub(" ", text)
    return text

def basic_tokenize(text: str):
    if CLEAN_TEXT:
        text = clean_text(text)
    else:
        text = text.lower()
    toks = _token_re.findall(text)
    if REMOVE_STOPWORDS:
        toks = [t for t in toks if t not in stop_words]
    return toks

def stem_text(text: str) -> str:
    toks = basic_tokenize(text)
    toks = [stemmer.stem(t) for t in toks]
    return " ".join(toks)

def lemma_text(text: str) -> str:
    toks = basic_tokenize(text)
    toks = [lemmatizer.lemmatize(t) for t in toks]
    return " ".join(toks)

# Choose one:
PREPROC_MODE = "raw"  # "raw" | "stem" | "lemma"

def preprocess_array(arr):
    if PREPROC_MODE == "raw":
        if CLEAN_TEXT or REMOVE_STOPWORDS:
            return np.array([" ".join(basic_tokenize(x)) for x in tqdm(arr, desc=f"preprocess={PREPROC_MODE}")], dtype=object)
        return arr
    fn = stem_text if PREPROC_MODE == "stem" else lemma_text
    return np.array([fn(x) for x in tqdm(arr, desc=f"preprocess={PREPROC_MODE}")], dtype=object)
X_train_p = preprocess_array(X_train)
X_val_p   = preprocess_array(X_val)
X_test_p  = preprocess_array(X_test)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stjepan.vinski\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\stjepan.vinski\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stjepan.vinski\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
preprocess=raw: 100%|██████████| 162000/162000 [00:02<00:00, 60441.32it/s]
preprocess=raw: 100%|██████████| 18000/18000 [00:00<00:00, 62636.82it/s]
preprocess=raw: 100%|██████████| 20000/20000 [00:00<00:00, 60257.84it/s]


## Common evaluation helper

In [15]:
def eval_binary(y_true, y_pred, name="model", show_report=False):
    acc = accuracy_score(y_true, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    print(f"{name}: acc={acc:.4f} precision={p:.4f} recall={r:.4f} f1={f1:.4f}")
    if show_report:
        print("Confusion matrix:")
        print(confusion_matrix(y_true, y_pred))
        print("Classification report:")
        print(classification_report(y_true, y_pred, digits=4, zero_division=0))
    return {"acc": acc, "precision": p, "recall": r, "f1": f1}


## Bag‑of‑Words + LogReg (baseline)

In [None]:
bow = CountVectorizer(
    max_features=cfg.max_features,
    ngram_range=cfg.ngram_range,
    min_df=cfg.min_df
)

Xtr_bow = bow.fit_transform(X_train_p)
Xva_bow = bow.transform(X_val_p)
Xte_bow = bow.transform(X_test_p)

logreg_bow = LogisticRegression(
    max_iter=200,
    n_jobs=-1,
    class_weight="balanced"
)

logreg_bow.fit(Xtr_bow, y_train)

val_pred = logreg_bow.predict(Xva_bow)
test_pred = logreg_bow.predict(Xte_bow)

metrics_bow_val = eval_binary(y_val, val_pred, "BOW+LogReg (val)")
metrics_bow_test = eval_binary(y_test, test_pred, "BOW+LogReg (test)")




BOW+LogReg (val): acc=0.9147 precision=0.4753 recall=0.6484 f1=0.5485
BOW+LogReg (test): acc=0.9193 precision=0.4964 recall=0.6842 f1=0.5753


## TF-IDF + Logistic Regression (baseline)

### Vectorize TF-IDF

In [17]:
tfidf = TfidfVectorizer(
    max_features=cfg.max_features,
    ngram_range=cfg.ngram_range,
    min_df=cfg.min_df
)

Xtr_tfidf = tfidf.fit_transform(X_train_p)
Xva_tfidf = tfidf.transform(X_val_p)
Xte_tfidf = tfidf.transform(X_test_p)

Xtr_tfidf.shape, Xva_tfidf.shape


((162000, 50000), (18000, 50000))

### Logistic Regression baseline

In [18]:
logreg = LogisticRegression(
    max_iter=200,
    n_jobs=-1,
    class_weight="balanced"  # useful for skew
)

logreg.fit(Xtr_tfidf, y_train)

val_pred = logreg.predict(Xva_tfidf)
test_pred = logreg.predict(Xte_tfidf)

metrics_logreg_val = eval_binary(y_val, val_pred, "TFIDF+LogReg (val)")
metrics_logreg_test = eval_binary(y_test, test_pred, "TFIDF+LogReg (test)")




TFIDF+LogReg (val): acc=0.9057 precision=0.4460 recall=0.7408 f1=0.5568
TFIDF+LogReg (test): acc=0.9080 precision=0.4554 recall=0.7699 f1=0.5723


## TF-IDF + MLP

### Sparse TF-IDF

In [19]:
class SparseTfidfDataset(Dataset):
    def __init__(self, X_csr, y):
        self.X = X_csr.tocsr()
        self.y = y.astype(np.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        row = self.X[idx]
        x = torch.from_numpy(row.toarray().ravel()).float()
        y = torch.tensor(self.y[idx]).float()
        return x, y

train_ds = SparseTfidfDataset(Xtr_tfidf, y_train)
val_ds   = SparseTfidfDataset(Xva_tfidf, y_val)
test_ds  = SparseTfidfDataset(Xte_tfidf, y_test)

train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False)

Xtr_tfidf.shape


(162000, 50000)

### MLP model

In [20]:
class TfidfMLP(nn.Module):
    def __init__(self, in_dim, hidden=256, dropout=0.3, activation="relu"):
        super().__init__()
        self.activation = activation.lower()
        self.fc1 = nn.Linear(in_dim, hidden)
        self.bn1 = nn.BatchNorm1d(hidden)
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden, 1)

        # weight init
        if self.activation == "relu":
            nn.init.kaiming_normal_(self.fc1.weight)
        else:
            nn.init.xavier_normal_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.xavier_normal_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)

    def forward(self, x):
        x = self.bn1(self.fc1(x))
        if self.activation == "relu":
            x = F.relu(x)
        elif self.activation == "tanh":
            x = torch.tanh(x)
        elif self.activation == "sigmoid":
            x = torch.sigmoid(x)
        else:
            raise ValueError("activation must be relu/tanh/sigmoid")

        x = self.drop(x)
        return self.fc2(x).squeeze(1)  # logits


### Train/eval

In [21]:
def make_optimizer(model):
    if cfg.optimizer.lower() == "adam":
        return torch.optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    if cfg.optimizer.lower() == "sgd":
        return torch.optim.SGD(model.parameters(), lr=cfg.lr, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
    raise ValueError("optimizer must be adam/sgd")

@torch.no_grad()
def eval_loader(model, loader):
    model.eval()
    ys, ps = [], []
    for x, y in loader:
        x = x.to(device)
        logits = model(x)
        prob = torch.sigmoid(logits).cpu().numpy()
        ys.append(y.numpy())
        ps.append(prob)
    y_true = np.concatenate(ys)
    y_prob = np.concatenate(ps)
    y_pred = (y_prob >= 0.5).astype(int)
    return y_true.astype(int), y_prob, y_pred

def fit_binary(model, train_loader, val_loader, epochs=3):
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    opt = make_optimizer(model)

    best_f1 = -1
    best_state = None

    for ep in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        n = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)

            opt.zero_grad(set_to_none=True)
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()

            if cfg.grad_clip_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip_norm)

            opt.step()

            bs = x.size(0)
            total_loss += loss.item() * bs
            n += bs

        yv, pv, yv_pred = eval_loader(model, val_loader)
        m = eval_binary(yv, yv_pred, f"MLP val ep{ep}")
        print(f"Epoch {ep}/{epochs} train_loss={total_loss/n:.4f}")

        if m["f1"] > best_f1:
            best_f1 = m["f1"]
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)

    return model


### Train the MLP

In [22]:
cfg.optimizer = "adam"
cfg.lr = 1e-3
cfg.epochs = 3

mlp = TfidfMLP(in_dim=Xtr_tfidf.shape[1], hidden=256, dropout=0.3, activation="relu")
mlp = fit_binary(mlp, train_loader, val_loader, epochs=cfg.epochs)

yt, pt, ypred = eval_loader(mlp, test_loader)
metrics_mlp_test = eval_binary(yt, ypred, "TFIDF+MLP (test)")


MLP val ep1: acc=0.9406 precision=0.7387 recall=0.3968 f1=0.5163
Epoch 1/3 train_loss=0.2128
MLP val ep2: acc=0.9386 precision=0.6951 recall=0.4135 f1=0.5185
Epoch 2/3 train_loss=0.1166
MLP val ep3: acc=0.9356 precision=0.6346 recall=0.4587 f1=0.5325
Epoch 3/3 train_loss=0.0754
TFIDF+MLP (test): acc=0.9399 precision=0.6653 recall=0.5009 f1=0.5715


## LSTM and BiLSTM+Attention

### Build vocabulary + encode sequences

In [23]:
PAD = 0
UNK = 1

def build_vocab(texts, vocab_size=50_000):
    counter = Counter()
    for t in tqdm(texts, desc="build_vocab"):
        counter.update(basic_tokenize(t))
    most = counter.most_common(vocab_size - 2)
    stoi = {"<PAD>": PAD, "<UNK>": UNK}
    for i, (w, _) in enumerate(most, start=2):
        stoi[w] = i
    return stoi

stoi = build_vocab(X_train_p, vocab_size=cfg.vocab_size)
itos_size = len(stoi)
itos_size


build_vocab:   0%|          | 0/162000 [00:00<?, ?it/s]

build_vocab: 100%|██████████| 162000/162000 [00:02<00:00, 68848.32it/s]


50000

In [24]:
def encode_text(text, stoi, max_len=200):
    toks = basic_tokenize(text)
    ids = [stoi.get(w, UNK) for w in toks[:max_len]]
    if len(ids) == 0:
        ids = [UNK]
    return ids


def pad_batch(batch_ids, pad_id=PAD):
    lens = torch.tensor([max(1, len(x)) for x in batch_ids], dtype=torch.long)
    maxlen = int(lens.max().item())
    padded = torch.full((len(batch_ids), maxlen), pad_id, dtype=torch.long)

    for i, ids in enumerate(batch_ids):
        if len(ids) == 0:
            ids = [UNK]
        padded[i, :len(ids)] = torch.tensor(ids, dtype=torch.long)

    return padded, lens



### Sequence Dataset + collate_fn

In [25]:
class SeqDataset(Dataset):
    def __init__(self, texts, labels, stoi, max_len):
        self.texts = texts
        self.labels = labels.astype(np.float32)
        self.stoi = stoi
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = encode_text(self.texts[idx], self.stoi, self.max_len)
        y = self.labels[idx]
        return ids, y

def collate_fn(batch):
    ids_list, ys = zip(*batch)
    x, lens = pad_batch(ids_list, pad_id=PAD)
    y = torch.tensor(ys, dtype=torch.float32)
    return x, lens, y

seq_train = SeqDataset(X_train_p, y_train, stoi, cfg.seq_max_len)
seq_val   = SeqDataset(X_val_p, y_val, stoi, cfg.seq_max_len)
seq_test  = SeqDataset(X_test_p, y_test, stoi, cfg.seq_max_len)

seq_train_loader = DataLoader(seq_train, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn)
seq_val_loader   = DataLoader(seq_val, batch_size=cfg.batch_size, shuffle=False, collate_fn=collate_fn)
seq_test_loader  = DataLoader(seq_test, batch_size=cfg.batch_size, shuffle=False, collate_fn=collate_fn)


### LSTM classifier

In [26]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden=128, layers=1, dropout=0.3, bidir=False):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD)
        self.lstm = nn.LSTM(
            emb_dim, hidden, num_layers=layers,
            batch_first=True, dropout=dropout if layers > 1 else 0.0,
            bidirectional=bidir
        )
        out_dim = hidden * (2 if bidir else 1)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(out_dim, 1)

        nn.init.xavier_normal_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, lens):
        emb = self.emb(x)
        packed = nn.utils.rnn.pack_padded_sequence(emb, lens.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h, c) = self.lstm(packed)

        # last layer hidden
        if self.lstm.bidirectional:
            h_last = torch.cat([h[-2], h[-1]], dim=1)
        else:
            h_last = h[-1]

        h_last = self.drop(h_last)
        logits = self.fc(h_last).squeeze(1)
        return logits


### Train/eval loops for sequence models

In [27]:
@torch.no_grad()
def eval_seq(model, loader):
    model.eval()
    ys, ps = [], []
    for x, lens, y in loader:
        x, lens = x.to(device), lens.to(device)
        logits = model(x, lens)
        prob = torch.sigmoid(logits).cpu().numpy()
        ys.append(y.numpy())
        ps.append(prob)
    y_true = np.concatenate(ys).astype(int)
    y_prob = np.concatenate(ps)
    y_pred = (y_prob >= 0.5).astype(int)
    return y_true, y_prob, y_pred

def fit_seq(model, train_loader, val_loader, epochs=3):
    model = model.to(device)
    crit = nn.BCEWithLogitsLoss()
    opt = make_optimizer(model)

    best_f1 = -1
    best_state = None

    for ep in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        n = 0

        for x, lens, y in train_loader:
            x, lens, y = x.to(device), lens.to(device), y.to(device)
            opt.zero_grad(set_to_none=True)
            logits = model(x, lens)
            loss = crit(logits, y)
            loss.backward()

            if cfg.grad_clip_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip_norm)

            opt.step()

            bs = x.size(0)
            total_loss += loss.item() * bs
            n += bs

        yv, pv, yv_pred = eval_seq(model, val_loader)
        m = eval_binary(yv, yv_pred, f"SEQ val ep{ep}")
        print(f"Epoch {ep}/{epochs} train_loss={total_loss/n:.4f}")

        if m["f1"] > best_f1:
            best_f1 = m["f1"]
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)

    return model


### Train LSTM

In [28]:
cfg.optimizer = "adam"
cfg.lr = 1e-3
cfg.epochs = 3
cfg.grad_clip_norm = 1.0  # very relevant for RNN/LSTM

lstm = LSTMClassifier(vocab_size=len(stoi), emb_dim=cfg.emb_dim, hidden=cfg.lstm_hidden,
                      layers=cfg.lstm_layers, dropout=cfg.dropout, bidir=False)
lstm = fit_seq(lstm, seq_train_loader, seq_val_loader, epochs=cfg.epochs)

yt, pt, ypred = eval_seq(lstm, seq_test_loader)
metrics_lstm_test = eval_binary(yt, ypred, "LSTM (test)")


SEQ val ep1: acc=0.9282 precision=0.7829 recall=0.1404 f1=0.2381
Epoch 1/3 train_loss=0.2686
SEQ val ep2: acc=0.9378 precision=0.7243 recall=0.3579 f1=0.4791
Epoch 2/3 train_loss=0.2038
SEQ val ep3: acc=0.9433 precision=0.7696 recall=0.4156 f1=0.5397
Epoch 3/3 train_loss=0.1711
LSTM (test): acc=0.9416 precision=0.7575 recall=0.3965 f1=0.5205


## BiLSTM + Attention

### Additive attention on token outputs

In [29]:
class BiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden=128, layers=1, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD)
        self.lstm = nn.LSTM(
            emb_dim, hidden, num_layers=layers,
            batch_first=True, dropout=dropout if layers > 1 else 0.0,
            bidirectional=True
        )
        self.attn = nn.Linear(hidden * 2, 1)  # score each timestep
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden * 2, 1)

        nn.init.xavier_normal_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, lens):
        emb = self.emb(x)

        packed = nn.utils.rnn.pack_padded_sequence(emb, lens.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # (B,T,H*2)

        # mask padding
        B, T, _ = out.shape
        mask = torch.arange(T, device=lens.device).unsqueeze(0) >= lens.unsqueeze(1)  # (B,T) True for pad

        scores = self.attn(out).squeeze(-1)  # (B,T)
        scores = scores.masked_fill(mask, -1e9)
        weights = torch.softmax(scores, dim=1)  # (B,T)

        context = torch.bmm(weights.unsqueeze(1), out).squeeze(1)  # (B,H*2)
        context = self.drop(context)
        logits = self.fc(context).squeeze(1)
        return logits


### Train attention model

In [30]:
attn_model = BiLSTMAttention(vocab_size=len(stoi), emb_dim=cfg.emb_dim, hidden=cfg.lstm_hidden,
                            layers=cfg.lstm_layers, dropout=cfg.dropout)
attn_model = fit_seq(attn_model, seq_train_loader, seq_val_loader, epochs=cfg.epochs)

yt, pt, ypred = eval_seq(attn_model, seq_test_loader)
metrics_attn_test = eval_binary(yt, ypred, "BiLSTM+Attention (test)", show_report=True)


SEQ val ep1: acc=0.9388 precision=0.7363 recall=0.3648 f1=0.4879
Epoch 1/3 train_loss=0.2363
SEQ val ep2: acc=0.9414 precision=0.7207 recall=0.4357 f1=0.5431
Epoch 2/3 train_loss=0.1832
SEQ val ep3: acc=0.9449 precision=0.7738 recall=0.4399 f1=0.5609
Epoch 3/3 train_loss=0.1651
BiLSTM+Attention (test): acc=0.9441 precision=0.7780 recall=0.4209 f1=0.5463
Confusion matrix:
[[18209   192]
 [  926   673]]
Classification report:
              precision    recall  f1-score   support

           0     0.9516    0.9896    0.9702     18401
           1     0.7780    0.4209    0.5463      1599

    accuracy                         0.9441     20000
   macro avg     0.8648    0.7052    0.7582     20000
weighted avg     0.9377    0.9441    0.9363     20000



## Autoencoder (MSE) on TF-IDF + classifier

### Autoencoder on TF-IDF vectors

In [31]:
class TfidfAutoencoder(nn.Module):
    def __init__(self, in_dim, bottleneck=256):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Linear(in_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, bottleneck),
            nn.ReLU()
        )
        self.dec = nn.Sequential(
            nn.Linear(bottleneck, 1024),
            nn.ReLU(),
            nn.Linear(1024, in_dim)
        )

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        z = self.enc(x)
        xhat = self.dec(z)
        return z, xhat


In [None]:
# Reduce the TF‑IDF feature count to keep RAM reasonable
tfidf_ae = TfidfVectorizer(max_features=20_000, ngram_range=(1,2), min_df=2)
Xtr_ae = tfidf_ae.fit_transform(X_train_p)
Xva_ae = tfidf_ae.transform(X_val_p)
Xte_ae = tfidf_ae.transform(X_test_p)

ae_train = SparseTfidfDataset(Xtr_ae, y_train)
ae_val   = SparseTfidfDataset(Xva_ae, y_val)
ae_test  = SparseTfidfDataset(Xte_ae, y_test)

ae_train_loader = DataLoader(ae_train, batch_size=256, shuffle=True)
ae_val_loader   = DataLoader(ae_val, batch_size=256, shuffle=False)
ae_test_loader  = DataLoader(ae_test, batch_size=256, shuffle=False)

in_dim = Xtr_ae.shape[1]


In [33]:
def train_autoencoder(ae, loader, epochs=3):
    ae = ae.to(device)
    opt = torch.optim.Adam(ae.parameters(), lr=1e-3, weight_decay=1e-5)
    loss_fn = nn.MSELoss()

    for ep in range(1, epochs+1):
        ae.train()
        total = 0.0
        n = 0
        for x, _ in loader:
            x = x.to(device)
            opt.zero_grad(set_to_none=True)
            z, xhat = ae(x)
            loss = loss_fn(xhat, x)
            loss.backward()
            opt.step()
            total += loss.item() * x.size(0)
            n += x.size(0)
        print(f"AE epoch {ep}/{epochs} mse={total/n:.6f}")

    return ae

ae = TfidfAutoencoder(in_dim=in_dim, bottleneck=256)
ae = train_autoencoder(ae, ae_train_loader, epochs=3)


AE epoch 1/3 mse=0.000050
AE epoch 2/3 mse=0.000050
AE epoch 3/3 mse=0.000050


### Encode TF-IDF through AE and train a classifier on embeddings

In [34]:
@torch.no_grad()
def encode_dataset(ae, loader):
    ae.eval()
    Z, Y = [], []
    for x, y in loader:
        x = x.to(device)
        z, _ = ae(x)
        Z.append(z.cpu().numpy())
        Y.append(y.numpy())
    return np.vstack(Z), np.concatenate(Y).astype(int)

Ztr, Ytr = encode_dataset(ae, ae_train_loader)
Zva, Yva = encode_dataset(ae, ae_val_loader)
Zte, Yte = encode_dataset(ae, ae_test_loader)

clf = LogisticRegression(max_iter=300, class_weight="balanced")
clf.fit(Ztr, Ytr)

pred = clf.predict(Zte)
metrics_ae_test = eval_binary(Yte, pred, "AE-Embeddings + LogReg (test)")


AE-Embeddings + LogReg (test): acc=0.9201 precision=0.0000 recall=0.0000 f1=0.0000


## Final comparison table - all models

In [35]:
rows = []
def add(name, metrics):
    rows.append({"model": name, **metrics})

add("BOW+LogReg", metrics_bow_test)
add("TFIDF+LogReg", metrics_logreg_test)
add("TFIDF+MLP", metrics_mlp_test)
add("LSTM", metrics_lstm_test)
add("BiLSTM+Attention", metrics_attn_test)
add("AE+LogReg", metrics_ae_test)

pd.DataFrame(rows).sort_values("f1", ascending=False)

Unnamed: 0,model,acc,precision,recall,f1
0,BOW+LogReg,0.91925,0.49637,0.684178,0.575335
1,TFIDF+LogReg,0.908,0.45542,0.769856,0.572292
2,TFIDF+MLP,0.93995,0.665282,0.500938,0.571531
4,BiLSTM+Attention,0.9441,0.778035,0.420888,0.546266
3,LSTM,0.9416,0.757467,0.396498,0.520525
5,AE+LogReg,0.92005,0.0,0.0,0.0
