In [2]:
from __future__ import annotations

import json
import pathlib
from dataclasses import dataclass, asdict
from typing import List

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader, random_split

In [3]:
print(f"MPS available: {torch.backends.mps.is_available()}")

MPS available: True


In [40]:
#  Hyper‑parameters
MAX_LEN      = 32
EMBED_DIM    = 32
NUM_FILTERS  = 64
KERNEL_SIZES = (3, 4, 5)
BATCH_SIZE   = 128
EPOCHS       = 15
LR           = 3e-4
DATA_DIR     = pathlib.Path("dataset")
DEVICE       = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

In [41]:
#CHECKPOINT
@dataclass
class Checkpoint:
    vocab: dict
    state_dict: dict
    hyper_params: dict

def save_checkpoint(fp: str, model: nn.Module, vocab: dict, **hparams) -> None:
    ckpt = Checkpoint(vocab, model.state_dict(), hparams)
    torch.save(asdict(ckpt), fp)
    print(f"✓ checkpoint saved → {fp}")

def load_checkpoint(fp: str, device: str = "cpu"):
    data = torch.load(fp, map_location=device)
    vocab = data["vocab"]
    model = CharCNNClassifier(len(vocab) + 1).to(device)
    model.load_state_dict(data["state_dict"])
    model.eval()
    return model, vocab, data.get("hyper_params", {})

In [42]:
def read_lines(path: pathlib.Path) -> List[str]:
    with path.open(encoding="utf-8") as f:
        return [ln.strip() for ln in f if ln.strip()]

# Load raw strings
ads      = read_lines(DATA_DIR / "is_ad_combines_final.txt")
non_ads  = read_lines(DATA_DIR / "non_ad.txt")
all_strs = ads + non_ads
all_lbls = [1] * len(ads) + [0] * len(non_ads)

# Build vocabulary (character‑level, lower‑cased)
chars      = sorted({c.lower() for c in "".join(all_strs)})
char2idx   = {ch: i + 1 for i, ch in enumerate(chars)}  # 0 = PAD/UNK
vocab_size = len(char2idx) + 1


In [43]:
def encode(s: str, max_len: int = MAX_LEN) -> torch.Tensor:
    idxs = [char2idx.get(c.lower(), 0) for c in s][:max_len]
    idxs += [0] * (max_len - len(idxs))
    return torch.tensor(idxs, dtype=torch.long)

In [44]:
class AdDataset(Dataset):
    def __init__(self, strings: List[str], labels: List[int]):
        self.strings, self.labels = strings, labels

    def __len__(self):
        return len(self.strings)

    def __getitem__(self, idx):
        return encode(self.strings[idx]), torch.tensor(self.labels[idx], dtype=torch.float32)

In [45]:
#  Model definition – Character CNN

class CharCNNClassifier(nn.Module):
    def __init__(self, vocab: int):
        super().__init__()
        self.embed = nn.Embedding(vocab, EMBED_DIM, padding_idx=0)
        self.convs = nn.ModuleList(
            nn.Conv1d(EMBED_DIM, NUM_FILTERS, k) for k in KERNEL_SIZES
        )
        self.classifier = nn.Sequential(
            nn.Linear(NUM_FILTERS * len(KERNEL_SIZES), 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (B, L)
        x = self.embed(x).transpose(1, 2)                # (B, E, L)
        feats = [torch.relu(conv(x)).max(dim=2).values for conv in self.convs]
        x = torch.cat(feats, dim=1)
        return self.classifier(x).squeeze(1)             # (B,)

In [49]:
#  Training & evaluation helpers

def run_epoch(loader: DataLoader, model: nn.Module, criterion, optimizer=None):
    is_train = optimizer is not None
    model.train() if is_train else model.eval()
    total_loss, total_correct, n = 0.0, 0, 0
    with torch.set_grad_enabled(is_train):
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            logits = model(x)
            loss = criterion(logits, y)
            if is_train:
                optimizer.zero_grad(); loss.backward(); optimizer.step()
            total_loss += loss.item() * y.size(0)
            preds = (torch.sigmoid(logits) >= 0.5)
            total_correct += (preds == y.bool()).sum().item()
            n += y.size(0)
    return total_loss / n, total_correct / n

def evaluate(model: nn.Module, loader: DataLoader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(DEVICE)
            probs = torch.sigmoid(model(x)).cpu()
            y_true.extend(y.numpy())
            y_pred.extend((probs >= 0.9).long().numpy())
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    acc = (torch.tensor(y_true) == torch.tensor(y_pred)).float().mean().item()
    return acc, prec, rec, f1

In [50]:
dataset = AdDataset(all_strs, all_lbls)

# 70/30 train‑test split (fixed seed for reproducibility)
test_len  = int(len(dataset) * 0.30)
train_len = len(dataset) - test_len
train_ds, test_ds = random_split(dataset, [train_len, test_len],
                                 generator=torch.Generator().manual_seed(42))
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

model = CharCNNClassifier(vocab_size).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    tr_loss, tr_acc = run_epoch(train_dl, model, criterion, optimizer)
    print(f"epoch {epoch:2d} | train loss {tr_loss:.4f} | train acc {tr_acc:.3f}")

# save final model
save_checkpoint(
    "models/model_v5.3.pth",
    model,
    vocab=char2idx,
    max_len=MAX_LEN,
    embed_dim=EMBED_DIM,
    num_filters=NUM_FILTERS,
    kernel_sizes=KERNEL_SIZES,
)

# evaluate on held‑out test set
acc, prec, rec, f1 = evaluate(model, test_dl)
print("\n🧪 Test‑set metrics")
print(f"accuracy  : {acc:.3f}")
print(f"precision : {prec:.3f}")
print(f"recall    : {rec:.3f}")
print(f"F1‑score  : {f1:.3f}")

epoch  1 | train loss 0.3052 | train acc 0.879
epoch  2 | train loss 0.0549 | train acc 0.985
epoch  3 | train loss 0.0373 | train acc 0.989
epoch  4 | train loss 0.0289 | train acc 0.991
epoch  5 | train loss 0.0237 | train acc 0.993
epoch  6 | train loss 0.0204 | train acc 0.994
epoch  7 | train loss 0.0170 | train acc 0.995
epoch  8 | train loss 0.0139 | train acc 0.996
epoch  9 | train loss 0.0125 | train acc 0.996
epoch 10 | train loss 0.0097 | train acc 0.998
epoch 11 | train loss 0.0080 | train acc 0.998
epoch 12 | train loss 0.0064 | train acc 0.999
epoch 13 | train loss 0.0055 | train acc 0.999
epoch 14 | train loss 0.0046 | train acc 0.999
epoch 15 | train loss 0.0035 | train acc 0.999
✓ checkpoint saved → models/model_v5.3.pth

🧪 Test‑set metrics
accuracy  : 0.991
precision : 0.996
recall    : 0.983
F1‑score  : 0.989


In [51]:
"""
We want to maximize precision since we do not want any false positives
v5: 0.993
v5.1: 0.992
v5.2: 0.995
v5.3: 0.996 with 0.9 threshold
v5.4: 0.997 ^
"""

'\nWe want to maximize precision since we do not want any false positives\nv5: 0.993\nv5.1: 0.992\nv5.2: 0.995\nv5.3: 0.996 with 0.9 threshold\n'