In [1]:
import torch
import torch.nn as nn
from typing import Optional

In [2]:
class HTMLAdClassfier(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        tag_vocab_size: int,
        attr_vocab_size: int,
        embed_dim: int = 256,
        num_layers: int = 4,
        num_heads: int = 8,
        dropout: float = 0.2
        max_seq_len: int = 1024,
    ) -> None:
    

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3784719236.py, line 10)

In [3]:
import torch
import torch.nn as nn
from typing import Optional

class HTMLAdClassifier(nn.Module):
    """Neural network that classifies each HTML token (or element start tag) as ad / non‑ad.

    The model is intentionally conservative: a sigmoid output and a tunable probability
    threshold allow you to bias toward *no* prediction rather than a false positive.
    """

    def __init__(
        self,
        vocab_size: int,
        tag_vocab_size: int,
        attr_vocab_size: int,
        embed_dim: int = 256,
        num_layers: int = 4,
        num_heads: int = 8,
        dropout: float = 0.2,
        max_seq_len: int = 1024,
    ) -> None:
        super().__init__()

        # ──────────────────── Embedding blocks ────────────────────
        self.token_embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.tag_embed = nn.Embedding(tag_vocab_size, embed_dim, padding_idx=0)
        self.attr_embed = nn.Embedding(attr_vocab_size, embed_dim, padding_idx=0)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)
        self.embed_dropout = nn.Dropout(dropout)

        # ──────────────────── Transformer encoder ────────────────────
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # ──────────────────── Classification head ────────────────────
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, 1),  # logit
        )

    def forward(
        self,
        token_ids: torch.LongTensor,      # (B, L)
        tag_ids: torch.LongTensor,        # (B, L)
        attr_ids: torch.LongTensor,       # (B, L)
        pos_ids: torch.LongTensor,        # (B, L)
        attention_mask: Optional[torch.BoolTensor] = None,  # (B, L)
    ) -> torch.Tensor:
        """Return per‑token logits (before sigmoid)."""
        x = (
            self.token_embed(token_ids)
            + self.tag_embed(tag_ids)
            + self.attr_embed(attr_ids)
            + self.pos_embed(pos_ids)
        )
        x = self.embed_dropout(x)

        x = self.encoder(x, src_key_padding_mask=attention_mask)
        logits = self.classifier(x).squeeze(-1)  # (B, L)
        return logits

    # -------------------------------------------------------------
    # Convenience helpers
    # -------------------------------------------------------------
    @staticmethod
    def probability(logits: torch.Tensor) -> torch.Tensor:
        """Convert logits to probabilities with a numerically stable sigmoid."""
        return torch.sigmoid(logits)

    @staticmethod
    def prediction(logits: torch.Tensor, threshold: float = 0.9) -> torch.Tensor:
        """Return boolean mask of predictions above threshold.
        A high default threshold keeps false‑positives low.
        """
        return torch.sigmoid(logits) > threshold
