In [3]:
"""
BOW + MLP overfitting experiment for Hungarian Terms and Conditions readability prediction.
Goal: Overfit on a single batch (32 samples) to verify the model can learn.
"""

from typing import Tuple, List, Optional
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import re


class HungarianBOWVectorizer:
    """Bag-of-Words featurizer for Hungarian text.

    ``fit`` learns a vocabulary made of the most frequent tokens, and
    ``transform`` maps texts to L2-normalized term-count rows over that
    vocabulary.
    """

    def __init__(self, max_features: int = 500, min_freq: int = 1) -> None:
        # The vocabulary stays empty until fit() is called.
        self.vocab: dict[str, int] = {}
        self.vocab_size: int = 0
        self.min_freq = min_freq
        self.max_features = max_features

    def _tokenize(self, text: str) -> List[str]:
        """Lowercase ``text``, blank out punctuation and split on whitespace.

        Tokens consisting of a single character are discarded.
        """
        cleaned = re.sub(r"[^\w\sáéíóöőúüű]", " ", text.lower())
        return [piece for piece in cleaned.split() if len(piece) > 1]

    def fit(self, texts: List[str]) -> "HungarianBOWVectorizer":
        """Learn the vocabulary from ``texts`` and return ``self``.

        Words seen fewer than ``min_freq`` times are dropped; of the rest, the
        ``max_features`` most frequent ones are indexed from 0 in descending
        frequency order.
        """
        frequencies: Counter = Counter()
        for document in texts:
            frequencies.update(self._tokenize(document))

        # Stable descending sort: equally frequent words keep first-seen order.
        ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
        kept = [word for word, freq in ranked if freq >= self.min_freq]
        kept = kept[: self.max_features]

        self.vocab = {word: position for position, word in enumerate(kept)}
        self.vocab_size = len(self.vocab)
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """Return one L2-normalized count row per text, shape ``(len(texts), vocab_size)``.

        Tokens that are not in the vocabulary are ignored.
        """
        matrix = np.zeros((len(texts), self.vocab_size), dtype=np.float32)
        for row, document in enumerate(texts):
            for token in self._tokenize(document):
                column = self.vocab.get(token)
                if column is not None:
                    matrix[row, column] += 1

        lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
        # All-zero rows are divided by 1 so they stay zero instead of becoming NaN.
        return matrix / np.where(lengths == 0, 1, lengths)

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """Learn the vocabulary from ``texts`` and vectorize those same texts."""
        return self.fit(texts).transform(texts)


class ReadabilityMLP(nn.Module):
    """One-hidden-layer perceptron that maps feature vectors to class logits.

    Layout: ``Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, num_classes)``.
    Trainable parameter count:
    ``input_dim * hidden_dim + hidden_dim + hidden_dim * num_classes + num_classes``.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 12, num_classes: int = 5) -> None:
        super().__init__()
        # Layers are registered in forward-pass order.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw (unnormalized) class scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


def count_parameters(model: nn.Module) -> int:
    """Return the number of scalar elements in ``model``'s trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def calculate_architecture(vocab_size: int, target_params: int = 6000, num_classes: int = 5) -> int:
    """Pick the hidden width that brings a one-hidden-layer MLP near ``target_params``.

    The parameter count of such a network is

        hidden * (vocab_size + 1 + num_classes) + num_classes

    so the width is the floor of ``(target_params - num_classes)`` divided by
    the per-hidden-unit cost. The result is never smaller than 4.
    """
    cost_per_hidden_unit = vocab_size + 1 + num_classes
    budget = target_params - num_classes
    width = budget // cost_per_hidden_unit
    return width if width > 4 else 4


def load_data(path: str, batch_size: int = 32) -> Tuple[List[str], np.ndarray]:
    """Load the CSV at ``path`` and draw one batch for the overfitting experiment.

    The CSV must contain a ``text`` column and a ``label_numeric`` column.

    When the file holds more than ``batch_size`` rows, up to
    ``batch_size // n_classes + 1`` rows are sampled from each class with a
    fixed seed, in ascending label order. The selection is then cut to
    ``batch_size`` rows, so the highest label may get fewer rows than the
    others. If the per-class draw comes up short (a class has fewer rows than
    its quota), the batch is topped up with a seeded random draw from the rows
    not yet selected, so the batch always has exactly ``batch_size`` rows.
    Otherwise the whole file (at most ``batch_size`` rows) is used.

    Args:
        path: Location of the CSV file.
        batch_size: Number of rows to return.

    Returns:
        ``(texts, labels)`` where ``texts`` is the list of ``text`` values and
        ``labels`` is the ``label_numeric`` column shifted down by one.
    """
    df = pd.read_csv(path)
    print(f"Total samples in dataset: {len(df)}")
    print(f"Label distribution:\n{df['label_numeric'].value_counts().sort_index()}")

    if len(df) > batch_size:
        class_labels = sorted(df["label_numeric"].unique())
        # Quota derived from the classes actually present, not a fixed count of 5.
        per_class = batch_size // max(1, len(class_labels)) + 1

        sampled_indices: List[int] = []
        for label in class_labels:
            label_df = df[df["label_numeric"] == label]
            n_samples = min(len(label_df), per_class)
            sampled_indices.extend(label_df.sample(n=n_samples, random_state=42).index.tolist())
        sampled_indices = sampled_indices[:batch_size]

        # Small classes can leave the batch short; fill it from unused rows.
        shortfall = batch_size - len(sampled_indices)
        if shortfall > 0:
            remaining = df.drop(index=sampled_indices)
            n_extra = min(shortfall, len(remaining))
            sampled_indices.extend(remaining.sample(n=n_extra, random_state=42).index.tolist())

        df_batch = df.loc[sampled_indices]
    else:
        df_batch = df.head(batch_size)

    print(f"\nBatch size: {len(df_batch)}")
    print(f"Batch label distribution:\n{df_batch['label_numeric'].value_counts().sort_index()}")

    texts = df_batch["text"].tolist()
    labels = df_batch["label_numeric"].values - 1  # Convert 1-5 to 0-4

    return texts, labels


def train_overfit(
    model: nn.Module,
    X: torch.Tensor,
    y: torch.Tensor,
    epochs: int = 1000,
    lr: float = 0.01,
    print_every: int = 100,
) -> List[float]:
    """Fit ``model`` to a single batch with full-batch Adam steps.

    Every epoch is one optimizer step on the whole of ``X``/``y`` using
    cross-entropy loss. The model is switched to training mode and updated
    in place.

    Args:
        model: Network mapping ``X`` to class logits.
        X: Input features for the batch.
        y: Integer class targets for the batch.
        epochs: Number of optimizer steps to take.
        lr: Adam learning rate.
        print_every: Print loss and accuracy on the first epoch and on every
            ``print_every``-th epoch. A value of 0 or less turns progress
            output off (0 used to raise ``ZeroDivisionError``).

    Returns:
        The loss of each epoch, measured before that epoch's weight update.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    losses: List[float] = []
    model.train()

    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is needed for both the history and the log line.
        loss_value = loss.item()
        losses.append(loss_value)

        step = epoch + 1
        if print_every > 0 and (step == 1 or step % print_every == 0):
            # Accuracy comes from the pre-update outputs, matching the reported loss.
            with torch.no_grad():
                predictions = outputs.argmax(dim=1)
                accuracy = (predictions == y).float().mean().item()
            print(f"Epoch {step:4d} | Loss: {loss_value:.4f} | Accuracy: {accuracy * 100:.1f}%")

    return losses


def main() -> None:
    """Run the single-batch overfitting experiment end to end.

    Steps: load one batch from ``DATA_PATH``, build Bag-of-Words features,
    size a small MLP to roughly ``TARGET_PARAMS`` parameters, train it on the
    batch, then report overall and per-class accuracy on that same batch
    together with a success / partial / failed verdict.
    """
    print("=" * 60)
    print("BOW + MLP Overfitting Experiment")
    print("Hungarian Terms and Conditions Readability Prediction")
    print("=" * 60)

    # Configuration
    DATA_PATH = "/content/train.csv"
    BATCH_SIZE = 32
    TARGET_PARAMS = 6000
    EPOCHS = 1000
    LEARNING_RATE = 0.01
    NUM_CLASSES = 5  # load_data maps labels 1-5 to 0-4
    ASSUMED_HIDDEN = 12  # hidden width assumed when re-deriving the vocab size

    # Load data
    print("\n[1] Loading data...")
    texts, labels = load_data(DATA_PATH, BATCH_SIZE)

    # Create BOW features
    print("\n[2] Creating BOW features...")
    # max_features controls the input width and therefore the parameter count.
    vectorizer = HungarianBOWVectorizer(max_features=500, min_freq=1)
    X_bow = vectorizer.fit_transform(texts)
    print(f"Vocabulary size: {vectorizer.vocab_size}")
    print(f"BOW shape: {X_bow.shape}")

    # Calculate hidden dim for target params
    hidden_dim = calculate_architecture(vectorizer.vocab_size, TARGET_PARAMS, NUM_CLASSES)
    print(f"\n[3] Building model with hidden_dim={hidden_dim}...")

    # Create model
    model = ReadabilityMLP(input_dim=vectorizer.vocab_size, hidden_dim=hidden_dim, num_classes=NUM_CLASSES)
    n_params = count_parameters(model)
    print(f"Model architecture: {vectorizer.vocab_size} -> {hidden_dim} -> {NUM_CLASSES}")
    print(f"Total parameters: {n_params:,}")

    # If too far from target, adjust vocab size.
    # NOTE(review): with max_features=500 the hidden width above already lands
    # within about 500 parameters of the target, so this looks like a safety net.
    if abs(n_params - TARGET_PARAMS) > 1000:
        print(f"\nAdjusting vocab size to get closer to {TARGET_PARAMS} params...")
        # params = hidden * (vocab + 1 + classes) + classes, solved for vocab:
        #   vocab = (params - classes) // hidden - (1 + classes)
        # The bias/output term is subtracted exactly once.
        target_vocab = (TARGET_PARAMS - NUM_CLASSES) // ASSUMED_HIDDEN - (1 + NUM_CLASSES)
        vectorizer = HungarianBOWVectorizer(max_features=target_vocab, min_freq=1)
        X_bow = vectorizer.fit_transform(texts)

        hidden_dim = calculate_architecture(vectorizer.vocab_size, TARGET_PARAMS, NUM_CLASSES)
        model = ReadabilityMLP(input_dim=vectorizer.vocab_size, hidden_dim=hidden_dim, num_classes=NUM_CLASSES)
        n_params = count_parameters(model)
        print(f"Adjusted vocabulary size: {vectorizer.vocab_size}")
        print(f"Model architecture: {vectorizer.vocab_size} -> {hidden_dim} -> {NUM_CLASSES}")
        print(f"Total parameters: {n_params:,}")

    # Convert to tensors
    X_tensor = torch.tensor(X_bow, dtype=torch.float32)
    y_tensor = torch.tensor(labels, dtype=torch.long)

    # Train (the per-epoch loss history is not needed here)
    print(f"\n[4] Training for {EPOCHS} epochs...")
    print("-" * 50)
    train_overfit(model, X_tensor, y_tensor, epochs=EPOCHS, lr=LEARNING_RATE)

    # Final evaluation
    print("-" * 50)
    print("\n[5] Final evaluation on training batch:")
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)
        predictions = outputs.argmax(dim=1)
        accuracy = (predictions == y_tensor).float().mean().item()
        print(f"Final accuracy: {accuracy * 100:.1f}%")

        # Per-class accuracy, reported with the original 1-based labels
        print("\nPer-class performance:")
        for cls in range(NUM_CLASSES):
            mask = y_tensor == cls
            if mask.sum() > 0:
                cls_acc = (predictions[mask] == y_tensor[mask]).float().mean().item()
                print(f"  Class {cls + 1}: {cls_acc * 100:.1f}% ({mask.sum().item()} samples)")
            else:
                print(f"  Class {cls + 1}: N/A (0 samples)")

    # Check if overfit succeeded
    print("\n" + "=" * 60)
    if accuracy >= 0.95:
        print("✓ SUCCESS: Model successfully overfit on the batch!")
    elif accuracy >= 0.7:
        print("~ PARTIAL: Model learned something but didn't fully overfit.")
    else:
        print("✗ FAILED: Model couldn't overfit on this batch.")
    print("=" * 60)


# Run the experiment only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

BOW + MLP Overfitting Experiment
Hungarian Terms and Conditions Readability Prediction

[1] Loading data...
Total samples in dataset: 2906
Label distribution:
label_numeric
1    132
2    318
3    634
4    925
5    897
Name: count, dtype: int64

Batch size: 32
Batch label distribution:
label_numeric
1    7
2    7
3    7
4    7
5    4
Name: count, dtype: int64

[2] Creating BOW features...
Vocabulary size: 500
BOW shape: (32, 500)

[3] Building model with hidden_dim=11...
Model architecture: 500 -> 11 -> 5
Total parameters: 5,571

[4] Training for 1000 epochs...
--------------------------------------------------
Epoch    1 | Loss: 1.6449 | Accuracy: 21.9%
Epoch  100 | Loss: 0.0061 | Accuracy: 100.0%
Epoch  200 | Loss: 0.0019 | Accuracy: 100.0%
Epoch  300 | Loss: 0.0010 | Accuracy: 100.0%
Epoch  400 | Loss: 0.0006 | Accuracy: 100.0%
Epoch  500 | Loss: 0.0004 | Accuracy: 100.0%
Epoch  600 | Loss: 0.0003 | Accuracy: 100.0%
Epoch  700 | Loss: 0.0002 | Accuracy: 100.0%
Epoch  800 | Loss: 0.00