In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base/transformers/default/1/rust_model.ot
/kaggle/input/roberta-base/transformers/default/1/config.json
/kaggle/input/roberta-base/transformers/default/1/merges.txt
/kaggle/input/roberta-base/transformers/default/1/README.md
/kaggle/input/roberta-base/transformers/default/1/tokenizer.json
/kaggle/input/roberta-base/transformers/default/1/vocab.json
/kaggle/input/roberta-base/transformers/default/1/tf_model.h5
/kaggle/input/roberta-base/transformers/default/1/tokenizer_config.json
/kaggle/input/roberta-base/transformers/default/1/dict.txt
/kaggle/input/roberta-base/transformers/default/1/pytorch_model.bin
/kaggle/input/roberta-base/transformers/default/1/model.safetensors
/kaggle/input/roberta-base/transformers/default/1/flax_model.msgpack
/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv
/kaggle/input/jigsaw/subreddits.csv
/kaggle/input/jigsaw/fe

# utils.management


In [None]:
import gc
import inspect
import logging
import os
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Union

import psutil
import torch


def clean_mem():
    # import gc
    # import os
    # import sys
    # import time
    # import traceback

    # import psutil
    # import torch

    process = psutil.Process(os.getpid())

    # Measure RAM before cleanup
    ram_before = process.memory_info().rss / (1024**2)  # in MB

    # Measure GPU before cleanup
    if torch.cuda.is_available():
        gpu_alloc_before = torch.cuda.memory_allocated() / (1024**2)  # in MB
        gpu_reserved_before = torch.cuda.memory_reserved() / (1024**2)  # in MB
    else:
        gpu_alloc_before = gpu_reserved_before = 0

    # clean all traceback
    if hasattr(sys, "last_traceback"):
        traceback.clear_frames(sys.last_traceback)
        delattr(sys, "last_traceback")
    if hasattr(sys, "last_type"):
        delattr(sys, "last_type")
    if hasattr(sys, "last_value"):
        delattr(sys, "last_value")

    # clean all ipython history
    if "get_ipython" in globals():
        try:
            from IPython import get_ipython

            ip = get_ipython()
            user_ns = ip.user_ns
            ip.displayhook.flush()
            pc = ip.displayhook.prompt_count + 1
            for n in range(1, pc):
                user_ns.pop("_i" + repr(n), None)
            user_ns.update(dict(_i="", _ii="", _iii=""))
            hm = ip.history_manager
            hm.input_hist_parsed[:] = [""] * pc
            hm.input_hist_raw[:] = [""] * pc
            hm._i = hm._ii = hm._iii = hm._i00 = ""
        except Exception as e:
            print("ipython mem could not be cleared")

    # do a garbage collection and flush cuda cache
    gc.collect()
    torch.cuda.empty_cache()

    # Give system a small moment to settle (helps RAM measurement be more accurate)
    time.sleep(0.1)

    # Measure RAM after cleanup
    ram_after = process.memory_info().rss / (1024**2)  # in MB

    # Measure GPU after cleanup
    if torch.cuda.is_available():
        gpu_alloc_after = torch.cuda.memory_allocated() / (1024**2)  # in MB
        gpu_reserved_after = torch.cuda.memory_reserved() / (1024**2)  # in MB
    else:
        gpu_alloc_after = gpu_reserved_after = 0

    # Report freed memory
    print(
        f"RAM freed: {ram_before - ram_after:.2f} MB ({ram_before:.2f} -> {ram_after:.2f})"
    )
    if torch.cuda.is_available():
        print(
            f"GPU allocated freed: {gpu_alloc_before - gpu_alloc_after:.2f} MB ({gpu_alloc_before:.2f} -> {gpu_alloc_after:.2f})"
        )
        print(
            f"GPU reserved freed: {gpu_reserved_before - gpu_reserved_after:.2f} MB ({gpu_reserved_before:.2f} -> {gpu_reserved_after:.2f})"
        )
    else:
        print("No GPU detected.")


def create_logger(
    name: str = "reddit_moderation",
    log_level: str = "INFO",
    log_file: Optional[Union[str, Path]] = None,
    log_dir: Optional[Union[str, Path]] = "logs",
    console_output: bool = True,
    file_output: bool = True,
    format_string: Optional[str] = None,
    max_bytes: int = 10_000_000,  # 10MB
    backup_count: int = 5,
    include_timestamp_in_filename: bool = True,
) -> logging.Logger:
    """
    Create a fully featured logger for the Reddit comment moderation system.

    This logger is designed to handle all aspects of the multi-stage classification
    pipeline including zero-shot classification, fine-tuning, and evaluation.

    Parameters
    ----------
    name : str, optional
        Logger name, by default "reddit_moderation"
    log_level : str, optional
        Logging level ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"),
        by default "INFO"
    log_file : str or Path, optional
        Specific log file path. If None, auto-generates based on name and timestamp
    log_dir : str or Path, optional
        Directory for log files, by default "logs"
    console_output : bool, optional
        Whether to output logs to console, by default True
    file_output : bool, optional
        Whether to output logs to file, by default True
    format_string : str, optional
        Custom log format string, by default None (uses comprehensive format)
    max_bytes : int, optional
        Maximum log file size before rotation, by default 10MB
    backup_count : int, optional
        Number of backup log files to keep, by default 5
    include_timestamp_in_filename : bool, optional
        Whether to include timestamp in log filename, by default True

    Returns
    -------
    logging.Logger
        Configured logger instance ready for use

    Examples
    --------
    >>> # Basic usage
    >>> logger = create_logger()
    >>> logger.info("Starting Reddit comment classification pipeline")

    >>> # Advanced usage for training
    >>> training_logger = create_logger(
    ...     name="distilbert_training",
    ...     log_level="DEBUG",
    ...     log_file="training_session.log"
    ... )
    >>> training_logger.debug("Training batch processed")

    >>> # For evaluation only
    >>> eval_logger = create_logger(
    ...     name="model_evaluation",
    ...     console_output=False,
    ...     log_file="evaluation_results.log"
    ... )
    """

    # Create logger
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_level.upper()))

    # Clear existing handlers to avoid duplication
    logger.handlers.clear()

    # Default comprehensive format for ML workflows
    if format_string is None:
        format_string = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"

    formatter = logging.Formatter(format_string, datefmt="%Y-%m-%d %H:%M:%S")

    # Console handler
    if console_output:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, log_level.upper()))
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # File handler with rotation
    if file_output:
        # Create log directory
        if log_dir:
            log_dir = Path(log_dir)
            log_dir.mkdir(exist_ok=True)

        # Generate log filename
        if log_file is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            if include_timestamp_in_filename:
                log_filename = f"{name}_{timestamp}.log"
            else:
                log_filename = f"{name}.log"
            log_file = log_dir / log_filename if log_dir else Path(log_filename)
        else:
            log_file = Path(log_file)
            if log_dir and not log_file.is_absolute():
                log_file = Path(log_dir) / log_file

        # Create rotating file handler
        from logging.handlers import RotatingFileHandler

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8"
        )
        file_handler.setLevel(getattr(logging, log_level.upper()))
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Add some useful methods to the logger
    def log_dataset_info(dataset, dataset_name="Dataset"):
        """Log dataset information"""
        logger.info(f"{dataset_name} Info:")
        logger.info(f"  - Size: {len(dataset):,} samples")
        logger.info(f"  - Columns: {dataset.column_names}")
        if "labels" in dataset.column_names:
            import numpy as np

            labels = np.array(dataset["labels"])
            unique, counts = np.unique(labels, return_counts=True)
            logger.info(f"  - Label distribution: {dict(zip(unique, counts))}")

    def log_model_info(model, model_name="Model"):
        """Log model information"""
        logger.info(f"{model_name} Info:")
        if hasattr(model, "config"):
            logger.info(f"  - Model type: {model.config.model_type}")
            logger.info(f"  - Hidden size: {model.config.hidden_size}")
            if hasattr(model.config, "num_labels"):
                logger.info(f"  - Number of labels: {model.config.num_labels}")

        # Count parameters
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        logger.info(f"  - Total parameters: {total_params:,}")
        logger.info(f"  - Trainable parameters: {trainable_params:,}")

    def log_training_args(training_args):
        """Log training arguments"""
        logger.info("Training Configuration:")
        logger.info(f"  - Learning rate: {training_args.learning_rate}")
        logger.info(f"  - Batch size: {training_args.per_device_train_batch_size}")
        logger.info(
            f"  - Gradient accumulation: {training_args.gradient_accumulation_steps}"
        )
        logger.info(f"  - Epochs: {training_args.num_train_epochs}")
        logger.info(f"  - Weight decay: {training_args.weight_decay}")
        logger.info(f"  - LR scheduler: {training_args.lr_scheduler_type}")
        logger.info(f"  - Warmup ratio: {training_args.warmup_ratio}")

    def log_metrics(metrics, stage=""):
        """Log evaluation metrics"""
        stage_prefix = f"{stage} " if stage else ""
        logger.info(f"{stage_prefix}Metrics:")
        for metric, value in metrics.items():
            if isinstance(value, float):
                logger.info(f"  - {metric}: {value:.4f}")
            else:
                logger.info(f"  - {metric}: {value}")

    # Attach utility methods to logger
    logger.log_dataset_info = log_dataset_info
    logger.log_model_info = log_model_info
    logger.log_training_args = log_training_args
    logger.log_metrics = log_metrics

    # Log logger creation
    logger.info(f"Logger '{name}' created successfully")
    logger.info(f"Log level: {log_level}")
    if file_output:
        logger.info(f"Log file: {log_file}")

    return logger


# Convenience function for quick setup
def setup_project_logging(debug_mode: bool = False) -> logging.Logger:
    """
    Quick setup for the Reddit moderation project logging.

    Parameters
    ----------
    debug_mode : bool
        If True, sets log level to DEBUG and enables verbose logging

    Returns
    -------
    logging.Logger
        Configured project logger
    """
    log_level = "DEBUG" if debug_mode else "INFO"

    return create_logger(
        name="reddit_moderation_pipeline",
        log_level=log_level,
        log_dir="project_logs",
        include_timestamp_in_filename=True,
    )


def get_ram_usage():
    process = psutil.Process()
    return process.memory_info().rss  # bytes


def free_vars(
    vars_to_delete: List[Union[str, object]],
    namespace: Optional[dict] = None,
    try_gpu: bool = True,
    logger=None,
):
    """
    Deletes variables by name or reference, frees RAM and GPU (PyTorch) memory,
    logs actions via logger if provided.

    Args:
      vars_to_delete: list of variable names (str) or object refs
      namespace: dict to remove names from (defaults to caller's globals())
      try_gpu: clear GPU memory for torch objects
      logger: logging object or None (use print)
    Returns:
      (freed_ram_bytes, freed_gpu_bytes)
    """
    # Setup logger if not provided
    if logger is None:

        def logger(msg):
            print(msg)

    else:
        logger = logger.info

    # Automatic namespace resolution
    if namespace is None:
        # Get frame of the caller, locals then globals
        frame = inspect.currentframe().f_back
        namespace = frame.f_globals

    before_ram = get_ram_usage()
    try:
        import torch
    except ImportError:
        torch = None

    freed_gpu_bytes = 0
    torch_objs = []
    deleted = []

    for var in vars_to_delete:
        if isinstance(var, str):
            obj = namespace.get(var, None)
            if obj is not None:
                deleted.append(var)
                if torch and try_gpu:
                    torch_objs.append(obj)
                del namespace[var]
                logger(f"Deleted variable '{var}'")
            else:
                logger(f"Variable '{var}' not found in namespace")
        else:
            # Try to remove all names referencing the object
            names = [n for n, v in namespace.items() if v is var]
            for n in names:
                del namespace[n]
                deleted.append(n)
                logger(f"Deleted variable '{n}' (by reference)")
            if not names:
                logger(
                    f"Could not find a variable name for object {var!r}, may not be deleted"
                )
            if torch and try_gpu:
                torch_objs.append(var)

    if torch and try_gpu and torch_objs and torch.cuda.is_available():
        before_gpu = torch.cuda.memory_allocated()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        after_gpu = torch.cuda.memory_allocated()
        freed_gpu_bytes = after_gpu - before_gpu
        logger(f"GPU memory freed: {freed_gpu_bytes/(1024**2):.2f} MB")
    # Always run gc
    gc.collect()
    after_ram = get_ram_usage()
    freed_ram_bytes = after_ram - before_ram
    logger(f"RAM memory freed: {freed_ram_bytes/(1024**2):.2f} MB")
    clean_mem()
    # return freed_ram_bytes, freed_gpu_bytes

# utils.preprocess


In [None]:
import heapq
import logging
import re
from functools import partial
from typing import Optional

import markdown2
import pandas as pd
from bs4 import BeautifulSoup
from datasets import Dataset
from unidecode import unidecode


def sanitize_comment(comment):
    # Convert markdown to HTML, then extract the text (HTML tags removed)
    html = markdown2.markdown(comment)
    text = BeautifulSoup(html, features="html.parser").get_text()

    # Convert markdown links [text](url) to just "text"
    # Must be done on original comment, but here we do it on extracted text to be safe
    # To be sure, you can do it before markdown conversion, but here kept as is for simplicity

    # The markdown2 conversion often converts markdown links into HTML anchors,
    # so links should already have URL removed by BeautifulSoup.get_text().
    # However, just in case, let's remove leftover markdown links from original comment first:
    text = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"\1", comment)
    # Then re-run markdown2 and extract text again to clean up
    html = markdown2.markdown(text)
    text = BeautifulSoup(html, features="html.parser").get_text()

    # Replace URLs with the url itself as plain text
    # Extract URLs and replace markdown-style inline URLs [text](url) with url only is already handled,
    # but explicit URLs just in text should be replaced with the URL string, not removed.
    # For example: "visit http://example.com for more" should keep "http://example.com" as is.
    # So to convert URL markdown to plain URLs requires us to find URLs and keep them as text.

    # Here, let's find URLs and replace any markdown link forms to plain URLs if any missed:
    # But since markdown2 and BeautifulSoup stripped them to plain text, raw URLs remain intact.

    # So no need to remove URLs, but ensure any URLs embedded in text like "https://..." remain
    # We can optionally extract and re-insert URLs if you want, but seems not required.

    # Just to be sure, let's convert all URL-like substrings to themselves surrounded by spaces (to separate)
    # This helps if URLs are concatenated with other text.
    url_pattern = re.compile(r"((?:http|https)://[^\s]+|www\.[^\s]+)", re.IGNORECASE)
    text = url_pattern.sub(lambda m: m.group(0), text)

    # Convert non-unicode characters to unicode (ASCII compatible)
    text = unidecode(text)

    # Normalize whitespace
    text = " ".join(text.split()).lower()

    return text


def create_master_dataset(
    train: pd.DataFrame,
    test: pd.DataFrame,
    logger: Optional[logging.Logger] = None,
    positive_examples_to_consider: list = [1, 2],
    negative_example_to_consider: list = [1, 2],
    return_train_main_as_validation: bool = True,
) -> pd.DataFrame:
    if logger:
        logger.info("Starting master dataset creation")
        logger.info(f"Input - Train: {len(train)} rows, Test: {len(test)} rows")

    # 1. From train: use body
    if logger:
        logger.debug("Extracting main training data from 'body' column")
    train_main = train[["body", "rule", "subreddit", "rule_violation"]].copy()
    train_main = train_main.rename(
        columns={"body": "comment", "rule_violation": "violation"}
    )
    if logger:
        logger.debug(f"Main training data: {len(train_main)} records")

    # 2. From train AND test: from positive/negative examples

    # Helper to melt examples from a single dataframe
    def extract_examples(
        df, prefix_pos="positive_example_", prefix_neg="negative_example_"
    ):
        records = []

        # For positive examples
        for i in positive_examples_to_consider:
            col = f"{prefix_pos}{i}"
            # Ensure column exists and drop NA
            if col in df.columns:
                subdf = df[["rule", "subreddit", col]].dropna(subset=[col])
                if logger:
                    logger.debug(
                        f"Extracting {len(subdf)} positive examples from {col}"
                    )
                for _, row in subdf.iterrows():
                    records.append(
                        {
                            "comment": row[col],
                            "rule": row["rule"],
                            "subreddit": row["subreddit"],
                            "violation": 1,
                        }
                    )

        # For negative examples
        for i in negative_example_to_consider:
            col = f"{prefix_neg}{i}"
            if col in df.columns:
                subdf = df[["rule", "subreddit", col]].dropna(subset=[col])
                if logger:
                    logger.debug(
                        f"Extracting {len(subdf)} negative examples from {col}"
                    )
                for _, row in subdf.iterrows():
                    records.append(
                        {
                            "comment": row[col],
                            "rule": row["rule"],
                            "subreddit": row["subreddit"],
                            "violation": 0,
                        }
                    )

        return pd.DataFrame(records)

    if logger:
        logger.debug("Extracting examples from train dataset")
    train_examples = extract_examples(train)
    if logger:
        logger.debug(f"Train examples extracted: {len(train_examples)} records")

    if logger:
        logger.debug("Extracting examples from test dataset")
    try:
        test_examples = extract_examples(test)
    except Exception as e:
        test_examples = []
    if logger:
        logger.debug(f"Test examples extracted: {len(test_examples)} records")

    # Concatenate all parts
    if logger:
        logger.info("Concatenating all dataset parts")
    files_to_concat = [train_examples]
    if len(test_examples) > 0:
        files_to_concat.append(test_examples)
    if not return_train_main_as_validation:
        files_to_concat.append(train_main)
    master_df = pd.concat(files_to_concat, ignore_index=True)

    # Optional: drop rows with empty or null comment if any sneaked in
    initial_size = len(master_df)
    master_df = master_df.dropna(subset=["comment"])
    if logger and len(master_df) < initial_size:
        logger.warning(
            f"Dropped {initial_size - len(master_df)} rows with null comments"
        )

    # Reset index for cleanliness
    master_df = master_df.reset_index(drop=True)

    if logger:
        logger.info(
            f"Master dataset created successfully: {len(master_df)} total records"
        )
        logger.info(
            f"Violation distribution: {master_df['violation'].value_counts().to_dict()}"
        )

    return master_df, train_main

# imports


In [4]:
import wandb

wandb.init(mode="disabled")

In [5]:
import gc
import os
import sys
import time
import warnings


import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from datasets import Dataset

from scipy.special import softmax

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch import nn

from torch.utils import data
from torchinfo import summary

from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
    EarlyStoppingCallback,
    TrainerCallback,
)

2025-08-03 10:43:16.029411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754217796.385787      66 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754217796.486620      66 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
os.environ["WANDB_DISABLED"] = "true"
warnings.simplefilter("ignore")

In [7]:
logger = create_logger()
training_logger = create_logger(name="model", log_file="training.log")

2025-08-03 10:43:42 | reddit_moderation | INFO | Logger 'reddit_moderation' created successfully
2025-08-03 10:43:42 | reddit_moderation | INFO | Log level: INFO
2025-08-03 10:43:42 | reddit_moderation | INFO | Log file: logs/reddit_moderation_20250803_104342.log
2025-08-03 10:43:42 | model | INFO | Logger 'model' created successfully
2025-08-03 10:43:42 | model | INFO | Log level: INFO
2025-08-03 10:43:42 | model | INFO | Log file: logs/training.log


# load dataset


In [8]:
INPUT_PATH = os.path.join("/", "kaggle", "input")
# INPUT_PATH = os.path.join("..", "data")
train = pd.read_csv(
    os.path.join(INPUT_PATH, "jigsaw-agile-community-rules", "train.csv")
)
test = pd.read_csv(os.path.join(INPUT_PATH, "jigsaw-agile-community-rules", "test.csv"))
submission = pd.read_csv(
    os.path.join(INPUT_PATH, "jigsaw-agile-community-rules", "sample_submission.csv")
)
features = pd.read_csv(os.path.join(INPUT_PATH, "jigsaw", "features.csv"))[
    "features"
].tolist()
subreddits = pd.read_csv(os.path.join(INPUT_PATH, "jigsaw", "subreddits.csv"))[
    "subreddit"
].tolist()

# load models


In [None]:
_MODEL_VERSION_PATH = os.path.join(
    "transformers",
    "default",
    "1",
)
_MODEL_DIR = os.path.join("/", "kaggle", "input")
MODEL_PATH = {
    "classifier": os.path.join(_MODEL_DIR, "roberta-base", _MODEL_VERSION_PATH),
    "nli": os.path.join(
        _MODEL_DIR, "moritzlaurerdeberta-v3-base-mnli-fever-anli", _MODEL_VERSION_PATH
    ),
}

# _MODEL_DIR = os.path.join("..", "model")
# MODEL_PATH = {
#     "classifier-OLD": os.path.join(_MODEL_DIR, "facebookai-roberta-large-mnli"),
#     "nli": os.path.join(
#         _MODEL_DIR,
#         "nli-deberta-v3-small",
#     ),
#     "classifier": os.path.join(_MODEL_DIR, "roberta-base"),
# }
logger.info(MODEL_PATH)

2025-08-03 10:43:42 | reddit_moderation | INFO | {'classifier': '/kaggle/input/roberta-base/transformers/default/1', 'nli': '/kaggle/input/moritzlaurerdeberta-v3-base-mnli-fever-anli/transformers/default/1'}


# training dataset prep


In [None]:
# # make test shorter
# max_length_train = 5_000
# if len(test)>=max_length_train:
#     train, _ = train_test_split(
#         train, train_size=max_length_train,
#         random_state=42,
#         shuffle=True,
#         stratify=train["rule"]
#     )

# max_length_test = 20_000
# if len(test)>=max_length_test:
#     test, _ = train_test_split(
#         test, train_size=max_length_test,
#         random_state=42,
#         shuffle=True,
#         stratify=test["rule"]
#     )

In [11]:
for c in [
    "body",
    "rule",
    "subreddit",
    "positive_example_1",
    "positive_example_2",
    "negative_example_1",
    "negative_example_2",
]:
    logger.info(f"Cleaning {c = }")
    train[c] = train[c].apply(sanitize_comment)
    if c in test.columns:
        test[c] = test[c].apply(sanitize_comment)
master_dataset, val_dataset = create_master_dataset(
    train,
    test,
    logger,
    positive_examples_to_consider=[1],
    negative_example_to_consider=[1],
)
free_vars([train, test], logger=logger)

2025-08-03 10:43:42 | reddit_moderation | INFO | Cleaning c = 'body'
2025-08-03 10:43:43 | reddit_moderation | INFO | Cleaning c = 'rule'
2025-08-03 10:43:44 | reddit_moderation | INFO | Cleaning c = 'subreddit'
2025-08-03 10:43:45 | reddit_moderation | INFO | Cleaning c = 'positive_example_1'
2025-08-03 10:43:46 | reddit_moderation | INFO | Cleaning c = 'positive_example_2'
2025-08-03 10:43:47 | reddit_moderation | INFO | Cleaning c = 'negative_example_1'
2025-08-03 10:43:48 | reddit_moderation | INFO | Cleaning c = 'negative_example_2'
2025-08-03 10:43:49 | reddit_moderation | INFO | Starting master dataset creation
2025-08-03 10:43:49 | reddit_moderation | INFO | Input - Train: 2029 rows, Test: 10 rows
2025-08-03 10:43:49 | reddit_moderation | INFO | Concatenating all dataset parts
2025-08-03 10:43:49 | reddit_moderation | INFO | Master dataset created successfully: 4078 total records
2025-08-03 10:43:49 | reddit_moderation | INFO | Violation distribution: {1: 2039, 0: 2039}
2025-08

# modelling


In [12]:
# basemodel = get_custom_roberta(MODEL_PATH["classifier"])
basemodel = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH["classifier"])
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH["classifier"])
print(f"{tokenizer.pad_token = } | {tokenizer.eos_token = }")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/roberta-base/transformers/default/1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer.pad_token = '<pad>' | tokenizer.eos_token = '</s>'


In [13]:
def _make_prompt(row):
    prompt = f"""Rule: {row['rule']}

Comment: "{row['comment']}"

Question: Does this comment violate the rule?
Answer:"""
    return prompt


master_dataset["prompt"] = master_dataset.apply(_make_prompt, axis=1)
master_dataset = Dataset.from_pandas(master_dataset)

val_dataset["prompt"] = val_dataset.apply(_make_prompt, axis=1)
val_dataset = Dataset.from_pandas(val_dataset)

In [None]:
def preprocess_(batch):
    return tokenizer(batch["prompt"], padding="max_length", truncation=True)


tokenized_dataset = master_dataset.map(preprocess_, batched=True, batch_size=128)
tokenized_val = val_dataset.map(preprocess_, batched=True, batch_size=128)

training_logger.log_dataset_info(tokenized_dataset, "Master Dataset")
training_logger.log_dataset_info(tokenized_val, "Master Dataset")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

columns = ["input_ids", "attention_mask", "violation"]
tokenized_dataset.set_format(type="torch", columns=columns)
tokenized_dataset = tokenized_dataset.rename_column("violation", "label")

tokenized_val.set_format(type="torch", columns=columns)
tokenized_val = tokenized_val.rename_column("violation", "label")

free_vars(["master_dataset", "val_dataset"], namespace=globals(), logger=logger)

Map:   0%|          | 0/4078 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

2025-08-03 10:43:53 | model | INFO | Master Dataset Info:
2025-08-03 10:43:53 | model | INFO |   - Size: 4,078 samples
2025-08-03 10:43:53 | model | INFO |   - Columns: ['comment', 'rule', 'subreddit', 'violation', 'prompt', 'input_ids', 'attention_mask']
2025-08-03 10:43:53 | model | INFO | Master Dataset Info:
2025-08-03 10:43:53 | model | INFO |   - Size: 2,029 samples
2025-08-03 10:43:53 | model | INFO |   - Columns: ['comment', 'rule', 'subreddit', 'violation', 'prompt', 'input_ids', 'attention_mask']
2025-08-03 10:43:53 | reddit_moderation | INFO | Deleted variable 'master_dataset'
2025-08-03 10:43:53 | reddit_moderation | INFO | Deleted variable 'val_dataset'
2025-08-03 10:43:53 | reddit_moderation | INFO | GPU memory freed: 0.00 MB
2025-08-03 10:43:54 | reddit_moderation | INFO | RAM memory freed: 0.00 MB
RAM freed: 0.00 MB (1607.31 -> 1607.31)
GPU allocated freed: 0.00 MB (0.00 -> 0.00)
GPU reserved freed: 0.00 MB (0.00 -> 0.00)


## define metrics


In [None]:
from scipy.special import softmax
from sklearn.metrics import roc_auc_score

previous_auc = 0


def compute_metrics(eval_pred):
    global previous_auc
    logits, labels = eval_pred

    # Compute softmax probabilities and predictions
    probs = softmax(logits, axis=1)
    preds = np.argmax(probs, axis=1)

    # Compute classification metrics
    auc = roc_auc_score(y_true=labels, y_score=probs[:, 1])
    acc = accuracy_score(labels, preds)

    # Compute loss (CrossEntropyLoss expects torch tensors)
    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(torch.tensor(logits), torch.tensor(labels)).item()

    metrics = {"auc": auc, "acc": acc, "loss": loss}
    improvement = auc - previous_auc
    previous_auc = auc
    logger.info(
        f"{auc = :.3f} | {acc*100 = :.1f} | {loss = :.2f} | {improvement = :.4f}"
    )
    return metrics

## finetune


In [None]:
DEVICE = torch.device("cuda")
from datasets import ClassLabel

# 1. Cast "rule" to ClassLabel
#    This infers the unique values in splits["train"]["rule"] and assigns integer IDs.
tokenized_dataset = tokenized_dataset.cast_column(
    "rule", ClassLabel(names=sorted(set(tokenized_dataset["rule"])))
)

# 2. Now do the stratified split
splits = tokenized_dataset.train_test_split(
    test_size=0.4, shuffle=True, seed=42, stratify_by_column="rule"
)

Casting the dataset:   0%|          | 0/4078 [00:00<?, ? examples/s]

In [17]:
os.makedirs("finetuned-roberta-binary", exist_ok=True)
training_args = TrainingArguments(
    # 1. Directory and checkpointing
    output_dir="finetuned-roberta-binary",
    # save_strategy="epoch",  # Save at end of each epoch
    save_total_limit=3,  # Keep only the 3 most recent checkpoints
    # 2. Learning rate and schedule
    learning_rate=2e-5,  # Common sweet spot for base models
    lr_scheduler_type="linear",  # Linear decay after warmup
    warmup_ratio=0.1,  # 10% of total steps for warmup
    # 3. Batch sizes and accumulation
    per_device_train_batch_size=8,  # 8 on 16–32 GB GPU; lower if memory constrained
    per_device_eval_batch_size=16,  # Larger eval batch for speed
    gradient_accumulation_steps=4,  # Achieves ~32 samples/effective batch
    # 4. Epochs and steps
    num_train_epochs=4,  # 3–5 epochs is usually sufficient
    max_steps=-1,  # Use epochs (not absolute steps)
    # 5. Optimizer settings
    weight_decay=0.01,  # Standard to regularize attention heads
    adam_beta1=0.9,
    adam_beta2=0.98,  # Slightly lower than default for stability
    adam_epsilon=1e-8,  # Default epsilon
    # 6. Precision and performance
    fp16=True,  # Mixed precision for speed & memory
    dataloader_pin_memory=True,  # Speed up CPU→GPU data transfer
    # 7. Evaluation & logging
    load_best_model_at_end=True,  # Automatically restore best checkpoint
    metric_for_best_model="auc",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=20,  # Log every 50 batches
    logging_first_step=True,
    report_to="none",  # Disable WandB/MLflow by default
    # 8. Misc
    seed=42,  # For reproducibility
    overwrite_output_dir=False,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=20,
    save_steps=1000,
)

In [None]:
trainer = Trainer(
    model=basemodel,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=1, early_stopping_threshold=0.001
        ),
    ],
)

In [19]:
clean_mem()

RAM freed: 0.00 MB (1615.55 -> 1615.55)
GPU allocated freed: 0.00 MB (476.73 -> 476.73)
GPU reserved freed: 0.00 MB (530.00 -> 530.00)


In [20]:
trainer.train()

Step,Training Loss,Validation Loss,Auc,Acc
20,0.6925,0.687471,0.753526,0.545589
40,0.5807,0.52054,0.824683,0.747166
60,0.2752,0.62423,0.867622,0.78068
80,0.2067,0.517243,0.888644,0.818137
100,0.1546,0.534864,0.896204,0.825037
120,0.101,0.59534,0.897401,0.830951
140,0.0556,0.661762,0.893729,0.827501


2025-08-03 10:45:38 | reddit_moderation | INFO | auc = 0.754 | acc*100 = 54.6 | loss = 0.69 | improvement = 0.7535
2025-08-03 10:47:22 | reddit_moderation | INFO | auc = 0.825 | acc*100 = 74.7 | loss = 0.52 | improvement = 0.0712
2025-08-03 10:49:06 | reddit_moderation | INFO | auc = 0.868 | acc*100 = 78.1 | loss = 0.62 | improvement = 0.0429
2025-08-03 10:50:49 | reddit_moderation | INFO | auc = 0.889 | acc*100 = 81.8 | loss = 0.52 | improvement = 0.0210
2025-08-03 10:52:33 | reddit_moderation | INFO | auc = 0.896 | acc*100 = 82.5 | loss = 0.53 | improvement = 0.0076
2025-08-03 10:54:16 | reddit_moderation | INFO | auc = 0.897 | acc*100 = 83.1 | loss = 0.60 | improvement = 0.0012
2025-08-03 10:55:59 | reddit_moderation | INFO | auc = 0.894 | acc*100 = 82.8 | loss = 0.66 | improvement = -0.0037


TrainOutput(global_step=140, training_loss=0.2951631477900914, metrics={'train_runtime': 717.4823, 'train_samples_per_second': 22.735, 'train_steps_per_second': 0.357, 'total_flos': 2348003058032640.0, 'train_loss': 0.2951631477900914, 'epoch': 2.1882352941176473})

# generate predictions


In [21]:
free_vars(
    vars_to_delete=["tokenized_dataset", "splits"], namespace=globals(), logger=logger
)

2025-08-03 10:55:59 | reddit_moderation | INFO | Deleted variable 'tokenized_dataset'
2025-08-03 10:55:59 | reddit_moderation | INFO | Deleted variable 'splits'
2025-08-03 10:55:59 | reddit_moderation | INFO | GPU memory freed: 0.00 MB
2025-08-03 10:55:59 | reddit_moderation | INFO | RAM memory freed: 0.00 MB
RAM freed: 0.00 MB (2569.43 -> 2569.43)
GPU allocated freed: 0.00 MB (1509.00 -> 1509.00)
GPU reserved freed: 0.00 MB (1882.00 -> 1882.00)


In [22]:
test = pd.read_csv(os.path.join(INPUT_PATH, "jigsaw-agile-community-rules", "test.csv"))
for c in [
    "body",
    "rule",
    "subreddit",
    "positive_example_1",
    "positive_example_2",
    "negative_example_1",
    "negative_example_2",
]:
    logger.info(f"Cleaning {c = }")
    test[c] = test[c].apply(sanitize_comment)
test = test.rename(columns={"body": "comment"})
test["prompt"] = test.apply(_make_prompt, axis=1)
test = Dataset.from_pandas(test)
tokenized_test = test.map(preprocess_, batched=True, batch_size=128)
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask"])
# Generate predictions
logger.info("Starting model predictions on test data")
logger.debug(f"Using trainer with model: {type(trainer.model).__name__}")
predictions = trainer.predict(tokenized_test).predictions
logger.info(f"Predictions generated: shape {predictions.shape}")
logger.debug(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")

# Convert to probabilities
logger.debug("Converting logits to probabilities using softmax")
probs = softmax(predictions, axis=1)[:, 1]  # Get violation probabilities
logger.info(f"Probabilities computed: {len(probs)} samples")
logger.debug(f"Probability range: [{probs.min():.4f}, {probs.max():.4f}]")
logger.debug(f"Mean probability: {probs.mean():.4f}")

# Create submission file
logger.info("Creating submission DataFrame")
sub = pd.DataFrame({"row_id": test["row_id"], "rule_violation": probs})
logger.info(f"Submission DataFrame created: {len(sub)} rows")
logger.debug(f"Row ID range: {sub['row_id'].min()} to {sub['row_id'].max()}")

# Save submission
logger.info("Saving submission to CSV file")
sub.to_csv("submission.csv", index=False)
logger.info("Submission file 'submission.csv' saved successfully")

# Display sample results
logger.info("Displaying sample submission results:")
sample_results = sub.head()
for idx, row in sample_results.iterrows():
    logger.info(
        f"  Row ID {row['row_id']}: Violation probability = {row['rule_violation']:.4f}"
    )

logger.info("Test data processing and prediction pipeline completed successfully")

# Optional: Log summary statistics
logger.info("=== PREDICTION SUMMARY ===")
logger.info(f"Total test samples processed: {len(sub)}")
logger.info(f"High confidence violations (>0.8): {(probs > 0.8).sum()}")
logger.info(
    f"Medium confidence violations (0.5-0.8): {((probs > 0.5) & (probs <= 0.8)).sum()}"
)
logger.info(f"Low violations (<0.5): {(probs <= 0.5).sum()}")
logger.info("=== END SUMMARY ===")

sub.head()

2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'body'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'rule'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'subreddit'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'positive_example_1'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'positive_example_2'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'negative_example_1'
2025-08-03 10:56:00 | reddit_moderation | INFO | Cleaning c = 'negative_example_2'


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

2025-08-03 10:56:00 | reddit_moderation | INFO | Starting model predictions on test data


2025-08-03 10:56:00 | reddit_moderation | INFO | Predictions generated: shape (10, 2)
2025-08-03 10:56:00 | reddit_moderation | INFO | Probabilities computed: 10 samples
2025-08-03 10:56:00 | reddit_moderation | INFO | Creating submission DataFrame
2025-08-03 10:56:00 | reddit_moderation | INFO | Submission DataFrame created: 10 rows
2025-08-03 10:56:00 | reddit_moderation | INFO | Saving submission to CSV file
2025-08-03 10:56:00 | reddit_moderation | INFO | Submission file 'submission.csv' saved successfully
2025-08-03 10:56:00 | reddit_moderation | INFO | Displaying sample submission results:
2025-08-03 10:56:00 | reddit_moderation | INFO |   Row ID 2029.0: Violation probability = 0.0150
2025-08-03 10:56:00 | reddit_moderation | INFO |   Row ID 2030.0: Violation probability = 0.0306
2025-08-03 10:56:00 | reddit_moderation | INFO |   Row ID 2031.0: Violation probability = 0.9949
2025-08-03 10:56:00 | reddit_moderation | INFO |   Row ID 2032.0: Violation probability = 0.9953
2025-08-0

Unnamed: 0,row_id,rule_violation
0,2029,0.014959
1,2030,0.03059
2,2031,0.994935
3,2032,0.99527
4,2033,0.996466
