# 📝 Jigsaw Toxic Comment Classification (Colab Setup)

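This notebook sets up the Colab environment and run configuration, loads the dataset, creates a stratified train/validation split, and computes per-label `pos_weight` values to handle class imbalance.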

In [None]:
# ============================
# 🔹 Colab Startup for Jigsaw Toxic Comment Project
# ============================

# 1. Install required libraries
!pip install -U transformers scikit-learn pandas tqdm

# 2. Import essentials
import os
from dataclasses import dataclass, replace
from typing import Tuple
from google.colab import drive

# 3. Mount Google Drive so checkpoints and artifacts are written to Drive
drive.mount('/content/drive')

# 4. Working locations. DATA_DIR sits outside the mounted Drive path, while
#    checkpoints and artifacts live under MyDrive.
DATA_DIR = "/content/data"                                      # dataset goes here
CKPT_DIR = "/content/drive/MyDrive/jigsaw_checkpoints"          # checkpoints saved here
ARTIFACTS_DIR = "/content/drive/MyDrive/jigsaw_artifacts"       # reports/metrics here

# Create each directory (plus the artifacts "errors" subfolder) when missing.
for _dir in (DATA_DIR, CKPT_DIR, ARTIFACTS_DIR, os.path.join(ARTIFACTS_DIR, "errors")):
    os.makedirs(_dir, exist_ok=True)

print("✅ Directories ready!")
for _caption, _location in (
    ("Data:", DATA_DIR),
    ("Checkpoints:", CKPT_DIR),
    ("Artifacts:", ARTIFACTS_DIR),
):
    print(_caption, _location)

# 5. Full CFG dataclass
@dataclass
class CFG:
    """Run configuration for the toxic-comment classification notebook.

    Groups model/training hyper-parameters, filesystem locations, the random
    seed, the multi-label target column names and the validation-split options.
    The path defaults are relative placeholders that the notebook overrides
    with the Colab directories right after instantiation.
    """

    # --- Model & training hyper-parameters ---
    model_name: str = "roberta-base"
    max_length: int = 256
    train_epochs: int = 5
    train_batch_size: int = 16
    eval_batch_size: int = 32
    lr: float = 2e-5
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    grad_accum_steps: int = 1
    amp: bool = True

    # --- Filesystem locations ---
    data_dir: str = "./data"
    ckpt_dir: str = "./checkpoints"
    artifacts_dir: str = "./artifacts"
    data_file: str = "data.csv"  # CSV file name, resolved relative to data_dir

    # --- Reproducibility ---
    seed: int = 42  # passed as random_state to the train/val split

    # --- Multi-label targets: must match column names in the dataset ---
    labels: Tuple[str, ...] = (
        "toxic", "severe_toxicity", "obscene", "threat",
        "insult", "identity_attack", "sexual_explicit",
    )

    # --- Validation split ---
    val_size: float = 0.1            # fraction of rows held out for validation
    stratify_on_any: bool = True     # stratify the split on the any_toxic flag

# 6. Build the config with the Colab directories in place of the "./..." defaults
cfg = CFG(data_dir=DATA_DIR, ckpt_dir=CKPT_DIR, artifacts_dir=ARTIFACTS_DIR)

print("✅ Config ready!")
print(cfg)


In [None]:
# ============================
# 🔹 Data Load & Sanity Checks
# ============================

import pandas as pd

# Locate the CSV under the configured data directory and stop here if it is absent
path = os.path.join(cfg.data_dir, cfg.data_file)
_dataset_found = os.path.exists(path)
assert _dataset_found, f"Dataset not found at {path}"

df = pd.read_csv(path)

# Quick look: dimensions, the first dozen column names, then the leading rows
print("Shape:", df.shape)
print("Columns:", df.columns[:12].tolist())
df.head()


In [None]:
# ============================
# 🔹 Train/Val Split with Stratification
# ============================

from sklearn.model_selection import train_test_split

# any_toxic flag: 1 when at least one label is positive for the row.
# Labels are thresholded at 0.5, the same cut-off the pos_weight step uses
# when it binarizes non-integer label columns. The earlier `sum > 0` rule
# flagged a row as toxic for any non-zero fractional score (e.g. 0.1), which
# skewed the stratification vector. For labels that already hold 0/1 values
# the two rules give the same flag.
_label_cols = list(cfg.labels)
df["any_toxic"] = (df[_label_cols].astype(float) >= 0.5).any(axis=1).astype(int)

# Stratify on the flag so both splits keep the same toxic/non-toxic ratio.
stratify_vec = df["any_toxic"] if cfg.stratify_on_any else None

train_df, val_df = train_test_split(
    df,
    test_size=cfg.val_size,
    random_state=cfg.seed,
    stratify=stratify_vec,
)
# Fresh 0..n-1 indices so later positional and label-based access agree.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# prevalence helper
def _prevalence(table, labels):
    n = len(table)
    stats = {col: float(table[col].sum())/max(n,1) for col in labels}
    stats["any_toxic"] = float(table["any_toxic"].sum())/max(n,1)
    return stats

# Compute prevalence for each split and collect one report row per split
_frames_by_split = {"train": train_df, "val": val_df}
_prev_by_split = {
    _name: _prevalence(_frame, cfg.labels)
    for _name, _frame in _frames_by_split.items()
}
train_prev = _prev_by_split["train"]
val_prev = _prev_by_split["val"]

prev_df = pd.DataFrame(
    [{"split": _name, **_stats} for _name, _stats in _prev_by_split.items()]
)

# Persist the table next to the other run artifacts
prev_path = os.path.join(cfg.artifacts_dir, "split_prevalence.csv")
prev_df.to_csv(prev_path, index=False)

print(f"Split sizes -> train: {len(train_df)} | val: {len(val_df)}")
print(f"Saved label prevalence to {prev_path}")
prev_df


In [None]:
# ============================
# 🔹 Step 2: Class imbalance handling (pos_weight)
# ============================

import torch  # imported here because the notebook's first cell does not import it

# Ensure binary labels.
# Every label column is thresholded at 0.5 regardless of its dtype. The former
# `dtype != int` guard depended on the platform's default integer width and
# skipped integer columns even when they held values other than 0/1.
# Missing values compare False and therefore become 0.
for _frame in (train_df, val_df):
    for _col in cfg.labels:
        _frame[_col] = (_frame[_col].astype(float) >= 0.5).astype(int)

# Per-label positive / negative row counts on the training split only.
N = len(train_df)
pos_counts = {c: int(train_df[c].sum()) for c in cfg.labels}
neg_counts = {c: int(N - pos_counts[c]) for c in cfg.labels}

# pos_weight = negatives / positives, per label.
# _eps stays in the denominator so weights for labels that do have positives
# are numerically identical to what this cell produced before.
_eps = 1e-6
pos_weight = {}
for _label in cfg.labels:
    if pos_counts[_label] > 0:
        pos_weight[_label] = neg_counts[_label] / (pos_counts[_label] + _eps)
    else:
        # With zero positives the ratio would be neg / eps (about 1e6 * N), a
        # meaningless weight. Use the neutral value 1.0 and make it visible.
        pos_weight[_label] = 1.0
        print(f"⚠️ Label '{_label}' has no positive training rows; using pos_weight=1.0")

posw_df = pd.DataFrame([
    {"label": c, "train_pos": pos_counts[c], "train_neg": neg_counts[c], "pos_weight": pos_weight[c]}
    for c in cfg.labels
])
posw_path = os.path.join(cfg.artifacts_dir, "pos_weight.csv")
posw_df.to_csv(posw_path, index=False)
print(f"Saved pos_weight table to {posw_path}")

# Tensor ordered like cfg.labels, one weight per label.
pos_weight_tensor = torch.tensor([pos_weight[c] for c in cfg.labels], dtype=torch.float32)
print("pos_weight_tensor:", pos_weight_tensor)

posw_df
