# Necessary Imports:


In [2]:
# If running for the first time, install required libraries:
%pip install -q datasets transformers evaluate kagglehub python-dotenv hf_xet 'accelerate>=0.26.0'

# Load .env variables (requires python-dotenv)
import os
from dotenv import load_dotenv

load_dotenv()  # looks for a .env in our project root, which should have the token key
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    raise ValueError("Please set HUGGINGFACE_TOKEN in your environment or .env file.")

# Core Imports & Helpers:
import zipfile, shutil, warnings
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from datasets import load_dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate

# Globals
%matplotlib inline
sns.set_style("whitegrid")
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")



Note: you may need to restart the kernel to use updated packages.


# Loading & Pre-processing Data:

In [3]:
# load CoDA once, using the token
dataset = load_dataset("s2w-ai/CoDA", token=HF_TOKEN)
print(dataset)   # train/validation/test splits not yet defined

# convert train split to pandas, save to CSV, reload
df = dataset["train"].to_pandas()
print("Sample rows:\n", df.head())

data_dir = Path("data_coda")
data_dir.mkdir(exist_ok=True)
df.to_csv(data_dir / "train.csv", index=False)
print("Saved train.csv →", data_dir)

# read it back, and use it:
csv_file = next(data_dir.glob("*.csv"))
df_raw = pd.read_csv(csv_file, encoding="latin1")
print(f"Reloaded DataFrame shape: {df_raw.shape}")
df_raw.head(3)


DatasetDict({
    train: Dataset({
        features: ['__key__', '__url__', 'txt'],
        num_rows: 10000
    })
})
Sample rows:
                                              __key__  \
0  coda_dataset/5756-Arms-en-06dd9c0e9b321cb78e64...   
1  coda_dataset/4055-Financial-en-8d378a316398604...   
2  coda_dataset/7429-Financial-en-b41c020ccf19dba...   
3  coda_dataset/583-Others-ru-55aa3ecc54ee4f71c6e...   
4  coda_dataset/7930-Gambling-en-97540ec78721c815...   

                                             __url__  \
0  /Users/blank/.cache/huggingface/hub/datasets--...   
1  /Users/blank/.cache/huggingface/hub/datasets--...   
2  /Users/blank/.cache/huggingface/hub/datasets--...   
3  /Users/blank/.cache/huggingface/hub/datasets--...   
4  /Users/blank/.cache/huggingface/hub/datasets--...   

                                                 txt  
0  search torrents | browse torrents | recent tor...  
1  \n \n \n cc cards / buy western union / best p...  
2  ru\nen\nclub2crd > [en] in

Unnamed: 0,__key__,__url__,txt
0,coda_dataset/5756-Arms-en-06dd9c0e9b321cb78e64...,/Users/blank/.cache/huggingface/hub/datasets--...,search torrents | browse torrents | recent tor...
1,coda_dataset/4055-Financial-en-8d378a316398604...,/Users/blank/.cache/huggingface/hub/datasets--...,\n \n \n cc cards / buy western union / best p...
2,coda_dataset/7429-Financial-en-b41c020ccf19dba...,/Users/blank/.cache/huggingface/hub/datasets--...,ru\nen\nclub2crd > [en] international forum\ns...


In [4]:
# Inline Preprocessing & Label Extraction

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Make a working copy so the original stays intact
df_clean = df_raw.copy()
print(f"Initial rows: {len(df_clean):,}")

# basic text cleaning
df_clean["txt"] = (
    df_clean["txt"]
        .astype(str)                                        # ensure string
        .str.lower()                                        # lowercase
        .str.replace(r'<[^>]+>', ' ', regex=True)           # drop HTML tags
        .str.replace(r'http\S+|www\.\S+', ' ', regex=True)  # remove URLs
        .str.replace(r'[^a-z0-9\s]', ' ', regex=True)       # keep alphanum
        .str.replace(r'\d+', ' ', regex=True)               # remove digits
        .str.replace(r'\s+', ' ', regex=True)               # collapse spaces
        .str.strip()
)

# Token count BEFORE any stop-word filtering
df_clean["tok_before_sw"] = df_clean["txt"].str.count(' ') + 1

empty_after_clean = (df_clean["txt"] == "").sum()
print(f"Rows with empty text after basic cleaning: {empty_after_clean:,}")

# OPTIONAL: stop-word removal 
REMOVE_STOPWORDS = False   # Set it to True if you want to drop them
if REMOVE_STOPWORDS:
    stops = set(ENGLISH_STOP_WORDS)
    df_clean["txt"] = df_clean["txt"].apply(
        lambda s: ' '.join([w for w in s.split() if w not in stops and len(w) > 2])
    )

# Token count AFTER
df_clean["tok_after_sw"] = df_clean["txt"].str.count(' ') + 1

# Drop rows that became empty at any point
pre_drop = len(df_clean)
df_clean = df_clean[df_clean["txt"].str.len() > 0]
print(f"Dropped {pre_drop - len(df_clean):,} empty rows in total.")

# extract label from __key__
df_clean["label"] = df_clean["__key__"].str.split("-", expand=True)[1]

# quick summary statistics
summary = (
    df_clean[["tok_before_sw", "tok_after_sw"]]
    .describe()
    .rename(index={"50%": "median"})
    .round(2)
)
print("\nToken-count summary (before vs after stop-word stage):")
print(summary)

# peek at a few cleaned rows
df_clean[["txt", "label"]].head()


Initial rows: 10,000
Rows with empty text after basic cleaning: 23
Dropped 23 empty rows in total.

Token-count summary (before vs after stop-word stage):
        tok_before_sw  tok_after_sw
count         9977.00       9977.00
mean          1195.56       1195.56
std           2252.40       2252.40
min              1.00          1.00
25%            108.00        108.00
median         381.00        381.00
75%           1059.00       1059.00
max          24258.00      24258.00


Unnamed: 0,txt,label
0,search torrents browse torrents recent torrent...,Arms
1,cc cards buy western union best paypal cards b...,Financial
2,ru en club crd en international forum search a...,Financial
3,av check av check av check ip api av check av ...,Others
4,intel exchange quick dirty read only mirror to...,Gambling


In [5]:
# Split & cast the cleaned data into a Hugging Face DatasetDict
from datasets import Dataset, DatasetDict, ClassLabel

# keep only the fields we need
hf_df = df_clean[["txt", "label"]].reset_index(drop=True)

# convert to a Dataset
full = Dataset.from_pandas(hf_df, preserve_index=False)

# cast labels to a ClassLabel feature
label_names = sorted(full.unique("label"))
full = full.cast_column("label", ClassLabel(names=label_names))

# 60/20/20 stratified split
split1 = full.train_test_split(test_size=0.2, stratify_by_column="label", seed=457)
train_val, test_ds = split1["train"], split1["test"]

split2 = train_val.train_test_split(test_size=0.25, stratify_by_column="label", seed=457)
train_ds, val_ds = split2["train"], split2["test"]

coda = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})
print(coda)


Casting the dataset: 100%|██████████| 9977/9977 [00:00<00:00, 270326.69 examples/s]

DatasetDict({
    train: Dataset({
        features: ['txt', 'label'],
        num_rows: 5985
    })
    validation: Dataset({
        features: ['txt', 'label'],
        num_rows: 1996
    })
    test: Dataset({
        features: ['txt', 'label'],
        num_rows: 1996
    })
})





In [6]:
#Quick Sklearn Baseline

# tiny train/dev slice
small = pd.DataFrame({
    "txt": train_ds["txt"][:2500],
    "label": train_ds["label"][:2500]
})
X_tr, X_dev, y_tr, y_dev = train_test_split(
    small["txt"], small["label"], stratify=small["label"], test_size=0.2, random_state=457
)

# TfidfVectorizer:
from sklearn.feature_extraction.text import TfidfVectorizer
pipe = Pipeline([("tfidf", TfidfVectorizer(max_features=5000)),("clf", LogisticRegression(max_iter=25000))])

pipe.fit(X_tr, y_tr)
print(classification_report(y_dev, pipe.predict(X_dev)))


              precision    recall  f1-score   support

           0       0.96      0.90      0.93        30
           1       0.85      0.72      0.78        39
           2       0.91      0.86      0.88        57
           3       0.94      0.81      0.87        21
           4       0.80      0.82      0.81        50
           5       1.00      0.93      0.96        40
           6       1.00      0.56      0.72        32
           7       0.70      0.96      0.81       142
           8       1.00      0.75      0.85        63
           9       1.00      0.73      0.84        26

    accuracy                           0.84       500
   macro avg       0.92      0.80      0.85       500
weighted avg       0.87      0.84      0.84       500



## Reusable train/eval function
### Using Vanilla Bert as baseline model, to compare to Dark Bert Model along with Roberta

In [7]:
#  Helper: train + test any transformer on the global `coda` data

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np, pandas as pd, random, torch

def compute_metrics_fn(eval_pred):
    """Return accuracy, macro-F1, micro-F1."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_micro": f1_score(labels, preds, average="micro"),
    }

def run_experiment(model_name: str, epochs: int = 3, seed: int = 457):
    """Train & evaluate one model; return dict with eval_* keys."""

    # reproducibility
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

    # tokenise
    tok = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_TOKEN, model_max_length=512)
    tok_ds = coda.map(lambda ex: tok(ex["txt"], truncation=True, padding="max_length"),
                      batched=True).with_format("torch")

    # model
    mdl = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(label_names), use_auth_token=HF_TOKEN
    )

    #  minimal TrainingArguments (no evaluation_strategy/save_strategy)
    args = TrainingArguments(
        output_dir=f"results/{model_name.replace('/','_')}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        learning_rate=5e-5,
        weight_decay=0.01,
        logging_steps=50,
        report_to="none",
        seed=seed,
    )

    # trainer
    trainer = Trainer(
        model=mdl,
        args=args,
        train_dataset=tok_ds["train"],
        eval_dataset=tok_ds["validation"],
        tokenizer=tok,
        compute_metrics=compute_metrics_fn,
    )

    trainer.train()
    res = trainer.evaluate(tok_ds["test"])   # keys are prefixed with "eval_"
    return {k: round(v, 4) for k, v in res.items()}


### Run BERT, RoBERTa, DarkBERT (optional epoch sweep)

In [9]:

#  Main comparison (3 epochs each)

models = {
    "bert-base-uncased": "BERT-base",
    "roberta-base":      "RoBERTa-base",   # keep or remove as desired
    "s2w-ai/DarkBERT":   "DarkBERT",
}

main_results = {}
for mname, label in models.items():
    print(f"\n Training {label} …")
    main_results[label] = run_experiment(mname, epochs=3)

results_df = (
    pd.DataFrame(main_results).T
      .rename(columns={
          "eval_accuracy": "accuracy",
          "eval_f1_macro": "f1_macro",
          "eval_f1_micro": "f1_micro"
      })[["accuracy", "f1_macro", "f1_micro"]]
)

print("\n=== Test-set metrics (3 epochs) ===")



 Training BERT-base …


Map: 100%|██████████| 5985/5985 [00:06<00:00, 883.67 examples/s]
Map: 100%|██████████| 1996/1996 [00:01<00:00, 1032.60 examples/s]
Map: 100%|██████████| 1996/1996 [00:02<00:00, 849.84 examples/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
#  OPTIONAL: sweep epochs to see how the gap changes

epochs_to_try = [3, 5, 8, 10, 12]
sweep_metrics = ["f1_macro", "accuracy"]   # choose the metrics you care about

def sweep(model_name, label):
    table = {}
    for e in epochs_to_try:
        print(f"\n {label} | {e} epochs")
        res = run_experiment(model_name, epochs=e)
        table[e] = {m: res[f"eval_{m}"] for m in sweep_metrics}
    return pd.DataFrame(table).T

# Run sweeps
bert_sweep  = sweep("bert-base-uncased", "BERT-base")
dark_sweep  = sweep("s2w-ai/DarkBERT",  "DarkBERT")

print("\n=== BERT-base: metric vs epochs ===")
display(bert_sweep)

print("\n=== DarkBERT: metric vs epochs ===")
display(dark_sweep)
