In [None]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

def remove_character_from_column(
    df: pd.DataFrame,
    columname: str,
    character_to_remove: str,
    new_character: str = "",
    is_regex: bool = True
) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which a pattern is replaced inside one column.

    Args:
        df: Source frame; it is not modified.
        columname: Name of the column to rewrite.
        character_to_remove: Regex (or literal text when ``is_regex`` is falsy)
            to search for.
        new_character: Replacement text; the default removes the match.
        is_regex: Whether ``character_to_remove`` is interpreted as a regex.

    Returns:
        A new DataFrame whose ``columname`` holds the rewritten strings.

    Raises:
        ValueError: If ``columname`` is not a column of ``df``.
    """
    if columname not in df.columns:
        raise ValueError(f"Column '{columname}' not found in DataFrame. Available: {list(df.columns)}")

    # Missing values become empty strings and every value is coerced to str,
    # so the vectorised string replace never sees NaN or non-text cells.
    as_text = df[columname].fillna("").astype(str)

    result = df.copy()
    result[columname] = as_text.str.replace(
        character_to_remove, new_character, regex=bool(is_regex)
    )
    return result


def replace_characters(df_to_replace, columname='text', character_patterns=None, is_regex=True):
    """
    Clean a text column by applying a sequence of (pattern, replacement) rules.

    Args:
        df_to_replace: DataFrame containing the column to clean.
        columname: Name of the text column.
        character_patterns: Optional list of ``(pattern, replacement)`` tuples,
            applied in order. When ``None`` a built-in set of rules is used.
        is_regex: Forwarded to ``remove_character_from_column`` for every rule.
            The built-in rules are regular expressions, so they only behave as
            intended with the default ``is_regex=True``.

    Returns:
        The DataFrame returned by the last ``remove_character_from_column``
        call (the input itself when the pattern list is empty).

    Raises:
        TypeError: If ``character_patterns`` is given but is not a list.
    """
    if character_patterns is None:
        character_patterns = [
            (r'[\u2000-\u200B\u3000\xa0]', ' '),       # Unicode spaces
            (r'[\u200C\u200D\u2060\uFEFF]', ''),       # Invisible characters
            (r'[\u201C\u201D]', '"'),                  # Curly double quotes
            # Curly single quotes are mostly apostrophes ("don't"); mapping them
            # to a double quote would corrupt contractions.
            (r'[\u2018\u2019]', "'"),
            (r'[\u2022\u2043]', ''),                   # List symbols
            (r'[\U0001F600-\U0001F64F]', ''),          # Emoji
            (r'[\u2026]', '...'),                      # Ellipsis
            # The dot after "www" is escaped so only real "www." prefixes match.
            (r"http\S+|www\.\S+", ""),                 # URL
            # Line breaks and tabs separate words: collapse each run to one
            # space instead of deleting it, which would glue words together.
            (r"[\n\t\r]+", " ")
        ]
    elif not isinstance(character_patterns, list):
        raise TypeError("character_patterns must be a list of tuples")

    for pattern, replacement in character_patterns:
        df_to_replace = remove_character_from_column(
            df_to_replace,
            columname=columname,
            character_to_remove=pattern,
            new_character=replacement,
            is_regex=is_regex
        )

    return df_to_replace


def write_classes_to_file(classes, out_dir: str, filename: str = "classes.txt"):
    """
    Write one class name per line to ``out_dir/filename``.

    The directory is created when it does not exist and an existing file is
    overwritten. Returns the path of the written file.
    """
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, filename)
    with open(target, "w", encoding="utf-8") as handle:
        handle.writelines(f"{name}\n" for name in classes)
    return target

def read_classes_from_file(out_dir: str, filename: str = "classes.txt"):
    """
    Read class names from ``out_dir/filename``, one per line.

    Surrounding whitespace is stripped and blank lines are skipped.
    Returns the names as a list of strings in file order.
    """
    source = os.path.join(out_dir, filename)
    names = []
    with open(source, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names



def _mnds_arrays(df_final: pd.DataFrame):
    """Extract the text array and the int32 multi-hot label matrix from a text/category frame."""
    texts = df_final["text"].astype(str).values
    multi_hot = np.array(df_final["category"].tolist(), dtype=np.int32)
    return texts, multi_hot


def get_and_preprocess_mnds(
    dim: int | None = None,
    cache_dir: str = "./data_mnds",
):
    """
    Load the MN-DS dataset, clean its text and multi-hot encode its categories.

    Steps:
      - reuse the processed JSONL cache in ``cache_dir`` when it is present
        and consistent with its class-name file;
      - otherwise read the raw CSV cache, or download ``textminr/mn-ds`` and
        store it as CSV;
      - clean the text column with ``replace_characters``;
      - binarize ``category_level_1`` + ``category_level_2`` with a
        ``MultiLabelBinarizer``;
      - write the processed JSONL and the class names back to ``cache_dir``.

    Args:
        dim: When given, only the first ``dim`` rows of the train split are
            used and every cache file name carries a ``_{dim}`` suffix.
        cache_dir: Directory holding the cache files (created if missing).

    Returns:
        ``(dataset, classes_names)`` where ``dataset`` is a
        ``tf.data.Dataset`` of ``(text, multi_hot)`` pairs and
        ``classes_names`` is the list of label names, in column order.

    Raises:
        ValueError: If the source data lacks one of the required columns.
    """
    os.makedirs(cache_dir, exist_ok=True)

    suffix = f"_{dim}" if dim is not None else ""
    dataset_path = os.path.join(cache_dir, f"mnds{suffix}.jsonl")
    raw_csv_path = os.path.join(cache_dir, f"mnds{suffix}.csv")
    # The class list depends on which rows were loaded, so it is cached per
    # ``dim`` just like the data. A single shared classes.txt would be
    # overwritten by a run with another ``dim`` and then be paired with the
    # wrong multi-hot columns. For dim=None the name stays "classes.txt".
    classes_filename = f"classes{suffix}.txt"
    classes_path = os.path.join(cache_dir, classes_filename)

    if os.path.exists(dataset_path) and os.path.exists(classes_path):
        cached = pd.read_json(dataset_path, orient="records", lines=True)
        texts, multi_hot = _mnds_arrays(cached)
        classes_names = read_classes_from_file(cache_dir, filename=classes_filename)
        # Only trust the cache when the label matrix and the class list agree;
        # otherwise fall through and rebuild both files.
        if multi_hot.ndim == 2 and multi_hot.shape[1] == len(classes_names):
            dataset = tf.data.Dataset.from_tensor_slices((texts, multi_hot))
            return dataset, classes_names

    if os.path.exists(raw_csv_path):
        df = pd.read_csv(raw_csv_path)
    else:
        split = f"train[:{dim}]" if dim is not None else "train"
        ds = load_dataset("textminr/mn-ds", split=split)
        ds.to_csv(raw_csv_path, index=False)
        df = pd.DataFrame(ds)

    if "content" in df.columns and "text" not in df.columns:
        df = df.rename(columns={"content": "text"})

    needed = ["text", "category_level_1", "category_level_2"]
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"Colonne mancanti: {missing}. Disponibili: {list(df.columns)}")

    df_final = replace_characters(df[needed].copy(), columname="text")

    # One label list per row: both category levels, with empty/missing values dropped.
    label_rows = df_final[["category_level_1", "category_level_2"]].fillna("").values.tolist()
    label_rows = [[el for el in row if str(el).strip() != ""] for row in label_rows]

    mlb = MultiLabelBinarizer()
    category_encoded = mlb.fit_transform(label_rows).astype(np.int32)

    df_final = df_final[["text"]].copy()
    df_final["category"] = list(category_encoded)

    texts, multi_hot = _mnds_arrays(df_final)
    dataset = tf.data.Dataset.from_tensor_slices((texts, multi_hot))

    write_classes_to_file(mlb.classes_, cache_dir, filename=classes_filename)
    df_final.to_json(dataset_path, orient="records", lines=True, force_ascii=False)

    # Plain ``str`` values, matching what the cached path reads back from disk.
    classes_names = [str(c) for c in mlb.classes_]
    return dataset, classes_names


# Split 

def split_dataset(dataset, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42):
    """
    Split a dataset into train, validation and test parts.

    Args:
        dataset: A pandas DataFrame or a ``tf.data.Dataset``.
        test_size: Fraction of the data for the test part (float in [0, 1]).
        val_size: Fraction of the data for the validation part (float in [0, 1]).
        random_state: Non-negative integer seed used for shuffling.

    Returns:
        ``(train, val, test)``, of the same kind as ``dataset``.

    Raises:
        TypeError: If an argument has the wrong type.
        ValueError: If a fraction is out of range, the fractions sum to 1.0 or
            more, or ``random_state`` is negative.
    """
    if not isinstance(dataset, (pd.DataFrame, tf.data.Dataset)):
        raise TypeError("dataset must be either a Pandas DataFrame or a TensorFlow dataset")
    if not isinstance(test_size, float) or not isinstance(val_size, float):
        raise TypeError("test_size and val_size must be floats")
    if not isinstance(random_state, int):
        raise TypeError("random_state must be an integer")
    if test_size < 0.0 or test_size > 1.0:
        raise ValueError("test_size must be between 0.0 and 1.0")
    if val_size < 0.0 or val_size > 1.0:
        raise ValueError("val_size must be between 0.0 and 1.0")
    if random_state < 0:
        # 0 is accepted by the check above, so the requirement is "non-negative".
        raise ValueError("random_state must be a non-negative integer")
    if test_size + val_size >= 1.0:
        raise ValueError("test_size + val_size must be < 1.0")

    # The two helpers take their parameters in different positional orders;
    # keyword arguments keep the dispatch unambiguous.
    if isinstance(dataset, pd.DataFrame):
        return split_df_dataset(
            dataset, random_state=random_state, test_size=test_size, val_size=val_size
        )
    return split_tf_dataset(
        dataset, test_size=test_size, val_size=val_size, random_seed=random_state
    )


def split_df_dataset(dataset, random_state, test_size, val_size):
    """
    Split a DataFrame into train/validation/test with two shuffled splits.

    ``test_size`` and ``val_size`` are fractions of the full dataset. The test
    part is carved out first, then the validation part is taken from what is
    left. Returns ``(df_train, df_val, df_test)``.
    """
    remainder, df_test = train_test_split(
        dataset,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    # val_size refers to the whole dataset, so rescale it to the share of the
    # rows that remain after the test rows were removed.
    val_share_of_remainder = val_size / (1.0 - test_size)

    df_train, df_val = train_test_split(
        remainder,
        test_size=val_share_of_remainder,
        random_state=random_state,
        shuffle=True,
    )
    return df_train, df_val, df_test


def split_tf_dataset(dataset: tf.data.Dataset, test_size=0.1, val_size=0.1, random_seed=42, buffer_size=10000):
    """
    Split a ``tf.data.Dataset`` into train/validation/test parts.

    The dataset is shuffled once with a fixed seed and
    ``reshuffle_each_iteration=False``, then cut with ``take``/``skip``.

    Args:
        dataset: Finite dataset to split.
        test_size: Fraction of elements for the test part.
        val_size: Fraction of elements for the validation part.
        random_seed: Seed passed to ``Dataset.shuffle``.
        buffer_size: Shuffle buffer size.
            NOTE(review): per the tf.data documentation a buffer smaller than
            the dataset gives only a partial shuffle - confirm this is
            acceptable for datasets larger than ``buffer_size``.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``.

    Raises:
        ValueError: If the dataset reports infinite cardinality.
    """
    # Use the statically known size when available; counting with ``reduce``
    # needs a full pass over the data and is only the fallback.
    dataset_size = int(dataset.cardinality().numpy())
    if dataset_size == tf.data.INFINITE_CARDINALITY:
        raise ValueError("cannot split a dataset with infinite cardinality")
    if dataset_size == tf.data.UNKNOWN_CARDINALITY:
        dataset_size = int(dataset.reduce(0, lambda count, _: count + 1).numpy())

    dataset = dataset.shuffle(buffer_size=buffer_size, seed=random_seed, reshuffle_each_iteration=False)

    test_n = int(test_size * dataset_size)
    val_n = int(val_size * dataset_size)
    # Rounding leftovers go to the training part.
    train_n = dataset_size - test_n - val_n

    train_dataset = dataset.take(train_n)
    rest_dataset = dataset.skip(train_n)

    val_dataset = rest_dataset.take(val_n)
    test_dataset = rest_dataset.skip(val_n)

    return train_dataset, val_dataset, test_dataset



# Smoke run: build (or load from cache) a 2000-row subset and split it.
if __name__ == "__main__":
    dataset, classes = get_and_preprocess_mnds(dim=2000, cache_dir="./data_mnds")
    # Number of labels and the first ten label names.
    print("labels:", len(classes), classes[:10])

    # NOTE: train_ds / val_ds / test_ds are reassigned further down in this
    # file to the tokenized PyTorch datasets.
    train_ds, val_ds, test_ds = split_dataset(dataset, test_size=0.1, val_size=0.1)
    print(train_ds, val_ds, test_ds)

In [None]:
import os
import glob

cache_dir = "./data_mnds"

# Remove the processed artefacts (JSONL dumps and the class-name list).
# The glob patterns do not match the raw CSV files, so those are kept.
files = [
    path
    for pattern in ("mnds*.jsonl", "classes.txt")
    for path in glob.glob(os.path.join(cache_dir, pattern))
]

for f in files:
    print("Elimino:", f)
    os.remove(f)

print("Cache pulita.")

In [None]:
import torch
# Report the installed PyTorch version and, when CUDA is usable, the GPU name.
print(torch.__version__)
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    print(torch.cuda.get_device_name(0))

In [None]:
import torch

print(torch.__version__)
print("CUDA disponibile:", torch.cuda.is_available())
# get_device_name(0) raises when no CUDA device is usable, so only query the
# name when CUDA is actually available.
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

In [None]:
import numpy as np
import torch
import transformers
import sklearn

# Print the versions of the main libraries used by this notebook.
for label, module in (
    ("numpy", np),
    ("torch", torch),
    ("transformers", transformers),
    ("sklearn", sklearn),
):
    print(f"{label}:", module.__version__)
print("cuda available:", torch.cuda.is_available())


In [None]:
import torch
print(torch.cuda.is_available())
# Querying the device name without a usable CUDA device raises, so guard it.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)

# Fix the random seed through transformers' set_seed helper.
set_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is reused by later cells (pos_weight tensor, TrainingArguments
# flags, predict_multilabel).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Torch:", torch.__version__)
print("Device:", device)
# The device name is only queried when a CUDA device was selected.
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

In [None]:
# Load the full dataset (dim=None means no row limit) together with the
# label names, reusing the cache in ./data_mnds when present.
dataset, classes_names = get_and_preprocess_mnds(
    dim=None,
    cache_dir="./data_mnds"
)

# One output unit per label name.
NUM_LABELS = len(classes_names)
print("NUM_LABELS:", NUM_LABELS)

# 10% test, 10% validation, the remainder for training; the fixed seed keeps
# the split reproducible.
train_tf, val_tf, test_tf = split_dataset(
    dataset,
    test_size=0.1,
    val_size=0.1,
    random_state=42
)

def tfds_to_numpy(ds, limit=None):
    """
    Materialise a dataset of ``(text, labels)`` pairs in memory.

    Each text element is decoded from UTF-8 bytes and each label vector is
    converted to float32. When ``limit`` is given, at most ``limit`` pairs
    are collected.

    Returns:
        ``(texts, labels)``: a list of ``str`` and the label vectors stacked
        with ``np.stack``.
    """
    decoded_texts = []
    label_rows = []
    for position, (text_tensor, label_tensor) in enumerate(ds):
        limit_reached = limit is not None and position >= limit
        if limit_reached:
            break
        decoded_texts.append(text_tensor.numpy().decode("utf-8"))
        label_rows.append(label_tensor.numpy().astype(np.float32))
    return decoded_texts, np.stack(label_rows)

# Pull every split into memory: Python strings for the tokenizer and float32
# multi-hot label matrices for the loss and the metrics.
train_texts, train_y = tfds_to_numpy(train_tf)
val_texts,   val_y   = tfds_to_numpy(val_tf)
test_texts,  test_y  = tfds_to_numpy(test_tf)

# Sanity check: number of samples per split and the label matrix shape.
print("sizes:", len(train_texts), len(val_texts), len(test_texts))
print("label shape:", train_y.shape)

In [None]:
import numpy as np
import torch

# Per-label counts of positive and negative training samples.
pos = train_y.sum(axis=0)
neg = train_y.shape[0] - pos

# Ratio of negatives to positives per label. The denominator is clamped to 1
# so that a label with no positive training sample gets a weight of at most
# the training-set size; dividing by (pos + 1e-8) would produce a weight of
# about N / 1e-8 for such a label. Labels with at least one positive are
# effectively unaffected by the clamp.
pos_weight_base = (neg / np.maximum(pos, 1.0)).astype(np.float32)

# Global damping factor applied to the raw ratios.
ALPHA = 0.5
pos_weight = ALPHA * pos_weight_base

pos_weight_t = torch.tensor(pos_weight, dtype=torch.float32, device=device)

print("pos_weight stats:",
      pos_weight.min(),
      pos_weight.mean(),
      pos_weight.max())

In [None]:
import torch.nn as nn
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits with optional per-label positive weights."""

    def __init__(self, *args, pos_weight=None, **kwargs):
        """
        Args:
            pos_weight: Optional tensor forwarded to ``nn.BCEWithLogitsLoss``
                as ``pos_weight``. All other arguments go to ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the weighted multi-label loss for one batch.

        The labels are removed from ``inputs`` before the forward pass and the
        loss is computed here from the logits. Extra keyword arguments passed
        by the caller are accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # The weight tensor is created outside the Trainer, so make sure it
        # lives on the same device as the logits before using it.
        pos_weight = self.pos_weight
        if pos_weight is not None:
            pos_weight = pos_weight.to(logits.device)

        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        # BCE needs floating-point targets; this is a no-op for float32 labels.
        loss = loss_fct(logits, labels.float())

        return (loss, outputs) if return_outputs else loss

In [None]:
# Checkpoint name used for both the tokenizer and the model.
MODEL_NAME = "bert-base-uncased"
# Passed to the tokenizer as max_length (longer inputs are truncated).
MAX_LEN = 128
# Used as the per-device train and eval batch size in TrainingArguments.
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
# NOTE(review): EPOCHS is not referenced by the TrainingArguments cell in
# this file, which hard-codes num_train_epochs=5 - confirm which value is
# intended.
EPOCHS = 2

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_all(texts, name):
    """
    Tokenize a list of texts with the notebook-level ``tokenizer``.

    Inputs are truncated to ``MAX_LEN`` tokens and left unpadded. ``name`` is
    only used in the progress messages. Returns the tokenizer output.
    """
    print(f"Tokenizing {name} ({len(texts)}) ...")
    encoded = tokenizer(texts, truncation=True, padding=False, max_length=MAX_LEN)
    print(f"Done {name}")
    return encoded

# Tokenize each split once up front; padding is left to the data collator.
train_enc = tokenize_all(train_texts, "train")
val_enc   = tokenize_all(val_texts, "val")
test_enc  = tokenize_all(test_texts, "test")

In [None]:
class EncodedMultiLabelDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with multi-hot label vectors."""

    def __init__(self, encodings, labels):
        # encodings: mapping from field name to per-sample sequences
        # labels: indexable collection with one label vector per sample
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with float32 ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx], dtype=torch.long)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Wrap the tokenized splits and their label matrices as PyTorch datasets.
train_ds = EncodedMultiLabelDataset(train_enc, train_y)
val_ds   = EncodedMultiLabelDataset(val_enc, val_y)
test_ds  = EncodedMultiLabelDataset(test_enc, test_y)

# The samples were tokenized with padding=False, so padding is applied by
# this collator when batches are assembled.
collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Load the pretrained checkpoint with a classification head of NUM_LABELS
# outputs, configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
)

In [None]:
# Probability cut-off above which a label counts as predicted. It is read at
# call time by compute_metrics and predict_multilabel, and a later cell
# reassigns it from the validation threshold search.
THRESHOLD = 0.105

def sigmoid(x):
    """
    Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Only ``exp`` of non-positive values is evaluated, so large-magnitude
    inputs cannot overflow. Arrays keep their floating dtype; a scalar input
    yields a NumPy scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1]
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as arrays.
    return result[()]

def macro_auc_per_label(y_true, y_score, kind="roc"):
    """
    Average a per-label ranking metric over the label columns.

    ``kind="roc"`` uses ``roc_auc_score``; any other value uses
    ``average_precision_score``. Labels whose ground truth contains fewer
    than two distinct values are skipped, as are labels for which the metric
    call raises. Returns NaN when no label could be scored.
    """
    per_label = []
    n_labels = y_true.shape[1]
    for col in range(n_labels):
        targets = y_true[:, col]
        scores = y_score[:, col]

        # The metric is undefined when only one class is present.
        if np.unique(targets).size < 2:
            continue

        try:
            metric = roc_auc_score if kind == "roc" else average_precision_score
            per_label.append(metric(targets, scores))
        except Exception:
            continue

    if not per_label:
        return float("nan")
    return float(np.mean(per_label))

def compute_metrics(eval_pred):
    """
    Compute multi-label metrics from a ``(logits, labels)`` pair.

    Logits are turned into probabilities with ``sigmoid`` and binarized with
    the module-level ``THRESHOLD`` (read at call time). Returns a dict with
    micro precision and recall, element-wise accuracy, and macro ROC-AUC and
    PR-AUC computed on the probabilities.
    """
    logits, labels = eval_pred
    probs = sigmoid(logits)
    y_true = labels.astype(int)
    preds = (probs >= THRESHOLD).astype(int)

    metrics = {}
    metrics["precision"] = precision_score(y_true, preds, average="micro", zero_division=0)
    metrics["recall"] = recall_score(y_true, preds, average="micro", zero_division=0)
    # Accuracy over every individual (sample, label) decision.
    metrics["binary_accuracy"] = accuracy_score(y_true.reshape(-1), preds.reshape(-1))
    metrics["auc_roc_macro"] = macro_auc_per_label(y_true, probs, kind="roc")
    metrics["auc_pr_macro"] = macro_auc_per_label(y_true, probs, kind="pr")
    return metrics

In [None]:
from transformers import EarlyStoppingCallback

# Training configuration: evaluate and checkpoint once per epoch, keep a
# single checkpoint on disk and reload the best one when training ends.
args = TrainingArguments(
    output_dir="./bert_mnds_pt",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): hard-coded; the EPOCHS constant defined earlier (2) is not used here.
    num_train_epochs=5,               
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    load_best_model_at_end=True,
    # Key produced by compute_metrics; higher is better.
    metric_for_best_model="auc_pr_macro",  
    greater_is_better=True,
    # Mixed precision and pinned memory are only enabled on a CUDA device.
    fp16=(device.type == "cuda"),
    dataloader_pin_memory=(device.type == "cuda"),
    report_to="none",
    save_total_limit=1,
)

# Trainer with the weighted BCE loss, validated on val_ds after each epoch.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    pos_weight=pos_weight_t,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], 
)

In [None]:
trainer.train()

print("\n--- TEST ---")
# evaluate() returns the metrics dict; it is not the last expression of the
# cell, so it must be printed explicitly to appear under the header.
print(trainer.evaluate(test_ds))

# Persist the model together with its tokenizer so that the directory can be
# reloaded on its own.
SAVE_DIR = "./bert_mnds_pt/best_model"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"Model saved in {SAVE_DIR}")


def predict_multilabel(texts, top_k=10):
    """
    Predict labels for raw texts with the notebook-level model and tokenizer.

    For each text, the ``top_k`` most probable labels are considered and only
    those whose probability reaches ``THRESHOLD`` are kept.

    Returns:
        One list per input text of ``(label_name, probability)`` tuples,
        sorted by decreasing probability.
    """
    model.eval()
    model.to(device)

    batch = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**batch).logits
        probs = torch.sigmoid(logits).cpu().numpy()

    results = []
    for row in probs:
        # Indices of the top_k highest probabilities, best first.
        top_indices = np.argsort(-row)[:top_k]
        results.append(
            [(classes_names[i], float(row[i])) for i in top_indices if row[i] >= THRESHOLD]
        )
    return results


# Quick qualitative check on a single hand-written headline. As the last
# expression of the cell, the returned list is what the notebook displays.
predict_multilabel([
    "Breaking news: a major earthquake caused damage and emergency response teams were deployed."
])

In [None]:
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Validation probabilities and integer ground truth.
pred = trainer.predict(val_ds)
probs = 1/(1+np.exp(-pred.predictions))
y_true = pred.label_ids.astype(int)

# The ground truth does not change between thresholds: flatten it once.
y_true_flat = y_true.reshape(-1)

best_t, best_p = None, 0.0

# Among 100 thresholds in [0.05, 0.5], keep the one with the highest
# element-wise precision whose recall is at least 0.6.
for t in np.linspace(0.05, 0.5, 100):
    y_pred = (probs >= t).astype(int)
    y_pred_flat = y_pred.reshape(-1)
    p = precision_score(y_true_flat, y_pred_flat, zero_division=0)
    r = recall_score(y_true_flat, y_pred_flat, zero_division=0)

    if r >= 0.6 and p > best_p:
        best_t, best_p = t, p

print("Best threshold:", best_t)
print("Precision:", best_p)

In [None]:
# best_t stays None when no candidate threshold reached the recall target of
# the search; float(None) would raise, so keep the current THRESHOLD then.
if best_t is not None:
    THRESHOLD = float(best_t)
else:
    print("No threshold reached recall >= 0.6; keeping THRESHOLD =", THRESHOLD)

# compute_metrics reads THRESHOLD at call time, so these evaluations use the
# value chosen above.
print("VAL:", trainer.evaluate(val_ds))
print("TEST:", trainer.evaluate(test_ds))

In [None]:
# Distribution check: minimum, mean and maximum predicted probability on the
# validation set (sigmoid applied to the raw logits).
pred = trainer.predict(val_ds)
probs = 1/(1+np.exp(-pred.predictions))
print(probs.min(), probs.mean(), probs.max())

In [None]:
# Compare how many labels per sample are predicted at the current THRESHOLD
# with how many are actually set in the validation labels.
pred = trainer.predict(val_ds)
probs = 1/(1+np.exp(-pred.predictions))
preds = (probs >= THRESHOLD).astype(int)

print("true avg positives:", val_y.sum(axis=1).mean())   # expected ~2.0 - TODO confirm
print("pred avg positives:", preds.sum(axis=1).mean())
print("prob max:", probs.max())

In [None]:
# Mean number of positive labels per validation sample.
print("true avg positives:", val_y.sum(axis=1).mean())

In [None]:
# Uses `probs` as left in the namespace by a previously executed cell.
print("max prob:", probs.max())
print("95 percentile:", np.percentile(probs, 95))

In [None]:
import numpy as np

# Validation probabilities (sigmoid of the raw logits).
pred = trainer.predict(val_ds)
probs = 1/(1+np.exp(-pred.predictions))

# Reference value: mean number of true labels per validation sample.
true_avg = val_y.sum(axis=1).mean()
print("true avg positives:", true_avg)

# For each candidate threshold, report the mean number of predicted labels
# per sample so it can be compared with the reference above.
for t in [0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]:
    preds = (probs >= t).astype(int)
    avg_pos = preds.sum(axis=1).mean()
    print(f"threshold={t:.2f}  pred avg positives={avg_pos:.2f}")