In [None]:
# Force-reinstall transformers at the pinned version 4.52.4 so the notebook runs against a known release.
%pip install --upgrade --force-reinstall transformers==4.52.4

In [None]:
# Install the remaining dependencies quietly (-q). NOTE(review): none of these are version-pinned.
%pip install -q transformers datasets scikit-learn pandas accelerate

In [None]:
from transformers import TrainingArguments

In [None]:
# Build a TrainingArguments with only output_dir set; presumably a smoke test that the
# freshly installed transformers initializes correctly — TODO confirm it is still needed.
args = TrainingArguments(output_dir="./results")
# print(args)


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report

In [None]:
!pip install openpyxl


In [None]:
# Colab-only step: ask the user to upload file(s) and keep the helper's return value in `uploaded`.
from google.colab import files
uploaded=files.upload()

In [None]:
# Load the training data; the relative path means the CSV must be in the current working directory
# (presumably the file uploaded in the previous cell — confirm).
df=pd.read_csv("faza_1_training_dataset_sa_comma.csv")

In [None]:
# Show the loaded frame as the cell output for a quick visual check.
df


In [None]:
# Keep only the three columns used below: question text, answer, and the raw comma-separated category string.
df=df[["questionText", "questionAnswer","category"]]

In [None]:
# Trim surrounding whitespace in every object-dtype (text) column.
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

# Rows without a category cannot be labelled. Drop them before parsing, because a
# missing value is a float NaN and calling .split() on it raises AttributeError.
df = df.dropna(subset=["category"]).copy()

# Turn "a, b, c" into ["a", "b", "c"]. Empty tokens (e.g. from a trailing comma or an
# empty cell) are skipped so they do not become a bogus empty-string class.
df['labels'] = df['category'].apply(
    lambda x: [c.strip() for c in str(x).split(',') if c.strip()]
)

# A row whose category contained no usable token has no label to learn from.
df = df[df['labels'].map(len) > 0].copy()

In [None]:
# Remove every row that has a missing value in the question, the answer, or the
# labels column, then renumber the surviving rows from 0.
df = (
    df.dropna(subset=["questionText", "questionAnswer", "labels"])
      .reset_index(drop=True)
)

In [None]:
import numpy as np

# Answer clean-up in one pipeline:
#   1. treat the literal "#NAME?" as a missing value,
#   2. drop rows whose answer is missing,
#   3. drop rows whose answer is empty / whitespace-only,
#   4. renumber the remaining rows from 0.
df = (
    df.assign(questionAnswer=df["questionAnswer"].replace("#NAME?", np.nan))
      .dropna(subset=["questionAnswer"])
      .loc[lambda frame: frame["questionAnswer"].astype(str).str.strip().ne("")]
      .reset_index(drop=True)
)

In [None]:
# From here on only the two text inputs and the parsed label lists are needed; the raw category column is dropped.
df=df[["questionText", "questionAnswer","labels"]]

In [None]:
!pip install iterative-stratification


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Renumber rows 0..n-1 so positional and label-based indexing agree below.
df = df.reset_index(drop=True)

# Normalize the answer (trim, collapse whitespace runs to one space, lowercase)
# so variants such as "Nein" / "nein" end up in the same group.
df["questionAnswer_norm"] = (
    df["questionAnswer"]
      .astype(str)
      .str.strip()
      .str.replace(r"\s+", " ", regex=True)
      .str.lower()
)

# 1) Create a group id for each unique (questionText, normalized answer) pair.
#    The id is a row hash over the two columns (index excluded), cast to int64.
df["pair_id"] = pd.util.hash_pandas_object(
    df[["questionText", "questionAnswer_norm"]],
    index=False
).astype("int64")

# Name of the column that identifies a group in the splitting code below.
group_col = "pair_id"

# 2) Build ONE label list per group: the sorted union of all labels carried by
#    the rows that share a pair_id.
grouped = (
    df.groupby(group_col)["labels"]
      .apply(lambda s: sorted(set(l for labs in s for l in labs)))
      .reset_index()
)

# Parallel views over the groups: ids and their label lists, in the same row order.
groups = grouped[group_col].values
group_labels_list = grouped["labels"]

# 3) Multi-hot matrix (one row per group) used only to stratify the group split.
mlb_groups = MultiLabelBinarizer()
Y_groups = mlb_groups.fit_transform(group_labels_list)

# --- Split groups: trainval vs test (10%) ---
# Whole groups are split (not rows), so rows sharing a pair_id land in the same split.
# np.zeros(...) is only a placeholder feature array of the right length; the label
# matrix is what is passed for stratification.
msss1 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
trainval_g_idx, test_g_idx = next(msss1.split(np.zeros(len(groups)), Y_groups))

trainval_groups = set(groups[trainval_g_idx])
test_groups     = set(groups[test_g_idx])

# Map the group-level assignment back to the individual rows.
df_trainval = df[df[group_col].isin(trainval_groups)].copy()
df_test     = df[df[group_col].isin(test_groups)].copy()

# --- Split trainval groups: train vs val (val = 10% total => 0.111111 of trainval) ---
grouped_trainval = grouped[grouped[group_col].isin(trainval_groups)].reset_index(drop=True)
groups_tv = grouped_trainval[group_col].values
labels_tv = grouped_trainval["labels"]

# A separate binarizer is fitted on the trainval groups only for this second stratification.
mlb_tv = MultiLabelBinarizer()
Y_tv = mlb_tv.fit_transform(labels_tv)

msss2 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.111111, random_state=42)
train_g_idx, val_g_idx = next(msss2.split(np.zeros(len(groups_tv)), Y_tv))

train_groups = set(groups_tv[train_g_idx])
val_groups   = set(groups_tv[val_g_idx])

# Row-level train / validation frames.
df_train = df_trainval[df_trainval[group_col].isin(train_groups)].copy()
df_val   = df_trainval[df_trainval[group_col].isin(val_groups)].copy()

# --- 4) Build X: the two text columns fed to the tokenizer as a sentence pair ---
X_train = df_train[["questionText", "questionAnswer"]]
X_val   = df_val[["questionText", "questionAnswer"]]
X_test  = df_test[["questionText", "questionAnswer"]]

# --- 5) Fit MultiLabelBinarizer on TRAIN ONLY, then reuse it for val/test ---
# NOTE(review): labels that occur only in val/test are not in mlb.classes_; presumably
# transform() ignores them (with a warning) — confirm. The last print below lists them.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train["labels"])
y_val   = mlb.transform(df_val["labels"])
y_test  = mlb.transform(df_test["labels"])

# Wrap the multi-hot arrays as DataFrames: columns are the label names (in mlb.classes_
# order) and the index matches the corresponding split frame.
y_train = pd.DataFrame(y_train, columns=mlb.classes_, index=df_train.index)
y_val   = pd.DataFrame(y_val,   columns=mlb.classes_, index=df_val.index)
y_test  = pd.DataFrame(y_test,  columns=mlb.classes_, index=df_test.index)

# Sanity report: split sizes, number of label columns, and labels absent from train.
print(len(df_train), len(df_val), len(df_test))
print("Num labels:", len(mlb.classes_))
print("Labels missing from train:", set().union(*df["labels"]) - set().union(*df_train["labels"]))


In [None]:
# 4. Tokenization (Option B: German model)
from transformers import AutoTokenizer
import torch

# Checkpoint name used to load the tokenizer. The base variant is kept as a commented-out alternative.
# checkpoint = "deepset/gbert-base"   # German BERT (recommended for German data)
checkpoint = "deepset/gbert-large"   # German BERT (recommended for German data)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_texts(text_a, text_b, labels):
    """Tokenize sentence pairs and attach their multi-hot labels.

    Args:
        text_a: object with ``.tolist()`` yielding the first segment of each pair.
        text_b: object with ``.tolist()`` yielding the second segment of each pair.
        labels: array-like label matrix; converted to a float32 tensor.

    Returns:
        The tokenizer output (PyTorch tensors, truncated to at most 512 tokens and
        padded) with an extra ``"labels"`` entry.
    """
    first_segments = text_a.tolist()
    second_segments = text_b.tolist()

    batch = tokenizer(
        text=first_segments,
        text_pair=second_segments,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )

    # Float targets, because the model is trained with a BCE-style multi-label loss.
    batch["labels"] = torch.tensor(labels, dtype=torch.float32)
    return batch

# Encode the train, validation and test splits (in that order) with the same routine.
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(
        features["questionText"],
        features["questionAnswer"],
        targets.values,
    )
    for features, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)


In [None]:
class SurveyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of equally long, indexable columns.

    ``encodings`` is stored as-is; item ``idx`` is a dict holding the ``idx``-th
    entry of every column, and the length is that of the ``'input_ids'`` column.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = column[idx]
        return sample

# One dataset wrapper per split, built in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    SurveyDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)

In [None]:
from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained(
#     "deepset/gbert-base",
#     num_labels=y_train.shape[1],
#     problem_type="multi_label_classification"
# )
from transformers import AutoModelForSequenceClassification

# Load the classifier from the SAME checkpoint variable the tokenizer was loaded from,
# so switching model size in one place can never leave tokenizer and model mismatched.
# One output per label column; "multi_label_classification" marks the task as multi-label.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=y_train.shape[1],
    problem_type="multi_label_classification"
)


In [None]:
# Freeze everything, then unfreeze the top k encoder layers + pooler + classifier
# Works with BertForSequenceClassification / AutoModelForSequenceClassification (BERT-style)

def _count_bert_encoder_layers(model):
    """Return the number of encoder layers of ``model`` (0 if none can be found)."""
    # Preferred source: a config object exposing the depth directly.
    configured = getattr(getattr(model, "config", None), "num_hidden_layers", None)
    if isinstance(configured, int) and configured > 0:
        return configured

    # Fallback: highest "<...>encoder.layer.<i>.<...>" index among the parameter names.
    highest = -1
    for name, _ in model.named_parameters():
        _, marker, rest = name.partition("encoder.layer.")
        index = rest.split(".", 1)[0]
        if marker and index.isdigit():
            highest = max(highest, int(index))
    return highest + 1


def freeze_all_then_unfreeze_top_k_bert(model, k=12):
    """Freeze every parameter, then re-enable the last ``k`` encoder layers, pooler and classifier.

    The encoder depth is read from the model itself instead of being assumed to be 12.
    With a hard-coded 12, a 24-layer model and ``k=12`` would unfreeze layers 0..11
    (the bottom half) and leave the top layers frozen.

    Args:
        model: module whose parameter names contain ``encoder.layer.<i>.`` for encoder
            layers, ``pooler`` for the pooler and ``classifier`` for the head.
        k: number of encoder layers, counted from the top, to make trainable.
            Values larger than the depth unfreeze all layers; values <= 0 unfreeze none.

    Returns:
        None. Mutates ``requires_grad`` in place and prints a parameter summary.
    """
    # 1) freeze all params
    for p in model.parameters():
        p.requires_grad = False

    # 2) pick the top-k encoder layers relative to the model's real depth
    num_layers = _count_bert_encoder_layers(model)
    first_trainable = max(num_layers - k, 0)
    # The trailing dot keeps "layer.1." from also matching "layer.11." etc.
    layer_markers = tuple(f"encoder.layer.{i}." for i in range(first_trainable, num_layers))

    for name, p in model.named_parameters():
        if (
            any(marker in name for marker in layer_markers)  # selected encoder layers
            or "pooler" in name                               # pooler
            or "classifier" in name                           # classification head
        ):
            p.requires_grad = True

    # 3) print trainable params
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Unfroze top {k} layers + pooler + classifier")
    print(f"Trainable params: {trainable:,} / {total:,} ({trainable/total:.2%})")

# --- use it ---
# Apply the freezing policy to the loaded model, requesting 12 trainable encoder layers.
freeze_all_then_unfreeze_top_k_bert(model, k=12)


In [None]:
import numpy as np
import torch

# Per-label positive-class weights for the BCE loss: weight = (#negatives / #positives).
# y_train may be a DataFrame or a plain array of shape (n_samples, n_labels) with values {0,1}.
Y = getattr(y_train, "values", y_train)

pos = Y.sum(axis=0)     # positives per label
neg = len(Y) - pos      # negatives per label

# Floor the denominator at 1 so a label without any positive cannot divide by zero.
pos_weight = neg / np.maximum(pos, 1)

pos_weight_t = torch.tensor(pos_weight, dtype=torch.float32)
print("pos_weight shape:", pos_weight_t.shape)  # (num_labels,)
print(
    "pos_weight min/median/max:",
    pos_weight_t.min().item(),
    np.median(pos_weight),
    pos_weight_t.max().item(),
)


In [None]:
from transformers import Trainer
import torch
import torch.nn as nn

class WeightedMultilabelTrainer(Trainer):
    """Trainer whose loss is ``BCEWithLogitsLoss`` with an optional per-label ``pos_weight``.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        pos_weight: optional tensor of shape (num_labels,) weighting the positive
            class of each label; ``None`` gives the unweighted loss.
    """

    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (weighted) multi-label BCE loss for one batch.

        ``"labels"`` is removed from ``inputs`` before the forward pass, so the model
        itself does not compute a loss. Extra keyword arguments are accepted and ignored.

        Returns:
            ``loss`` or, when ``return_outputs`` is true, ``(loss, outputs)``.
        """
        targets = inputs.pop("labels").float()  # BCE needs float targets
        outputs = model(**inputs)
        logits = outputs.logits

        # The weight tensor must live on the same device as the logits.
        weight = None if self.pos_weight is None else self.pos_weight.to(logits.device)
        loss = nn.BCEWithLogitsLoss(pos_weight=weight)(logits, targets)

        if return_outputs:
            return loss, outputs
        return loss


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The direct formula evaluates ``exp(-x)``, which overflows (RuntimeWarning) for
    large negative ``x``. Here only ``exp(-|x|)`` is evaluated, which always lies
    in (0, 1].

    Args:
        x: scalar or array-like of logits.

    Returns:
        Probabilities with the same shape as ``x`` (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — the same value, overflow-free.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return out[()]  # unwrap 0-d results to a scalar; arrays pass through unchanged

def compute_metrics(eval_pred, threshold=0.5, force_top1=False):
    """Multi-label metrics at a single fixed probability threshold.

    Args:
        eval_pred: pair ``(logits, y_true)``, both of shape (N, C).
        threshold: a label is predicted when its sigmoid probability is >= this value.
        force_top1: when true, samples with no predicted label get their single
            highest-probability label assigned.

    Returns:
        Dict with micro/macro F1, micro precision and recall, the fraction of samples
        with at least one predicted label, and the mean number of predicted labels.
    """
    logits, y_true = eval_pred
    probs = sigmoid(logits)
    y_pred = (probs >= threshold).astype(int)

    if force_top1:
        # Rows with no label above the threshold fall back to their arg-max label.
        empty_rows = np.flatnonzero(y_pred.sum(axis=1) == 0)
        if empty_rows.size:
            y_pred[empty_rows] = 0
            y_pred[empty_rows, probs[empty_rows].argmax(axis=1)] = 1

    scores = {
        "f1_micro": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "precision_micro": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "recall_micro": recall_score(y_true, y_pred, average="micro", zero_division=0),
    }

    # Behaviour metrics: how often and how many labels are predicted per sample.
    labels_per_sample = y_pred.sum(axis=1)
    scores["coverage_at_least_one"] = float(np.mean(labels_per_sample > 0))
    scores["avg_pred_labels"] = float(np.mean(labels_per_sample))
    return scores


In [None]:
from transformers import TrainingArguments, EarlyStoppingCallback

# Training configuration: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, and reload the checkpoint with the highest validation f1_micro at the end.
training_args = TrainingArguments(
    output_dir="./output_model",

    eval_strategy="epoch",         # argument name used by the pinned transformers version
    save_strategy="epoch",

    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,

    per_device_train_batch_size=16,   # presumably sized for an H100 — lower it on smaller GPUs
    per_device_eval_batch_size=32,
    num_train_epochs=30,               # upper bound; early stopping may end training sooner

    bf16=True,                        # bf16 mixed precision; NOTE(review): requires hardware with bf16 support
    dataloader_pin_memory=True,       # fine on GPU
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,

    report_to=[],                     # empty list: no external experiment trackers
)


# Build the trainer with the class-weighted BCE loss.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer in the
# pinned transformers release; it is forwarded to Trainer.__init__ via **kwargs.
trainer = WeightedMultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,  # fixed-threshold (0.5) metrics during training
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # stop after 3 evaluations without improvement
    pos_weight=pos_weight_t,          # per-label positive-class weights computed from y_train
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Report the checkpoint path and metric value that the trainer state recorded as best.
print("Best checkpoint:", trainer.state.best_model_checkpoint)
print("Best metric:", trainer.state.best_metric)


In [None]:
# Print the most recent checkpoint found under the training output directory.
from transformers.trainer_utils import get_last_checkpoint
print("Last checkpoint:", get_last_checkpoint("./output_model"))


In [None]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The direct formula evaluates ``exp(-x)``, which overflows (RuntimeWarning) for
    large negative ``x``. Here only ``exp(-|x|)`` is evaluated, which always lies
    in (0, 1].

    Args:
        x: scalar or array-like of logits.

    Returns:
        Probabilities with the same shape as ``x`` (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — the same value, overflow-free.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return out[()]  # unwrap 0-d results to a scalar; arrays pass through unchanged

def find_best_thresholds(logits, y_true, thresholds=np.arange(0.1, 1.00, 0.01)):
    """Pick, for every label, the probability threshold that maximizes its F1.

    A label keeps the neutral default of 0.5 unless some candidate reaches a strictly
    positive F1. Starting the search at -1.0 instead would let an all-zero F1 select
    the first (lowest) candidate, e.g. for labels with no positive example in
    ``y_true``, and such a low threshold over-predicts the label later.

    Args:
        logits: array of shape (N, C) with raw model outputs.
        y_true: array of shape (N, C) with 0/1 ground truth.
        thresholds: candidate thresholds, tried in order; ties keep the earliest.

    Returns:
        Float array of shape (C,) with one threshold per label.
    """
    probs = sigmoid(logits)
    num_labels = y_true.shape[1]
    best_thresholds = np.full(num_labels, 0.5, dtype=float)

    for i in range(num_labels):
        yt = y_true[:, i]
        pr = probs[:, i]

        # Without any positive example every candidate scores F1 == 0: keep 0.5.
        if not np.any(yt):
            continue

        best_f1 = 0.0
        for t in thresholds:
            f1 = f1_score(yt, (pr >= t).astype(int), zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresholds[i] = t

    return best_thresholds

def eval_with_thresholds(logits, y_true, thresholds):
    """Multi-label metrics using the given decision threshold(s).

    Args:
        logits: array of shape (N, C) with raw model outputs.
        y_true: array of shape (N, C) with 0/1 ground truth.
        thresholds: scalar or per-label array compared against the sigmoid probabilities.

    Returns:
        Dict with micro/macro F1, micro precision and recall, the fraction of samples
        with at least one predicted label, and the mean number of predicted labels.
    """
    y_pred = (sigmoid(logits) >= thresholds).astype(int)

    scores = {
        "f1_micro": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1_macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "precision_micro": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "recall_micro": recall_score(y_true, y_pred, average="micro", zero_division=0),
    }

    # Behaviour metrics: how often and how many labels are predicted per sample.
    labels_per_sample = y_pred.sum(axis=1)
    scores["coverage_at_least_one"] = float(np.mean(labels_per_sample > 0))
    scores["avg_pred_labels"] = float(np.mean(labels_per_sample))
    return scores

# Tune per-label thresholds on the VALIDATION predictions only (done once).
val_pred = trainer.predict(val_dataset)
best_thresholds = find_best_thresholds(val_pred.predictions, val_pred.label_ids)

# Evaluate TEST with those thresholds held fixed, so the test split does not influence them.
test_pred = trainer.predict(test_dataset)
metrics_test = eval_with_thresholds(test_pred.predictions, test_pred.label_ids, best_thresholds)
print(metrics_test)


In [None]:
import os
import json
import shutil
import numpy as np

# ===== Where to save =====
# Create the export directory if it does not exist yet.
local_dir = "./my_trained_model"
os.makedirs(local_dir, exist_ok=True)

# ===== 0) Recover the true label order used in training =====
# (y_train was created as a DataFrame with columns=mlb.classes_, so its column
# order is the order of the model's output units.)
label_names = list(y_train.columns)

# Safety checks: model head, label names and tuned thresholds must all have the same length.
assert trainer.model.config.num_labels == len(label_names), (
    f"Mismatch: model has {trainer.model.config.num_labels} labels, "
    f"but y_train has {len(label_names)} columns"
)
assert len(best_thresholds) == len(label_names), (
    f"Mismatch: best_thresholds has {len(best_thresholds)} values, "
    f"but there are {len(label_names)} labels"
)

# ===== 1) Write label mapping into config BEFORE saving =====
# Doing this first means the mapping is part of the config that gets saved below.
trainer.model.config.id2label = {i: n for i, n in enumerate(label_names)}
trainer.model.config.label2id = {n: i for i, n in enumerate(label_names)}
trainer.model.config.problem_type = "multi_label_classification"

# ===== 2) Save model + tokenizer + training args =====
# (load_best_model_at_end=True is set in the training arguments, so the trainer's
# model is presumably the best checkpoint at this point — confirm after training.)
trainer.save_model(local_dir)
tokenizer.save_pretrained(local_dir)

with open(os.path.join(local_dir, "training_args.json"), "w", encoding="utf-8") as f:
    f.write(trainer.args.to_json_string())

# ===== 3) Save label names (same order as the model outputs) =====
# ensure_ascii=False keeps non-ASCII label text readable in the JSON file.
with open(os.path.join(local_dir, "label_names.json"), "w", encoding="utf-8") as f:
    json.dump(label_names, f, indent=2, ensure_ascii=False)

# ===== 4) Save thresholds =====
# Convert to a plain list of Python floats so the values are JSON-serializable.
best_thresholds_list = np.asarray(best_thresholds, dtype=float).tolist()
with open(os.path.join(local_dir, "best_thresholds.json"), "w", encoding="utf-8") as f:
    json.dump(best_thresholds_list, f, indent=2)

# ===== 5) Save metadata =====
# Note: "num_train_epochs" stores trainer.state.epoch (the epoch reached), not the configured maximum.
metadata = {
    "metric_for_best_model": trainer.args.metric_for_best_model,
    "best_metric": trainer.state.best_metric,
    "global_step": trainer.state.global_step,
    "num_train_epochs": trainer.state.epoch,
}
with open(os.path.join(local_dir, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

# Full training/evaluation log as recorded by the trainer state.
with open(os.path.join(local_dir, "log_history.json"), "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

# ===== 6) Zip + download =====
# Archive the export directory as my_trained_model.zip in the working directory.
zip_path = shutil.make_archive("my_trained_model", "zip", local_dir)
print(f"📦 Zipped to: {zip_path}")

# Only a missing google.colab package means "not running in Colab". A failure of the
# download itself is reported as such instead of being mislabelled by a blanket except.
try:
    from google.colab import files
except ImportError:
    print("✅ Not running in Colab — zip saved locally.")
else:
    try:
        files.download(zip_path)
    except Exception as err:
        # Best effort: the archive already exists on disk, so do not abort the notebook.
        print(f"Download failed ({err!r}); the zip is still available at: {zip_path}")


In [None]:
# true_avg = test_pred.label_ids.sum(axis=1).mean()
# pred_avg = (sigmoid(test_pred.predictions) >= best_thresholds).sum(axis=1).mean()
# print(true_avg, pred_avg)


In [None]:
# Export the held-out test rows (without the index column) and trigger a Colab download of the file.
df_test.to_csv("df_test.csv", index=False)

# Colab-only helper; this import fails outside Colab.
from google.colab import files
files.download("df_test.csv")


In [None]:
# trainer.train(resume_from_checkpoint=True)
# print("Best checkpoint:", trainer.state.best_model_checkpoint)
# print("Best metric:", trainer.state.best_metric)
