In [None]:
# Pin transformers to an exact release; --force-reinstall reinstalls it even
# when a copy is already present in the environment.
# NOTE(review): on Colab a runtime restart may be needed after this cell before
# the newly installed version is the one that gets imported — confirm.
%pip install --upgrade --force-reinstall transformers==4.52.4

In [None]:
# Remaining libraries for this notebook; -q keeps pip's output short.
# NOTE(review): `transformers` is listed again here without a version —
# presumably pip keeps the 4.52.4 pin from the previous cell because the
# requirement is already satisfied; confirm.
%pip install -q transformers datasets scikit-learn pandas accelerate

In [None]:
from transformers import TrainingArguments

In [None]:
# Build a TrainingArguments object with only an output directory set.
# NOTE(review): `berargs` is not referenced by any later cell visible here —
# presumably a smoke check that the pinned transformers install imports and
# constructs correctly; confirm before removing.
berargs = TrainingArguments(output_dir="./results")
# print(berargs)


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report

In [None]:
!pip install openpyxl

In [None]:
# Colab-only: prompt for a file upload, then load the survey data.
from google.colab import files
uploaded = files.upload()
# NOTE(review): the CSV is read by a hard-coded name from the working
# directory, so the uploaded file must be called exactly "df_combined.csv";
# the `uploaded` mapping itself is not consulted.
df = pd.read_csv("df_combined.csv")
# Preview the first rows as a sanity check.
df.head()

In [None]:
# 3. Separate model inputs from targets, then hold out 10% for evaluation.
# Inputs are the question/answer text pair; every column other than the two
# text columns and "category_type" is used as a label column.
X = df[["questionText", "answer"]]
y = df.drop(columns=["questionText", "category_type", "answer"])

# Fixed random_state so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# 4. Tokenization
# Load the tokenizer for the multilingual cased BERT checkpoint; the same
# checkpoint name is used when the model is loaded so vocabulary and weights match.
# (Alternative checkpoint noted by the author: bert-base-german-cased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_texts(text_a, text_b, labels, max_length=512):
    """Tokenize paired texts and attach float multi-label targets.

    Args:
        text_a: First text of every pair; any object exposing ``.tolist()``
            (e.g. a pandas Series or NumPy array).
        text_b: Second text of every pair, same length as ``text_a``.
        labels: Array-like of shape (n_samples, n_labels) holding numeric or
            boolean label indicators.
        max_length: Maximum number of tokens per encoded pair; longer pairs
            are truncated. Defaults to 512, the value previously hard-coded.

    Returns:
        The tokenizer's encoding (PyTorch tensors, padded to the longest
        sequence of this call) with an extra ``"labels"`` float32 tensor.
    """

    def _as_clean_strings(values):
        # Cells loaded from CSV can be missing (NaN/None/pd.NA) or non-string
        # (e.g. an answer parsed as a number); the tokenizer rejects both, so
        # map missing cells to "" and coerce everything else to str.
        return ["" if pd.isna(value) else str(value) for value in values.tolist()]

    encodings = tokenizer(
        text=_as_clean_strings(text_a),
        text_pair=_as_clean_strings(text_b),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Go through NumPy first: DataFrame.values is object-dtype when label
    # columns mix bool and int, and torch.tensor cannot convert object arrays.
    encodings["labels"] = torch.tensor(np.asarray(labels, dtype=np.float32))
    return encodings

# Encode each split once up front: question as the first segment, answer as
# the second, and the label matrix taken from the matching rows of y.
train_encodings = tokenize_texts(X_train["questionText"], X_train["answer"], y_train.values)
test_encodings = tokenize_texts(X_test["questionText"], X_test["answer"], y_test.values)


In [None]:
class SurveyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of pre-computed encodings.

    ``encodings`` maps field names (it must include ``"input_ids"``) to
    indexable containers with one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the "input_ids" field.
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one example by picking position `idx` out of every field.
        example = {}
        for field, column in self.encodings.items():
            example[field] = column[idx]
        return example

# Wrap each split's encodings so individual examples can be fetched by index.
train_dataset = SurveyDataset(train_encodings)
test_dataset = SurveyDataset(test_encodings)

In [None]:
# 6. Model
# Multilingual BERT with a classification head sized to the label matrix:
# one output per column of y, configured for multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    problem_type="multi_label_classification",
    num_labels=len(y.columns),
)

In [None]:
from transformers import Trainer
import torch.nn as nn
class CustomTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits computed directly on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the multi-label loss, optionally together with the model outputs.

        ``labels`` is removed from ``inputs`` before the forward pass, so the
        model receives only the remaining inputs and the loss is computed
        here. Extra keyword arguments passed by the caller are accepted and
        ignored.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        criterion = torch.nn.BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Threshold grid tried independently for every label.
# np.arange excludes the stop value, so the candidates run from 0.1 up to
# about 0.8 in steps of 0.1 (0.9 itself is not tried).
thresholds_to_try = np.arange(0.1, 0.9, 0.1)

def compute_metrics(pred):
    """Compute multi-label metrics after a per-label threshold search.

    Args:
        pred: A pair ``(logits, true_labels)``; both are array-like with shape
            (n_samples, n_labels), labels being 0/1 indicators.

    Returns:
        dict with micro/macro F1, micro precision, micro recall and
        ``best_thresholds`` (a plain list, one threshold per label).

    Notes:
        For each label the threshold with the highest F1 on this data is
        chosen; ties keep the lowest such threshold. A label for which no
        candidate achieves an F1 above zero (for example, no positive
        examples in this data) keeps the neutral default of 0.5 instead of
        silently receiving the lowest grid value.
        NOTE(review): thresholds are tuned on the same data the metrics are
        reported on, so the reported scores are optimistic.
    """
    logits, true_labels = pred

    # np.asarray returns arrays unchanged and converts other array-likes, so
    # no isinstance check or fallback conversion is needed.
    logits = np.asarray(logits)
    true_labels = np.asarray(true_labels)

    # Element-wise sigmoid turns logits into per-label probabilities.
    probs = 1 / (1 + np.exp(-logits))

    num_labels = true_labels.shape[1]
    best_thresholds = np.zeros(num_labels, dtype=float)
    final_preds = np.zeros_like(true_labels)

    # Per-label threshold search
    for i in range(num_labels):
        # Start at F1 == 0 so the 0.5 default survives unless some candidate
        # actually scores above zero; a negative sentinel would let the first
        # grid value win even when every candidate is equally useless.
        best_f1 = 0.0
        best_thresh = 0.5
        for thresh in thresholds_to_try:
            preds_i = (probs[:, i] > thresh).astype(int)
            f1 = f1_score(true_labels[:, i], preds_i, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        best_thresholds[i] = best_thresh
        final_preds[:, i] = (probs[:, i] > best_thresh).astype(int)

    # Aggregate metrics over all labels using the chosen thresholds.
    f1_micro = f1_score(true_labels, final_preds, average="micro", zero_division=0)
    f1_macro = f1_score(true_labels, final_preds, average="macro", zero_division=0)
    precision_micro = precision_score(true_labels, final_preds, average="micro", zero_division=0)
    recall_micro = recall_score(true_labels, final_preds, average="micro", zero_division=0)

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        # Plain Python list so the value can be serialized with the logs.
        "best_thresholds": best_thresholds.tolist(),
    }


In [None]:
#7. Training
from transformers import TrainingArguments
# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    # Output locations
    output_dir="./output_model",
    logging_dir="./logs",
    # Schedule: evaluation, checkpointing and logging cadence
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=50,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Wire model, data and metric function into the custom trainer.
# The held-out split doubles as the evaluation set, so compute_metrics (and its
# threshold search) runs on it at the end of every epoch.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer in the pinned transformers 4.52.4;
    # `processing_class=` is the supported name for the same argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
import os
import json
import shutil

# === Output directory ===
local_dir = "./my_trained_model"
os.makedirs(local_dir, exist_ok=True)


def _dump_json(payload, filename):
    """Write `payload` as indented JSON to `filename` inside `local_dir`."""
    with open(os.path.join(local_dir, filename), "w") as f:
        json.dump(payload, f, indent=2)


# === 1. Model, tokenizer and training arguments ===
trainer.save_model(local_dir)
tokenizer.save_pretrained(local_dir)

with open(os.path.join(local_dir, "training_args.json"), "w") as f:
    f.write(trainer.args.to_json_string())

# === 2. Label names, in the column order of y (y must be a DataFrame) ===
label_names = y.columns.tolist()
_dump_json(label_names, "label_names.json")

# === 3. Thresholds from the most recent evaluation, plus the full log ===
# Walk the log history backwards and take the first entry that carries the
# per-label thresholds reported by compute_metrics.
threshold_last = next(
    (
        entry["eval_best_thresholds"]
        for entry in reversed(trainer.state.log_history)
        if "eval_best_thresholds" in entry
    ),
    None,
)

if threshold_last is None:
    print("⚠️ No thresholds found in any epoch.")
else:
    _dump_json(threshold_last, "best_thresholds_last_epoch.json")
    print("✅ Saved best_thresholds from the last epoch.")

_dump_json(trainer.state.log_history, "log_history.json")

# === 4. Zip and download ===
shutil.make_archive("my_trained_model", "zip", local_dir)

from google.colab import files
files.download("my_trained_model.zip")
