In [None]:
# Pin transformers to 4.52.4; --force-reinstall makes pip reinstall the package
# even if some version is already present in the runtime.
%pip install --upgrade --force-reinstall transformers==4.52.4

In [None]:
# Install the remaining dependencies quietly (-q).
# NOTE(review): transformers is listed again here without a version; since no
# --upgrade flag is passed, pip is expected to keep the 4.52.4 pin installed by
# the previous cell - confirm after running.
%pip install -q transformers datasets scikit-learn pandas accelerate

In [None]:
from transformers import TrainingArguments

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Colab-only helper: files.upload() lets you pick file(s) from your local machine.
# The archive must be named my_trained_model_finalni.zip, because that is the
# file name the extraction cell opens.
from google.colab import files
uploaded = files.upload()  # select my_trained_model_finalni.zip from your local machine


In [None]:
import zipfile

# Unpack the uploaded archive into a directory of the same base name;
# that directory is what the later cells use as `model_dir`.
with zipfile.ZipFile(file="my_trained_model_finalni.zip", mode="r") as model_archive:
    model_archive.extractall(path="my_trained_model_finalni")


In [None]:
import os
import json
import ast
import numpy as np
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

# ------------------------
# Configuration
# ------------------------
# Directory holding the model files. It is the same path the zip archive is
# extracted into, and the one passed to load_model_and_assets() to read the
# tokenizer, the model and the JSON assets (thresholds, label names, log history).
model_dir = "my_trained_model_finalni"

# ------------------------
# Helpers
# ------------------------
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows for large-magnitude
    negative inputs (RuntimeWarning, or FloatingPointError under a strict
    ``np.errstate``). Here only ``exp(-|x|)`` is evaluated, which always lies in
    (0, 1], so no overflow can occur.

    Args:
        x: Real scalar or array-like. Floating dtypes are preserved; any other
            dtype (e.g. integers) is converted to float64 first.

    Returns:
        The element-wise sigmoid: an array of the same shape as ``x``, or a
        NumPy scalar when ``x`` is a scalar.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)

    z = np.exp(-np.abs(x))  # z == exp(-x) for x >= 0 and exp(x) for x < 0
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def decode_preds(pred_vector, label_names):
    """Translate a binary prediction vector into the list of active label names.

    Args:
        pred_vector: Iterable of predictions, paired position-by-position with
            ``label_names``.
        label_names: Iterable of label names, or ``None``.

    Returns:
        ``None`` when ``label_names`` is ``None``; otherwise the names whose
        paired prediction equals 1, in their original order.
    """
    if label_names is None:
        return None

    active = []
    for name, flag in zip(label_names, pred_vector):
        if flag == 1:
            active.append(name)
    return active

def load_model_and_assets(model_dir):
    """Load the tokenizer, the model and the optional JSON assets from ``model_dir``.

    Args:
        model_dir: Directory passed to ``from_pretrained`` and searched for
            ``best_thresholds_last_epoch.json`` and ``label_names.json``.

    Returns:
        A tuple ``(tokenizer, model, thresholds, label_names, device)``:

        * ``thresholds`` is a NumPy array built from the thresholds JSON, or
          ``None`` when that file does not exist.
        * ``label_names`` is the parsed content of the label JSON, or ``None``
          when that file does not exist.
        * ``device`` is CUDA when available, otherwise CPU; the returned model
          has been switched to eval mode and moved to that device.
    """
    # Keep the file names in one place so the warnings always name the file
    # that was actually looked up.
    thresholds_file = "best_thresholds_last_epoch.json"
    label_names_file = "label_names.json"

    # Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(
        model_dir,
        problem_type="multi_label_classification"
    )
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load thresholds (optional; a missing file yields None)
    try:
        with open(os.path.join(model_dir, thresholds_file), encoding="utf-8") as f:
            thresholds = np.array(json.load(f))
    except FileNotFoundError:
        print(f"⚠️ {thresholds_file} not found. Using default 0.5.")
        thresholds = None

    # Load label names (optional; a missing file yields None)
    try:
        with open(os.path.join(model_dir, label_names_file), encoding="utf-8") as f:
            label_names = json.load(f)
    except FileNotFoundError:
        print(f"⚠️ {label_names_file} not found.")
        label_names = None

    return tokenizer, model, thresholds, label_names, device

def predict(question, answer, tokenizer, model, device, threshold_array=None):
    """Run the model on one question/answer pair and threshold the probabilities.

    Args:
        question: Passed to the tokenizer as ``text``.
        answer: Passed to the tokenizer as ``text_pair``.
        tokenizer: Callable tokenizer; invoked with truncation and padding to a
            fixed length of 512, returning PyTorch tensors.
        model: Model called with the encoded tensors; its output must expose
            ``.logits``.
        device: Device the encoded tensors are moved to before the forward pass.
        threshold_array: Optional per-label thresholds. When ``None``, a single
            cut-off of 0.5 is applied to every label.

    Returns:
        ``(probs, preds)`` where ``probs`` is the sigmoid of the logits as a
        NumPy array and ``preds`` is the integer 0/1 array ``probs >= threshold``.

    Raises:
        ValueError: If ``threshold_array`` is given and its length differs from
            ``probs.shape[0]``.
    """
    encoded = tokenizer(
        text=question,
        text_pair=answer,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    model_inputs = {}
    for field, tensor in encoded.items():
        model_inputs[field] = tensor.to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        raw_logits = model(**model_inputs).logits

    # squeeze(0) drops the leading dimension when it has size 1.
    logits = raw_logits.squeeze(0).cpu().numpy()
    probs = sigmoid(logits)

    if threshold_array is not None:
        if len(threshold_array) != probs.shape[0]:
            raise ValueError(f"Threshold length {len(threshold_array)} != number of labels {probs.shape[0]}")
        cutoffs = threshold_array
    else:
        cutoffs = 0.5

    preds = (probs >= cutoffs).astype(int)
    return probs, preds


In [None]:
# ------------------------
# Example Usage
# ------------------------
# Load everything once, then classify a single question/answer pair.
tokenizer, model, best_thresholds, label_names, device = load_model_and_assets(model_dir)

question = "Warum würden Sie den ÖBB Kundenservice weiterempfehlen?"
answer = "die preis ist immer gut"

probs, preds = predict(
    question,
    answer,
    tokenizer,
    model,
    device,
    threshold_array=best_thresholds,
)
print("📦 Binary Prediction Vector:\n", preds)

# Label names are optional assets; without them only the raw vector can be shown.
if not label_names:
    print("⚠️ No label names available for decoding.")
else:
    decoded = decode_preds(preds, label_names)
    print("\n🏷️ Decoded Labels:")
    for label in decoded:
        print(" -", label)


In [None]:
import json

# Replace with the exact filename if needed
filename = "log_history.json"

# Load log
with open(os.path.join(model_dir, filename), encoding="utf-8") as f:
    log_history = json.load(f)


def _format_metric(value):
    """Format a numeric metric with 4 decimals; return 'n/a' for anything else.

    Applying ``:.4f`` directly to ``entry.get(key)`` raises TypeError when the
    key is absent, because ``None`` does not accept a format spec.
    """
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return "n/a"


# Print evaluation metrics
print("📊 Evaluation Metrics per Epoch:\n")
for entry in log_history:
    # Only entries that carry an eval_loss key are reported.
    if "eval_loss" not in entry:
        continue
    print(f"Epoch {entry.get('epoch', '?')} (Step {entry.get('step', '?')}):")
    print(f"  - Eval Loss         : {_format_metric(entry.get('eval_loss'))}")
    print(f"  - F1 Micro          : {_format_metric(entry.get('eval_f1_micro'))}")
    print(f"  - F1 Macro          : {_format_metric(entry.get('eval_f1_macro'))}")
    print(f"  - Precision (Micro) : {_format_metric(entry.get('eval_precision_micro'))}")
    print(f"  - Recall (Micro)    : {_format_metric(entry.get('eval_recall_micro'))}")
    # `or []` also covers a key that is present but null in the JSON.
    print(f"  - Thresholds (len)  : {len(entry.get('eval_best_thresholds') or [])}")
    print("-" * 50)
