In [None]:
# Install transformers pinned to 4.52.4; --force-reinstall replaces any build
# that is already present in the runtime.
%pip install --upgrade --force-reinstall transformers==4.52.4

In [None]:
# Remaining runtime dependencies, installed quietly (-q). None of them carries
# a version pin here; transformers is listed again without one.
%pip install -q transformers datasets scikit-learn pandas accelerate

In [None]:
from transformers import TrainingArguments

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report

In [None]:
from google.colab import files
# Opens Colab's upload dialog. The next cell expects my_trained_model.zip to be
# available in the working directory afterwards.
uploaded = files.upload()  # select my_trained_model.zip from your local machine


In [None]:
import zipfile

# Unpack the uploaded archive into the folder the model is loaded from.
archive = zipfile.ZipFile("my_trained_model.zip", mode="r")
with archive:
    archive.extractall(path="my_trained_model")


In [None]:
# Folder the archive was extracted into. (The inference cell further down
# re-assigns model_dir with a "./" prefix before the model is loaded.)
model_dir = "my_trained_model"

In [None]:
import os
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The value is computed through ``exp(-|x|)``, which always lies in (0, 1],
    so the exponential cannot overflow for large-magnitude inputs (the naive
    form emits an overflow RuntimeWarning for very negative ``x``).

    Args:
        x: Scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        Values in [0, 1] with the same shape as ``x``. Floating-point input
        keeps its dtype; other numeric input is promoted to float64. Scalar
        input yields a NumPy scalar.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    decay = np.exp(-np.abs(x))  # never larger than 1, so no overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return out[()]  # unwrap 0-d results to a scalar; arrays pass through

def load_model_and_assets(model_dir):
    """Load tokenizer, model and the optional JSON side files from ``model_dir``.

    The model is put into eval mode and moved to CUDA when available,
    otherwise to the CPU.

    Args:
        model_dir: Directory holding the saved tokenizer/model and, optionally,
            ``best_thresholds.json`` and ``label_names.json``.

    Returns:
        Tuple ``(tokenizer, model, thresholds, label_names, device)``.
        ``thresholds`` is a 1-D float array with one entry per label, or
        ``None`` when ``best_thresholds.json`` is missing. ``label_names`` is
        the parsed content of ``label_names.json``, or ``None`` when missing.

    Raises:
        ValueError: If the thresholds are not a flat list, or if the number of
            thresholds / label names differs from ``model.config.num_labels``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    num_labels = model.config.num_labels

    # thresholds
    thresholds = None
    thresholds_path = os.path.join(model_dir, "best_thresholds.json")
    if os.path.exists(thresholds_path):
        with open(thresholds_path, "r", encoding="utf-8") as f:
            thresholds = np.array(json.load(f), dtype=float)
        # A scalar or nested list would otherwise fail with an opaque
        # IndexError or slip through the length check below.
        if thresholds.ndim != 1:
            raise ValueError(
                f"best_thresholds.json must contain a flat list of numbers, got shape {thresholds.shape}"
            )
        if thresholds.shape[0] != num_labels:
            raise ValueError(
                f"Threshold length {thresholds.shape[0]} != num_labels {num_labels}"
            )
    else:
        print("⚠️ best_thresholds.json not found. Using default 0.5.")

    # label names
    label_names = None
    labels_path = os.path.join(model_dir, "label_names.json")
    if os.path.exists(labels_path):
        with open(labels_path, "r", encoding="utf-8") as f:
            label_names = json.load(f)
        if len(label_names) != num_labels:
            raise ValueError(
                f"label_names length {len(label_names)} != num_labels {num_labels}"
            )
    else:
        print("⚠️ label_names.json not found.")

    return tokenizer, model, thresholds, label_names, device

def predict(question, answer, tokenizer, model, device, threshold_array=None, max_length=512):
    """Score one question/answer pair and binarize the per-label probabilities.

    Args:
        question: Text passed to the tokenizer as ``text``.
        answer: Text passed to the tokenizer as ``text_pair``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: Model whose output exposes ``.logits``.
        device: Device the encoded tensors are moved to before the forward pass.
        threshold_array: Optional per-label cut-offs compared against the
            probabilities with ``>=``. ``None`` means 0.5 for every label.
        max_length: Length the input is truncated/padded to.

    Returns:
        Tuple ``(probs, preds)`` of NumPy arrays: the sigmoid probabilities and
        the matching 0/1 integer decisions.
    """
    enc = tokenizer(
        text=question,
        text_pair=answer,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits.squeeze(0)
        # torch.sigmoid is numerically stable, so extreme logits do not
        # trigger the overflow warning a hand-written 1 / (1 + exp(-x)) does.
        probs = torch.sigmoid(logits).detach().cpu().numpy()

    cutoff = 0.5 if threshold_array is None else threshold_array
    preds = (probs >= cutoff).astype(int)

    return probs, preds

def decode_preds(pred_vector, label_names):
    """Translate a 0/1 prediction vector into the names of the active labels.

    Args:
        pred_vector: Iterable of per-label decisions, aligned with ``label_names``.
        label_names: Sequence of label names, or ``None``.

    Returns:
        List of the names whose decision equals 1, in label order, or ``None``
        when ``label_names`` is ``None``.
    """
    if label_names is None:
        return None

    selected = []
    for name, flag in zip(label_names, pred_vector):
        if flag == 1:
            selected.append(name)
    return selected


In [None]:
import os

model_dir = "./my_trained_model"  # <-- the folder after unzipping

tokenizer, model, best_thresholds, label_names, device = load_model_and_assets(model_dir)

# Smoke test: classify one hand-written question/answer pair.
question = "Vielleicht haben Sie ja eine konkrete Idee, wie?"
answer = "Nein"

probs, preds = predict(
    question,
    answer,
    tokenizer,
    model,
    device,
    threshold_array=best_thresholds
)

print("📦 Binary Prediction Vector:\n", preds)
print("📈 Probabilities:\n", probs)

# label_names is None when label_names.json was not shipped with the model.
if label_names:
    decoded = decode_preds(preds, label_names)
    print("\n🏷️ Decoded Labels:")
    for label in decoded:
        print(" -", label)
else:
    print("⚠️ No label names available for decoding.")


In [None]:
import numpy as np

probs_np = np.array(probs)

# If batch shape [1, L], flatten to [L]
if probs_np.ndim == 2:
    probs_np = probs_np[0]

# best_thresholds is None when best_thresholds.json was missing. predict()
# uses 0.5 for every label in that case, so report against the same value
# instead of failing on an un-indexable array.
if best_thresholds is None:
    thr_np = np.full(probs_np.shape, 0.5)
else:
    thr_np = np.array(best_thresholds)

# Highest-probability label and the threshold it has to clear.
max_idx  = int(np.argmax(probs_np))
max_prob = float(probs_np[max_idx])
max_thr  = float(thr_np[max_idx])

# Fall back to the numeric index when no label names are available.
label = label_names[max_idx] if label_names else str(max_idx)

print(f"🏆 Top label: {label}")
print(f"📈 Prob:      {max_prob:.4f}")
print(f"🎚️ Threshold: {max_thr:.4f}")
print(f"✅ Passes?:   {max_prob >= max_thr}")


In [None]:
# Load the evaluation data; header=0 takes the column names from the first row.
# Later cells read the columns "labels", "questionText" and "questionAnswer".
df=pd.read_csv("df_test (1).csv", header=0)

In [None]:
df.head()

In [None]:
import pandas as pd
import ast
import re

def _join_items(items):
    """Join the items of a list/tuple/set into one ``", "``-separated string."""
    parts = [str(item) for item in items]
    if isinstance(items, set):
        # Sets have no stable order; sort so repeated runs give the same text.
        parts.sort()
    return ", ".join(parts)


def clean_listish(v):
    """Render a list-like cell value as a plain ``"a, b, c"`` string.

    Args:
        v: A list/tuple/set, a string (possibly the repr of a list, tuple or
            set such as ``"['a', 'b']"``), a missing value, or anything else.

    Returns:
        * ``""`` for missing values (``None`` or float NaN),
        * the items joined with ``", "`` for list/tuple/set values and for
          strings that parse to one (set items are sorted),
        * for other strings, the stripped text with brackets and quotes removed,
        * ``str(v)`` for everything else.
    """
    # missing cell -> empty string rather than the literal text "None" / "nan"
    if v is None or (isinstance(v, float) and v != v):
        return ""
    # real list/tuple/set -> join
    if isinstance(v, (list, tuple, set)):
        return _join_items(v)
    # string that might look like a list -> try to parse
    if isinstance(v, str):
        s = v.strip()
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal; handled by the fallback below.
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return _join_items(parsed)
        # fallback: strip brackets & quotes from the string
        return re.sub(r"[\[\]\"']", "", s).strip()
    # anything else -> string
    return str(v)

# Add a plain-text rendering of the "labels" column as "Categories".
df["Categories"] = df["labels"].apply(clean_listish)


In [None]:
df.head()

In [None]:
# keep only these three columns (in this order)
df = df[['questionText', 'questionAnswer', 'Categories']]

In [None]:
import numpy as np

# Work on a copy so best_thresholds itself stays untouched. When no thresholds
# file was shipped (best_thresholds is None), start from the 0.5 default that
# predict() applies, one entry per label.
if best_thresholds is None:
    thr = np.full(len(label_names), 0.5, dtype=float)
else:
    thr = np.array(best_thresholds, dtype=float)
thr[label_names.index("nein")] = 0.9   # example override
# NOTE(review): classify_row passes best_thresholds, not thr, to predict(), so
# this override does not affect the batch predictions -- confirm which is intended.


In [None]:
import pandas as pd

# Expects df to have the columns "questionText" and "questionAnswer",
# and the model, tokenizer, thresholds etc. to be loaded already.

def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing cells (None / NaN) to ""."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def classify_row(row, threshold_array=None):
    """Predict the labels for one DataFrame row.

    Args:
        row: Mapping/Series with the keys ``"questionText"`` and
            ``"questionAnswer"``. Missing or non-string cells are converted to
            text first, because the tokenizer only accepts strings.
        threshold_array: Optional per-label thresholds. Defaults to the
            module-level ``best_thresholds``. Can be supplied through
            ``df.apply(classify_row, axis=1, threshold_array=...)``.

    Returns:
        List of predicted label names, or ``None`` when no label names are
        loaded.
    """
    if threshold_array is None:
        threshold_array = best_thresholds

    # run prediction on one row
    probs, preds = predict(
        _text_or_empty(row["questionText"]),
        _text_or_empty(row["questionAnswer"]),
        tokenizer,
        model,
        device,
        threshold_array=threshold_array
    )
    # decode predictions into labels
    if label_names:
        return decode_preds(preds, label_names)  # list of labels (join later if needed)
    return None


In [None]:
# Apply to every row: each cell of the new column holds what classify_row
# returns for that row (a list of label names, or None without label names).
df["categoriesString_pred"] = df.apply(classify_row, axis=1)

In [None]:
df

In [None]:
# Write the frame (without the index column) to an Excel file, then hand it to
# Colab's `files` helper (imported in the upload cell) for download.
df.to_excel("test.xlsx", index=False)
files.download("test.xlsx")

In [None]:
# import os
# import numpy as np

# model_dir = "./my_trained_model"

# tokenizer, model, best_thresholds, label_names, device = load_model_and_assets(model_dir)

# # modify thresholds
# thr = np.array(best_thresholds, dtype=float)
# target_label = "nein"
# label_idx = label_names.index(target_label)
# thr[label_idx] =0.9355555555555555


# question = "Vielleicht haben Sie ja eine konkrete Idee, wie?"
# answer = "Nein"

# probs, preds = predict(
#     question, answer,
#     tokenizer, model, device,
#     threshold_array=thr  # ✅ use modified thresholds
# )

# print("📦 Binary Prediction Vector:\n", preds)
# print("📈 Probabilities:\n", probs)

# if label_names:
#     decoded = decode_preds(preds, label_names)
#     print("\n🏷️ Decoded Labels:")
#     for label in decoded:
#         print(" -", label)


In [None]:
# import pandas as pd

# # assume df has columns: "questionText", "questionAnswer"
# # and your model, tokenizer, thresholds etc. are already loaded

# def classify_row(row):
#     # run prediction on one row
#     probs, preds = predict(
#         row["questionText"],
#         row["questionAnswer"],
#         tokenizer,
#         model,
#         device,
#         threshold_array=thr
#     )
#     # decode predictions into labels
#     if label_names:
#         decoded = decode_preds(preds, label_names)
#         return decoded  # list of labels (you can join later if you want)
#     else:
#         return None
