In [None]:
# Pin transformers to exactly 4.52.4; --force-reinstall replaces any copy that is already installed.
%pip install --upgrade --force-reinstall transformers==4.52.4

In [None]:
# Quietly (-q) install the remaining libraries; transformers is listed here without a version pin.
%pip install -q transformers datasets scikit-learn pandas accelerate

In [None]:
from transformers import TrainingArguments

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report

In [None]:
from google.colab import files
# Open Colab's upload dialog; the extraction cell expects "my_trained_model.zip" to be among the uploaded files.
uploaded = files.upload()  # select my_trained_model.zip from your local machine


In [None]:
import zipfile
# Unpack the uploaded archive into the "my_trained_model" directory.
with zipfile.ZipFile("my_trained_model.zip", mode="r") as z:
    z.extractall(path="my_trained_model")


In [None]:
# Directory the archive was extracted to; later cells read the model, tokenizer and JSON assets from here.
model_dir = "my_trained_model"

In [None]:
import os
import json
import ast
import numpy as np
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

# ------------------------
# Configuration

# ------------------------
# Helpers
# ------------------------
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated as ``exp(-logaddexp(0, -x))``. This is algebraically the same
    expression, but it never calls ``exp`` on a large positive number, so
    strongly negative inputs no longer raise an overflow ``RuntimeWarning``.

    Args:
        x: Scalar or array-like of real numbers (for example raw logits).

    Returns:
        NumPy scalar or array with the same shape as ``x`` and values in [0, 1].
    """
    # np.asarray lets plain Python lists/tuples be negated like arrays.
    return np.exp(-np.logaddexp(0, -np.asarray(x)))

def decode_preds(pred_vector, label_names):
    """Translate a binary prediction vector into the names of the active labels.

    Returns ``None`` when ``label_names`` is ``None``. Otherwise returns the
    labels whose paired entry in ``pred_vector`` equals 1, in ``label_names``
    order. Pairing stops at the shorter of the two sequences.
    """
    if label_names is not None:
        selected = []
        for name, flag in zip(label_names, pred_vector):
            if flag == 1:
                selected.append(name)
        return selected
    return None

def load_model_and_assets(model_dir):
    """Load the tokenizer, model and optional JSON assets stored in ``model_dir``.

    The model is loaded with ``problem_type="multi_label_classification"``,
    switched to eval mode and moved to CUDA when available, otherwise to CPU.

    Args:
        model_dir: Directory containing the saved tokenizer/model files and,
            optionally, ``best_thresholds.json`` and ``label_names.json``.

    Returns:
        Tuple ``(tokenizer, model, thresholds, label_names, device)``.
        ``thresholds`` is a NumPy array built from ``best_thresholds.json``,
        or ``None`` when that file is missing. ``label_names`` is the parsed
        content of ``label_names.json``, or ``None`` when that file is missing.
    """
    # Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(
        model_dir,
        problem_type="multi_label_classification"
    )
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load thresholds (optional). Read as UTF-8, the same way as the label file.
    try:
        with open(os.path.join(model_dir, "best_thresholds.json"), encoding="utf-8") as f:
            thresholds = np.array(json.load(f))
    except FileNotFoundError:
        print("⚠️ best_thresholds.json not found. Using default 0.5.")
        thresholds = None

    # Load label names (optional)
    try:
        with open(os.path.join(model_dir, "label_names.json"), encoding="utf-8") as f:
            label_names = json.load(f)
    except FileNotFoundError:
        print("⚠️ label_names.json not found.")
        label_names = None

    return tokenizer, model, thresholds, label_names, device

def predict(question, answer, tokenizer, model, device, threshold_array=None,
            max_length=512, padding="max_length"):
    """Score one question/answer pair and binarise the per-label probabilities.

    Args:
        question: Text passed to the tokenizer as ``text``.
        answer: Text passed to the tokenizer as ``text_pair``.
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors="pt"``.
        model: Callable whose output exposes ``.logits``; the leading
            dimension is squeezed away, so logits are assumed to have shape
            ``(1, num_labels)`` (TODO confirm for other heads).
        device: Torch device the encoded tensors are moved to.
        threshold_array: ``None`` to use 0.5 for every label, a single number
            applied to every label, or a 1-D sequence with one threshold per
            label.
        max_length: Truncation / padding length handed to the tokenizer.
        padding: Padding strategy handed to the tokenizer. The default keeps
            the original behaviour of padding to ``max_length``.

    Returns:
        ``(probs, preds)``: NumPy arrays holding the sigmoid probabilities and
        the 0/1 decisions (``probs >= threshold``), one entry per label.

    Raises:
        ValueError: If ``threshold_array`` has more than one dimension, or is
            1-D with a length different from the number of labels.
    """
    enc = tokenizer(
        text=question,
        text_pair=answer,
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_tensors="pt"
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits.squeeze(0)
        # torch.sigmoid is numerically stable, so extreme logits cannot overflow.
        probs = torch.sigmoid(logits.float()).cpu().numpy()

    if threshold_array is None:
        thresholds = 0.5
    else:
        thresholds = np.asarray(threshold_array, dtype=float)
        # Reject shapes that would silently broadcast into a 2-D result.
        if thresholds.ndim > 1:
            raise ValueError(
                f"Thresholds must be a scalar or 1-D, got shape {thresholds.shape}"
            )
        if thresholds.ndim == 1 and thresholds.shape[0] != probs.shape[0]:
            raise ValueError(
                f"Threshold length {thresholds.shape[0]} != number of labels {probs.shape[0]}"
            )

    preds = (probs >= thresholds).astype(int)
    return probs, preds

In [None]:
# Read the label names straight from the model directory.
# Unlike load_model_and_assets, this cell does not catch a missing file.
with open(os.path.join(model_dir, "label_names.json"), encoding="utf-8") as f:
    label_names = json.load(f)

In [None]:
# ------------------------
# Example Usage
# ------------------------
# Load everything once; best_thresholds / label_names are None when their JSON file is missing.
tokenizer, model, best_thresholds, label_names, device = load_model_and_assets(model_dir)

# Sample German survey question/answer pair (umlaut restored from mis-decoded UTF-8).
question = "Woran machen Sie Ihre Bewertung fest?"
answer = "Teil einer starken Gemeinschaft - in einer innovativen aber bodenständigen Bank"

probs, preds = predict(question, answer, tokenizer, model, device, threshold_array=best_thresholds)
print("📦 Binary Prediction Vector:\n", preds)

if label_names:
    decoded = decode_preds(preds, label_names)
    print("\n🏷️ Decoded Labels:")
    for label in decoded:
        print(" -", label)
else:
    print("⚠️ No label names available for decoding.")


In [None]:
# Load the survey sheet; header=0 takes the column names from the first row.
# Later cells use the "Frage", "Antwort" and "Categories" columns.
df=pd.read_excel("Answers_survey_sampled.xlsx", header=0)

In [None]:
# Preview the first rows of the loaded sheet.
df.head()

In [None]:
import pandas as pd
import ast
import re

def clean_listish(v):
    """Flatten a list-like cell value into a comma-separated string.

    Handles, in order:
      * ``None`` / NaN (empty cells): returns ``""`` rather than the literal
        text ``"None"`` / ``"nan"``.
      * list / tuple: items joined with ``", "``.
      * set / frozenset: items joined with ``", "`` in sorted order, so the
        result is deterministic.
      * str: parsed with ``ast.literal_eval``; if that yields a list, tuple
        or set it is joined as above. Otherwise square brackets and quote
        characters are stripped from the trimmed string.
      * anything else: ``str(v)``.

    Args:
        v: Raw cell value.

    Returns:
        The cleaned string.
    """
    def _join(items):
        # Sets have no stable order, so sort them for reproducible output.
        parts = [str(item) for item in items]
        if isinstance(items, (set, frozenset)):
            parts.sort()
        return ", ".join(parts)

    # Missing cell: NaN is the only float that is not equal to itself.
    if v is None or (isinstance(v, float) and v != v):
        return ""

    # Real container -> join
    if isinstance(v, (list, tuple, set, frozenset)):
        return _join(v)

    # String that might look like a list -> try to parse
    if isinstance(v, str):
        s = v.strip()
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal; fall through to the plain-text cleanup.
            parsed = None
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return _join(parsed)
        # Fallback: strip brackets & quotes from the string
        return re.sub(r"[\[\]\"']", "", s).strip()

    # Anything else -> string
    return str(v)

# Normalise every "Categories" cell to a plain comma-separated string (overwrites the column in place).
df["Categories"] = df["Categories"].apply(clean_listish)


In [None]:
# Preview again to check the cleaned "Categories" column.
df.head()

In [None]:
import pandas as pd

# Assumes df has the columns "Frage" (question) and "Antwort" (answer),
# and that the model, tokenizer, thresholds, etc. are already loaded.

def _text_or_empty(value):
    """Return ``value`` as a string, mapping None/NaN (empty cells) to ``""``."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def classify_row(row):
    """Predict the labels for one DataFrame row.

    Reads ``row["Frage"]`` and ``row["Antwort"]`` and runs ``predict`` with
    the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``best_thresholds``. Missing or non-string cells are coerced to text
    first, so an empty question cell cannot crash the tokenizer.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with "Frage" and
            "Antwort" entries.

    Returns:
        List of predicted label names, or ``None`` when the notebook-level
        ``label_names`` is empty or ``None``.
    """
    # Only the binary decisions are needed here; the probabilities are discarded.
    _, preds = predict(
        _text_or_empty(row["Frage"]),
        _text_or_empty(row["Antwort"]),
        tokenizer,
        model,
        device,
        threshold_array=best_thresholds
    )
    # Decode predictions into labels (a list; join later if a string is needed).
    if label_names:
        return decode_preds(preds, label_names)
    return None


In [None]:
# Boolean mask of rows whose "Antwort" cell is not a string (isinstance already rejects None).
bad = ~df["Antwort"].apply(lambda cell: isinstance(cell, str))
# print(df.loc[bad, ["row_id","answer"]].head())

In [None]:
# Replace missing answers with "" and cast every answer to str; only "Antwort" is cleaned here.
df["Antwort"] = df["Antwort"].fillna("").astype(str)

In [None]:
# Apply to every row
df["categoriesString_pred"] = df.apply(classify_row, axis=1)

In [None]:
# Display the frame, which now includes the "categoriesString_pred" column.
df

In [None]:
# Write the frame (without the index) to test.xlsx, then pass the file to Colab's files.download.
df.to_excel("test.xlsx", index=False)
files.download("test.xlsx")