In [3]:
import torch
from transformers import TrainingArguments
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    EarlyStoppingCallback
)
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

#  Load data
df_texts = pd.read_csv("/kaggle/input/final-training-dish-data/training_dataset_70k_balanced_token.csv")
df_labels = pd.read_csv("/kaggle/input/final-training-dish-data/review_labels.csv")

# Prepare labels
all_labels = sorted(df_labels["label"].unique())
label_to_index = {label: i for i, label in enumerate(all_labels)}
label_map = df_labels.groupby("review_id")["label"].apply(list).to_dict()

texts = []
label_vectors = []

for _, row in df_texts.iterrows():
    review_id = row["review_id"]
    text = row["text"]
    label_vector = np.zeros(len(all_labels))
    for label in label_map.get(review_id, []):
        label_vector[label_to_index[label]] = 1
    texts.append(text)
    label_vectors.append(label_vector)

#  Train/Test Split
# >>>>>>> Wir nehmen hier 10 % der Daten als Testmenge und teilen den Rest als Trainingsdaten auf.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, label_vectors, test_size=0.1, random_state=42
)

# Tokenize
# >>>>>>> Wir verwenden den Tokenizer von Facebook AI Roberta Base, um die Texte für das Modell vorzubereiten.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding="max_length", max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding="max_length", max_length=256)

#  Datasets
train_dataset = MultiLabelDataset(train_encodings, train_labels)
val_dataset = MultiLabelDataset(val_encodings, val_labels)

#  Model
#>>>>>>> Hier erstellen wir das Modell auf Basis von Roberta Base. 
#>>>>>>> Wir geben an, dass es 20 Labels gibt und das Problem eine Multi-Label-Klassifikation ist.
model = AutoModelForSequenceClassification.from_pretrained(
    "FacebookAI/roberta-base",
    num_labels=len(all_labels),
    problem_type="multi_label_classification"
)

#  Metric function
# >>>>>>> Diese Funktion berechnet die Metriken wie Accuracy, F1 usw., um die Ergebnisse auszuwerten.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = np.array(labels)
    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="micro"),
        "recall": recall_score(labels, preds, average="micro"),
    }

#  Training Arguments

#>>>>> Hier definieren wir die Trainingsparameter:
# - speichern alle 1000 Schritte
 
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=2000,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    fp16=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    report_to="none",
    save_total_limit=3,
)

#  Trainer
# >>>>>  Wir haben Early Stopping mit Geduld = 3 aktiviert und die Auswertung alle 1000 Schritte eingestellt.
# Das heißt: Alle 1000 Schritte werden die Metriken geprüft.
# Wenn sich die Ergebnisse 3-mal hintereinander nicht verbessern, wird das Training gestoppt.
# In unserem Fall war Schritt 5000 der beste, und das Training endete bei Schritt 8000.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

#  Train
if torch.cuda.is_available():
    print("Training on GPU")
else:
    print("Training on CPU")

trainer.train()

#  Save Model + Tokenizer
model.save_pretrained("/kaggle/working/final_model_70k")
tokenizer.save_pretrained("/kaggle/working/final_model_70k")

2025-07-24 17:55:15.975127: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753379716.325578      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753379716.426885      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on GPU


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision,Recall
1000,No log,0.177516,0.816419,0.702091,0.865063,0.772955
2000,0.224600,0.153356,0.846765,0.760126,0.845427,0.848108
3000,0.224600,0.146064,0.848694,0.774576,0.86477,0.833204
4000,0.123800,0.143836,0.855235,0.791131,0.857583,0.8529
5000,0.123800,0.145886,0.855948,0.794711,0.853966,0.857938
6000,0.095000,0.149978,0.85523,0.789792,0.855095,0.855366
7000,0.095000,0.155204,0.854162,0.794639,0.844514,0.864034
8000,0.074700,0.156045,0.855384,0.794572,0.849016,0.861849


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

('/kaggle/working/final_model_70k/tokenizer_config.json',
 '/kaggle/working/final_model_70k/special_tokens_map.json',
 '/kaggle/working/final_model_70k/vocab.json',
 '/kaggle/working/final_model_70k/merges.txt',
 '/kaggle/working/final_model_70k/added_tokens.json',
 '/kaggle/working/final_model_70k/tokenizer.json')

In [7]:
# Zip the saved model directory
!zip -r /kaggle/working/checkpoint-5000.zip /kaggle/working/checkpoint-5000


  adding: kaggle/working/checkpoint-5000/ (stored 0%)
  adding: kaggle/working/checkpoint-5000/config.json (deflated 63%)
  adding: kaggle/working/checkpoint-5000/trainer_state.json (deflated 69%)
  adding: kaggle/working/checkpoint-5000/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 10%)
  adding: kaggle/working/checkpoint-5000/optimizer.pt (deflated 23%)
  adding: kaggle/working/checkpoint-5000/scaler.pt (deflated 60%)
  adding: kaggle/working/checkpoint-5000/scheduler.pt (deflated 55%)
  adding: kaggle/working/checkpoint-5000/rng_state.pth (deflated 25%)
  adding: kaggle/working/checkpoint-5000/training_args.bin (deflated 51%)
