<h2>Vorbereitung der Umgebung</h2>

In [4]:
!pip install rouge-score rouge sacrebleu nltk

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0m

In [7]:
import sys
import torch
from torch.utils.data import DataLoader
import json
from rouge import Rouge
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
sys.path.append('..')  # Gehe eine Ebene höher
from net_lstm import LSTM_seq
from config import num_classes, hidden_size, batch_size, epochs, learning_rate
import logging

# Sicherstellen, dass die relevanten Dateien vorhanden sind
model_path = "trained_lstm_model.pth"
test_data_path = "processed_test_data.pt"
test_vocab_path = "test_vocab.json"
train_vocab_path = "train_vocab.json"

# Prüfen, ob CUDA verfügbar ist
device = torch.device("cuda:1")
print(f"Using device: {device}")


Using device: cuda:1


<h2>Laden der Daten und Modelle</h2>

In [2]:
# Modell initialisieren
model = LSTM_seq(max_seq=52, input_size=10, hidden_size=512, class_num=1814).to(device)

# Modell laden
model.load_state_dict(torch.load("trained_lstm_model.pth"))
model.eval()  # Setze das Modell in den Evaluierungsmodus

print("Modell erfolgreich geladen.")

Modell erfolgreich geladen.


  model.load_state_dict(torch.load("trained_lstm_model.pth"))


<h2>Testdaten laden</h2>

In [1]:
# Testdaten laden
test_data = torch.load(test_data_path)

# Outputs und Labels extrahieren
test_outputs = test_data["outputs"].to(device)  # Eingabe (Video-Features)
test_labels = test_data["labels"].to(device)  # Zielsequenzen

print(f"Testdaten geladen: Outputs Shape: {test_outputs.shape}, Labels Shape: {test_labels.shape}")


NameError: name 'torch' is not defined

<h2>Validierung</h2>

In [5]:
# Initialisiere Variablen für die Accuracy
total_tokens = 0
correct_tokens = 0

# Berechnung der Accuracy
with torch.no_grad():
    for i in range(test_outputs.size(0)):
        # Hole die Eingaben und Labels
        video_input = test_outputs[i].unsqueeze(0).to(device)  # Eingabedaten auf cuda:1
        label = test_labels[i].unsqueeze(0).to(device)  # Labels auf cuda:1

        # Modellvorhersage
        output = model(video_input, label)
        prediction = torch.argmax(output, dim=2).squeeze(0).cpu().numpy()
        ground_truth = label.squeeze(0).cpu().numpy()

        # Korrekte Tokens zählen
        for pred, true in zip(prediction, ground_truth):
            if pred == true:
                correct_tokens += 1
            total_tokens += 1

# Berechne die Gesamtaccuracy
validation_accuracy = correct_tokens / total_tokens
print(f"Validation Accuracy: {validation_accuracy:.4f}")


Validation Accuracy: 0.6048


In [11]:
# Logging für sacrebleu auf ERROR setzen
logging.getLogger("sacrebleu").setLevel(logging.ERROR)

# Initialisiere ROUGE und BLEU Scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bleu = BLEU()
smooth = SmoothingFunction().method1

# Erstelle DataLoader für Testdaten
test_dataset = torch.utils.data.TensorDataset(test_outputs, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Lade das Vokabular (falls nötig)
with open(test_vocab_path, "r") as f:
    test_vocab = json.load(f)
    
# Konvertiere Vokabular in ein Wörterbuch, falls es eine Liste ist
if isinstance(test_vocab, list):
    test_vocab = {idx: word for idx, word in enumerate(test_vocab)}

# Dekodierfunktion
def decode_sequence(sequence, vocab):
    # Vokabular umkehren: ID -> Wort
    inv_vocab = {v: k for k, v in vocab.items()}
    return " ".join([inv_vocab.get(token, "<UNK>") for token in sequence])

# Wahre Labels und Vorhersagen sammeln
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        # Daten auf das richtige Gerät übertragen
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        # Vorhersagen generieren
        output = model(batch_features, batch_labels)
        predictions = output.argmax(dim=-1).cpu().numpy()  # Vorhersage

        # Labels und Vorhersagen sammeln
        all_predictions.extend(predictions)
        all_labels.extend(batch_labels.cpu().numpy())

# ROUGE- und BLEU-Bewertung
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
bleu_scores = []
bleu_1_scores = []
bleu_2_scores = []
bleu_3_scores = []
bleu_4_scores = []

for label, prediction in zip(all_labels, all_predictions):
    # Dekodieren
    label_text = decode_sequence(label, test_vocab)
    prediction_text = decode_sequence(prediction, test_vocab)

    # ROUGE-Bewertung
    rouge_result = rouge.score(label_text, prediction_text)
    for metric in rouge_scores.keys():
        rouge_scores[metric].append(rouge_result[metric].fmeasure)

    # BLEU-Bewertung (SacreBLEU und nltk)
    bleu_scores.append(bleu.sentence_score(prediction_text, [label_text]).score)

    label_tokens = label_text.split()
    prediction_tokens = prediction_text.split()
    bleu_1 = sentence_bleu([label_tokens], prediction_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth)
    bleu_2 = sentence_bleu([label_tokens], prediction_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    bleu_3 = sentence_bleu([label_tokens], prediction_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth)
    bleu_4 = sentence_bleu([label_tokens], prediction_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)

    bleu_1_scores.append(bleu_1)
    bleu_2_scores.append(bleu_2)
    bleu_3_scores.append(bleu_3)
    bleu_4_scores.append(bleu_4)

# Durchschnittswerte berechnen
average_rouge = {metric: sum(scores) / len(scores) for metric, scores in rouge_scores.items()}
average_bleu = sum(bleu_scores) / len(bleu_scores)
average_bleu_1 = sum(bleu_1_scores) / len(bleu_1_scores)
average_bleu_2 = sum(bleu_2_scores) / len(bleu_2_scores)
average_bleu_3 = sum(bleu_3_scores) / len(bleu_3_scores)
average_bleu_4 = sum(bleu_4_scores) / len(bleu_4_scores)

# Ergebnisse ausgeben
print("ROUGE Scores:")
print(f"ROUGE-1: {average_rouge['rouge1']:.4f}")
print(f"ROUGE-2: {average_rouge['rouge2']:.4f}")
print(f"ROUGE-L: {average_rouge['rougeL']:.4f}")
print(f"BLEU Score (SacreBLEU): {average_bleu:.4f}")
print("BLEU Scores (nltk):")
print(f"BLEU-1: {average_bleu_1:.4f}")
print(f"BLEU-2: {average_bleu_2:.4f}")
print(f"BLEU-3: {average_bleu_3:.4f}")
print(f"BLEU-4: {average_bleu_4:.4f}")

ROUGE Scores:
ROUGE-1: 0.7317
ROUGE-2: 0.7250
ROUGE-L: 0.7317
BLEU Score (SacreBLEU): 57.2785
BLEU Scores (nltk):
BLEU-1: 0.5769
BLEU-2: 0.5728
BLEU-3: 0.5717
BLEU-4: 0.5641
