In [None]:
from data.preprocessing import DataProvider
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_scheduler
from sklearn.metrics import classification_report



In [6]:
# Load the raw train/test split from the project's data provider.
data_provider = DataProvider()
X_train, X_test, y_train, y_test = data_provider.get_raw_datasets()

# Convert each split to plain Python lists via .tolist() so the texts can be
# passed to the tokenizer and the labels can be stringified later on.
train_texts, train_labels = X_train.tolist(), y_train.tolist()
test_texts, test_labels = X_test.tolist(), y_test.tolist()


In [9]:


# Load the T5 tokenizer and model from the same checkpoint.
# The checkpoint name lives in one constant so the tokenizer and the model
# cannot accidentally be loaded from different checkpoints.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


In [None]:
# Convert every label to its string form: the model is trained text-to-text,
# so the targets are tokenized as text in the following cell.
train_labels = list(map(str, train_labels))
test_labels = list(map(str, test_labels))


In [15]:
# Tokenizer settings shared by every call below: pad within each call,
# truncate to at most 512 tokens, and return PyTorch tensors.
tokenize_kwargs = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")

# Tokenize the training texts and their (string) labels.
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
train_labels_encodings = tokenizer(train_labels, **tokenize_kwargs)

# Tokenize the test texts and their (string) labels.
test_encodings = tokenizer(test_texts, **tokenize_kwargs)
test_labels_encodings = tokenizer(test_labels, **tokenize_kwargs)



### Dataset und Dataloader erstellen

In [18]:



class TextDataset(Dataset):
    """Pairs tokenized inputs with their targets, one example per index.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values (anything exposing ``.items()``).
        labels: Indexable, sized collection holding one target per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the example at position ``idx``.

        ``features`` is a new dict holding the ``idx``-th entry of every
        field in ``encodings``.
        """
        features = {}
        for field_name, field_values in self.encodings.items():
            features[field_name] = field_values[idx]
        return features, self.labels[idx]

# Build the PyTorch datasets; the tokenized label ids serve as targets.
train_dataset = TextDataset(train_encodings, train_labels_encodings["input_ids"])
test_dataset = TextDataset(test_encodings, test_labels_encodings["input_ids"])

# Batch size shared by both loaders, plus a fixed seed so the shuffled order
# of the training batches is the same on every fresh run of the notebook.
BATCH_SIZE = 8
SHUFFLE_SEED = 42
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)

# Create the data loaders: training data is shuffled (reproducibly),
# test data keeps its original order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=shuffle_generator,
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


### Modell trainieren

In [None]:

# Number of passes over the training data; used both for the scheduler length
# and for the loop below so the two cannot get out of sync.
NUM_EPOCHS = 3

# Select the device once and move the model there before training starts
# (this is loop-invariant work and does not need to run for every batch).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and linear learning-rate schedule without warmup.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        # Each batch is a (features dict, label ids) pair produced by TextDataset.
        inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = batch[1].to(device)

        # Replace padding positions in the targets with -100 so they are
        # ignored by the loss instead of being trained on as real tokens.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        # Forward pass; the model computes the loss from the labels.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        print(f"Loss: {loss.item()}")

        # Backpropagation and parameter / learning-rate update.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Loss: 7.667296886444092
Loss: 10.372357368469238
Loss: 12.366928100585938
Loss: 10.611980438232422
Loss: 8.646943092346191
Loss: 8.789233207702637
Loss: 8.153525352478027
Loss: 8.583930015563965
Loss: 9.595768928527832
Loss: 7.809519290924072
Loss: 8.453583717346191
Loss: 7.542263031005859
Loss: 5.710042953491211
Loss: 6.328287601470947
Loss: 7.207559108734131
Loss: 7.572610855102539
Loss: 5.343601226806641
Loss: 6.514411449432373
Loss: 5.954237461090088
Loss: 6.084249496459961
Loss: 5.821352481842041
Loss: 4.871910572052002
Loss: 5.467621326446533
Loss: 4.761642932891846
Loss: 4.555118083953857
Loss: 3.9497289657592773
Loss: 5.3971686363220215
Loss: 3.5799639225006104
Loss: 3.761028289794922
Loss: 3.1095077991485596
Loss: 4.118257522583008
Loss: 4.060482501983643
Loss: 3.238619089126587
Loss: 3.0637104511260986
Loss: 3.109847068786621
Loss: 2.453474760055542
Loss: 2.1683883666992188
Loss: 2.375025987625122
Loss: 2.0120222568511963
Loss: 2.7995853424072266
Loss: 2.6961286067962646
Loss

### Modell evaluieren

In [24]:

model.eval()  # switch the model to evaluation mode
predictions = []
references = []

# Read the device from the model itself instead of relying on a `device`
# variable left over from the training cell, so this cell also works when the
# model was not trained in the current kernel session.
device = next(model.parameters()).device

with torch.no_grad():
    for batch in test_loader:
        # Each batch is a (features dict, label ids) pair produced by TextDataset.
        inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = batch[1].to(device)

        # Generate output token ids, then decode predictions and references
        # back to text with special tokens removed.
        outputs = model.generate(**inputs)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)




In [28]:

# Some generated predictions fall outside the reference label set (the report
# lists a class with support 0), which makes recall undefined for that class.
# zero_division=0 reports those metrics as 0 without emitting a warning.
print(classification_report(references, predictions, zero_division=0))


              precision    recall  f1-score   support

                   0.00      0.00      0.00         0
           0       0.63      0.31      0.41       557
           1       0.76      0.92      0.84      1346

    accuracy                           0.74      1903
   macro avg       0.46      0.41      0.42      1903
weighted avg       0.72      0.74      0.71      1903



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
