In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from data.preprocessing import DataProvider
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import torch
import nltk




In [3]:
# Fetch the raw train/test split from the project's DataProvider.
# NOTE(review): X_* are presumably the texts and y_* the labels (all four
# expose .tolist() in later cells) — confirm against DataProvider.get_raw_datasets.
data_provider = DataProvider()
X_train, X_test, y_train, y_test = data_provider.get_raw_datasets()

In [4]:
# Load the pretrained tokenizer and tokenize the texts (train and test)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The texts are converted to plain Python lists before tokenization.
# Truncation and padding are enabled with a maximum length of 512 tokens.
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Dataset wrapper so that the Hugging Face API can consume the encodings
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything offering ``.items()``) from feature name
            to a per-sample indexable sequence of values.
        labels: Indexable, sized collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every feature in ``encodings`` is converted to a tensor, and the
        sample's label is added under the ``'labels'`` key.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [6]:
# Datasets for training and test: pair each split's encodings with its labels
train_dataset = FakeNewsDataset(train_encodings, y_train.tolist())
test_dataset = FakeNewsDataset(test_encodings, y_test.tolist())

In [7]:
# Load the pretrained BERT model with a sequence-classification head for 2 labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Compute the metrics used for evaluation
def compute_metrics(pred):
    """Derive accuracy, precision, recall and F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis is taken as the predicted class.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prf_scores = precision_recall_fscore_support(y_true, y_hat, average='binary')
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

In [9]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",                # Where the model is saved
    learning_rate=2e-5,                    # Learning rate
    per_device_train_batch_size=16,        # Batch size for training
    per_device_eval_batch_size=16,         # Batch size for evaluation
    num_train_epochs=2,                    # Number of training epochs
    report_to="none",                      # No Weights & Biases reporting
)

# Define the Trainer; the test split is used as the evaluation dataset and
# compute_metrics supplies accuracy/precision/recall/F1 during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [10]:
# Train the model
trainer.train()


# Evaluate the model; as the cell's last expression, the returned metrics
# dict is displayed as the cell output.
trainer.evaluate()

  0%|          | 0/952 [00:00<?, ?it/s]

{'loss': 0.51, 'grad_norm': 7.3450727462768555, 'learning_rate': 9.49579831932773e-06, 'epoch': 1.05}
{'train_runtime': 540.4505, 'train_samples_per_second': 28.162, 'train_steps_per_second': 1.761, 'train_loss': 0.45829577405913535, 'epoch': 2.0}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.4384528398513794,
 'eval_accuracy': 0.7950604308985811,
 'eval_precision': 0.8337988826815642,
 'eval_recall': 0.8870728083209509,
 'eval_f1': 0.8596112311015118,
 'eval_runtime': 20.1073,
 'eval_samples_per_second': 94.642,
 'eval_steps_per_second': 5.918,
 'epoch': 2.0}

In [None]:
# Trainer.predict expects a tokenized Dataset (not the raw texts in X_test)
# and returns a PredictionOutput whose `predictions` field holds the logits.
# Take the arg-max over the class axis so y_pred contains predicted class ids,
# in the same way compute_metrics derives its predictions.
y_pred = trainer.predict(test_dataset).predictions.argmax(axis=-1)

In [None]:
# Print the per-class precision/recall/F1 report. classification_report
# returns a formatted string, so print() is used to render its line breaks.
# y_pred must hold one predicted class label per entry of y_test.
print(classification_report(y_test, y_pred))