In [1]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
import torch
from datasets import load_dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df  = load_and_concatenate_parquet_files('data/preprocessed_big_training_df')

df = df.rename(columns={'preprocessed_text': 'text'})
df["label_names"] = df["label"].apply(lambda x: "real" if x == 1 else "fake")
display(df)


Unnamed: 0,text,label,label_names
0,donald trump respond mockery fake swedish atta...,1,real
1,tweetwavethis time true pantstweetwave anthony...,1,real
2,rubio prospect trump president worrisome reute...,0,fake
3,trump lifts cyber command status boost cyber d...,0,fake
4,big republican lie economy tear apart minute v...,1,real
...,...,...,...
63116,half briton want stay eu polledinburgh reuters...,0,fake
63117,bill hillary clinton inc sale right pricein sp...,1,real
63118,orlando gunman shoot time autopsy find new yor...,0,fake
63119,lethal gap supreme court handle death penalty ...,0,fake


In [3]:
train,test      = train_test_split(df,test_size=0.3,stratify=df['label'])
test,validation = train_test_split(test,test_size=1/3,stratify=test['label'])
train.shape, test.shape, validation.shape

((44184, 3), (12624, 3), (6313, 3))

In [4]:
dataset = DatasetDict(
    {'train':Dataset.from_pandas(train,preserve_index=False),
     'test':Dataset.from_pandas(test,preserve_index=False),
     'validation': Dataset.from_pandas(validation,preserve_index=False)
     }    
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 44184
    })
    test: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 12624
    })
    validation: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 6313
    })
})

In [5]:
label2id = {x['label_names']:x['label'] for x in dataset['train']}
id2label = {v:k for k,v in label2id.items()}
label2id, id2label

({'real': 1, 'fake': 0}, {1: 'real', 0: 'fake'})

In [6]:
from transformers import DebertaV2Tokenizer
# Load dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

In [7]:
model_ckpt = "bert-base-uncased"

In [8]:
def tokenize_and_format(batch):
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)   
    tokens = tokenizer(batch['text'], padding=True, truncation=True)
    # Convert to PyTorch tensors and move to the correct device
    tokens = {key: torch.tensor(val).to(device) for key, val in tokens.items()}
    tokens['labels'] = torch.tensor(batch['label']).to(device)
    return tokens

In [9]:
tokenized_dataset = dataset.map(tokenize_and_format, batched=True)

Map: 100%|██████████| 44184/44184 [00:24<00:00, 1833.17 examples/s]
Map: 100%|██████████| 12624/12624 [00:06<00:00, 2041.42 examples/s]
Map: 100%|██████████| 6313/6313 [00:03<00:00, 1941.68 examples/s]


In [10]:
config = AutoConfig.from_pretrained(model_ckpt, num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("/home/nikl/programming/fhdw/knowledge_engineering_ausarbeitung/fakenews_detection/models/roberta-base/checkpoint-6905").to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torch.utils.data import DataLoader

# def collate_fn(batch):
#     return {key: torch.stack([example[key] for example in batch]) for key in batch[0]}
# Assuming `test_set` is your tokenized dataset

print(tokenized_dataset["test"])
# test_loader = DataLoader(tokenized_dataset["test"], batch_size=32, shuffle=False, collate_fn=collate_fn)

def evaluate_model(model, dataset, device, batch_size=32):
    model.eval()
    all_preds = []
    all_labels = []

    # Loop through the dataset in batches
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]
        
        # Extract inputs and labels
        input_ids = torch.stack([example['input_ids'] for example in batch]).to(device)
        attention_mask = torch.stack([example['attention_mask'] for example in batch]).to(device)
        labels = torch.tensor([example['labels'] for example in batch]).to(device)
        
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)  # Get predicted labels
            
        # Append predictions and labels to lists
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    return all_labels, all_preds


# Run evaluation
all_labels, all_preds = evaluate_model(model, tokenized_dataset["test"], device)

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='binary')
recall = recall_score(all_labels, all_preds, average='binary')
f1 = f1_score(all_labels, all_preds, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Optional: Display full classification report
print("\nClassification Report:\n", classification_report(all_labels, all_preds))


Dataset({
    features: ['text', 'label', 'label_names', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 12624
})


TypeError: string indices must be integers

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from custom_utils import load_and_concatenate_parquet_files
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig
import torch

# Define the directory where model results are stored
MODEL_RESULTS_DIR = "models"

# List of model checkpoints used in training
model_checkpoints = [
    "bert-base-uncased",
    "distilbert-base-uncased",
    "roberta-base",
    # "microsoft/deberta-v3-base"
]

# Load the test dataset
df = load_and_concatenate_parquet_files('data/preprocessed_big_training_df')
df = df.rename(columns={'preprocessed_text': 'text'})
df["label_names"] = df["label"].apply(lambda x: "real" if x == 1 else "fake")

# Split the dataset (ensure test split matches training)
_, test = train_test_split(df, test_size=0.3, stratify=df['label'])
_, test = train_test_split(test, test_size=1/3, stratify=test['label'])

test_dataset = Dataset.from_pandas(test, preserve_index=False)

# Function to tokenize the test dataset
def tokenize_and_format(batch, model, device):
    tokenizer = AutoTokenizer.from_pretrained(model) 
    tokens = tokenizer(batch['text'], padding=True, truncation=True)
    # Convert to PyTorch tensors and move to the correct device
    tokens = {key: torch.tensor(val).to(device) for key, val in tokens.items()}
    tokens['labels'] = torch.tensor(batch['label']).to(device)
    return tokens

# Initialize a DataFrame to store model metrics
results_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "F1", "Precision", "Recall", "AUROC", "Train Loss", "Validation Loss"
])

# Function to find the latest checkpoint directory
def get_latest_checkpoint_dir(model_dir):
    checkpoint_dirs = [d for d in os.listdir(model_dir) if d.startswith("checkpoint-")]
    if not checkpoint_dirs:
        return None
    latest_checkpoint = max(checkpoint_dirs, key=lambda x: int(x.split("-")[-1]))
    return os.path.join(model_dir, latest_checkpoint)

# Function to load metrics and evaluate the test dataset
def load_metrics_and_evaluate_test(model_dir, model_ckpt, test_dataset):
    latest_checkpoint = get_latest_checkpoint_dir(model_dir)
    if not latest_checkpoint:
        print(f"No checkpoints found in {model_dir}")
        return None

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2)
    base_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)

    peft_config = LoraConfig(
        task_type="SEQ_CLS",
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(base_model, peft_config)
    # model.load_state_dict(torch.load(latest_checkpoint+"/rng_state.pth"), strict=False)
    model.load_adapter(os.path.join(latest_checkpoint), config=peft_config, adapter_name="lora")
   
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Tokenize test dataset
    tokenized_test = test_dataset.map(lambda batch: tokenize_and_format(batch, model_ckpt, model.device), batched=True)
    tokenized_test.set_format("torch")
    tokenized_test = tokenized_test.with_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Evaluate on the test dataset
    model.eval()
    y_true, y_pred, y_pred_proba = [], [], []
    with torch.no_grad():
        for batch in tokenized_test:
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}
            print(inputs)
            outputs = model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            y_pred.extend(torch.argmax(logits, dim=-1).tolist())
            y_pred_proba.extend(probabilities[:, 1].tolist())
            y_true.extend(batch["labels"].tolist())

    # Compute metrics dynamically
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    auroc = roc_auc_score(y_true, y_pred_proba)

    # Load training logs
    log_file = os.path.join(model_dir, "trainer_state.json")
    if os.path.exists(log_file):
        with open(log_file, "r") as f:
            trainer_state = json.load(f)
            train_loss = trainer_state["log_history"][-1].get("loss", None)  # Final training loss
            valid_loss = trainer_state["log_history"][-1].get("eval_loss", None)  # Final validation loss
    else:
        train_loss, valid_loss = None, None

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "auroc": auroc,
        "train_loss": train_loss,
        "valid_loss": valid_loss
    }

# Iterate over each model directory and collect metrics
for model_ckpt in model_checkpoints:
    model_dir = os.path.join(MODEL_RESULTS_DIR, model_ckpt.replace("/", "_"))
    metrics = load_metrics_and_evaluate_test(model_dir, model_ckpt, test_dataset)

    if metrics:
        results_df = results_df.append({
            "Model": model_ckpt,
            "Accuracy": metrics["accuracy"],
            "F1": metrics["f1"],
            "Precision": metrics["precision"],
            "Recall": metrics["recall"],
            "AUROC": metrics["auroc"],
            "Train Loss": metrics["train_loss"],
            "Validation Loss": metrics["valid_loss"]
        }, ignore_index=True)

# Save results as a CSV
results_csv_path = os.path.join(MODEL_RESULTS_DIR, "model_comparison.csv")
results_df.to_csv(results_csv_path, index=False)

# Display the results table
print(results_df)

# Visualization of metrics
metrics_to_plot = ["Accuracy", "F1", "Precision", "Recall", "AUROC"]
fig, axs = plt.subplots(1, len(metrics_to_plot), figsize=(20, 5))

for i, metric in enumerate(metrics_to_plot):
    axs[i].bar(results_df["Model"], results_df[metric], color="skyblue")
    axs[i].set_title(metric)
    axs[i].set_xticklabels(results_df["Model"], rotation=45, ha="right")
    axs[i].set_ylabel(metric)

plt.tight_layout()
plt.show()

# Plot loss curves
plt.figure(figsize=(10, 5))
plt.bar(results_df["Model"], results_df["Train Loss"], label="Train Loss", alpha=0.7, color="blue")
plt.bar(results_df["Model"], results_df["Validation Loss"], label="Validation Loss", alpha=0.7, color="orange")
plt.title("Train vs Validation Loss")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Loss")
plt.legend()
plt.show()
