In [None]:
from google.colab import drive
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt


#EXPERIMENT ONE

# Make the Drive folder that holds the dataset visible to this runtime.
drive.mount('/drive')

# Read the corpus, then add the short target word ("label") and the
# prefixed model input ("input_text") next to the raw columns.
df = pd.read_csv("/drive/My Drive/dataset_experiment_one.csv")
tag_to_label = {"real_news": "real", "fake_news": "fals"}
df["label"] = df["tag"].map(tag_to_label)
df["input_text"] = "clasifică știre: " + df["content"]

# First cut: 80 % for training, 20 % held out, stratified on the label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["input_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Second cut: the held-out part is halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

# Tokenizer of the pretrained checkpoint used throughout this experiment.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

class T5NewsDataset(Dataset):
    """Pre-tokenized (input text, target word) pairs for seq2seq training.

    Args:
        texts: iterable of input strings.
        labels: iterable of target strings, aligned with ``texts``.
        tokenizer: callable tokenizer that returns a mapping of tensors and
            exposes ``pad_token_id``.
        max_len: length the inputs are padded/truncated to. Targets are
            always padded/truncated to 10 tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.inputs = tokenizer(list(texts), max_length=max_len, truncation=True, padding="max_length", return_tensors="pt")
        self.targets = tokenizer(list(labels), max_length=10, truncation=True, padding="max_length", return_tensors="pt")
        # Remember the pad id of the tokenizer that built the tensors, so that
        # __getitem__ does not depend on a module-level ``tokenizer``.
        self.pad_token_id = tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return the encoder tensors for ``idx`` plus a ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.inputs.items()}
        # Indexing yields a view of the cached tensor; clone it so the -100
        # masking below does not overwrite the stored target ids.
        labels = self.targets["input_ids"][idx].clone()
        # -100 marks the padded positions so they are ignored in the loss.
        labels[labels == self.pad_token_id] = -100
        item["labels"] = labels
        return item

    def __len__(self):
        """Number of examples (rows of the tokenized input matrix)."""
        return len(self.inputs["input_ids"])

# Tokenize every split once up front; the splits do not depend on each other.
train_dataset, val_dataset, test_dataset = (
    T5NewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained checkpoint and place it on the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

def compute_metrics(eval_preds):
    """Score generated label words against the reference label words.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of token-id arrays;
            ``predictions`` may be a tuple whose first element holds the ids.

    Returns:
        dict with ``accuracy`` and weighted ``precision``, ``recall``, ``f1``
        computed on the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id in BOTH arrays (the original only did labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Normalize predictions the same way classify_news does (strip + lower)
    # so validation and test scores use the same matching rule.
    decoded_preds = [
        text.strip().lower()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    return {
        "accuracy": accuracy_score(decoded_labels, decoded_preds),
        "precision": precision_score(decoded_labels, decoded_preds, average="weighted"),
        "recall": recall_score(decoded_labels, decoded_preds, average="weighted"),
        "f1": f1_score(decoded_labels, decoded_preds, average="weighted"),
    }

# Training configuration for experiment one. Evaluation and checkpointing
# both happen once per epoch, and the checkpoint with the best validation
# accuracy is reloaded when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir="/drive/My Drive/results_t5_experiment_one",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=3e-4,
    weight_decay=0.2,
    # The closing quote was missing here, which made the cell a syntax error.
    logging_dir="/drive/My Drive/logs_t5_experiment_one",
    load_best_model_at_end=True,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    metric_for_best_model="accuracy",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()

def classify_news(text):
    """Generate the label word for one input text.

    The text is tokenized (truncated to 512 tokens), moved to the model's
    device, passed through ``generate`` with at most 10 output tokens, and
    the first generated sequence is decoded, stripped and lower-cased.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Tensors have to live on the same device as the model weights.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)

    generated = model.generate(**on_device, max_length=10)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip().lower()

# Class names as they should appear in the report rows.
report_labels = ["fals", "real"]

y_true = list(test_labels)
y_pred = [classify_news(t) for t in test_texts]
print(y_pred)
y_pred = [pred.lower() for pred in y_pred]

# The model generates free text, so a prediction is not guaranteed to be one
# of the known label words. Surface such cases instead of hiding them.
unexpected = sorted(set(y_pred) - set(report_labels))
if unexpected:
    print("Predictions outside the label set:", unexpected)

# Passing labels= fixes the row order and keeps classification_report from
# raising when an off-label prediction makes the number of observed classes
# differ from len(target_names).
print(classification_report(y_true, y_pred, labels=report_labels, target_names=report_labels))

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="weighted"))
print("Recall:", recall_score(y_true, y_pred, average="weighted"))
print("F1 Score:", f1_score(y_true, y_pred, average="weighted"))

def plot_confusion(y_true, y_pred, labels=["real", "fals"]):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    ``labels`` sets which classes are counted and the order of the rows
    (true label) and columns (predicted label).
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

# Visualize the test-set errors.
plot_confusion(y_true, y_pred)

# Store the fine-tuned weights and the tokenizer files in the same folder.
export_dir = "/drive/My Drive/t5_experiment_one"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt

#EXPERIMENT TWO

# Make the Drive folder that holds the datasets visible to this runtime.
drive.mount('/drive')

# This experiment ships separate CSV files for training and for testing.
train_df = pd.read_csv("/drive/My Drive/dataset_experiment_two_training.csv")
test_df = pd.read_csv("/drive/My Drive/dataset_experiment_two_testing.csv")

# Add the short target word and the prefixed model input to both frames.
tag_to_label = {"real_news": "real", "fake_news": "fals"}
for frame in (train_df, test_df):
    frame["label"] = frame["tag"].map(tag_to_label)
    frame["input_text"] = "clasifică știre: " + frame["content"]

# Hold out 10 % of the training file for validation, stratified on the label.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["input_text"],
    train_df["label"],
    test_size=0.1,
    random_state=42,
    stratify=train_df["label"],
)

# The test file is used in full.
test_texts, test_labels = test_df["input_text"], test_df["label"]

# Tokenizer of the pretrained checkpoint used throughout this experiment.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

class T5NewsDataset(Dataset):
    """Pre-tokenized (input text, target word) pairs for seq2seq training.

    Args:
        texts: iterable of input strings.
        labels: iterable of target strings, aligned with ``texts``.
        tokenizer: callable tokenizer that returns a mapping of tensors and
            exposes ``pad_token_id``.
        max_len: length the inputs are padded/truncated to. Targets are
            always padded/truncated to 10 tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.inputs = tokenizer(list(texts), max_length=max_len, truncation=True, padding="max_length", return_tensors="pt")
        self.targets = tokenizer(list(labels), max_length=10, truncation=True, padding="max_length", return_tensors="pt")
        # Remember the pad id of the tokenizer that built the tensors, so that
        # __getitem__ does not depend on a module-level ``tokenizer``.
        self.pad_token_id = tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return the encoder tensors for ``idx`` plus a ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.inputs.items()}
        # Indexing yields a view of the cached tensor; clone it so the -100
        # masking below does not overwrite the stored target ids.
        labels = self.targets["input_ids"][idx].clone()
        # Ignore pad tokens in the loss by marking them with -100.
        labels[labels == self.pad_token_id] = -100
        item["labels"] = labels
        return item

    def __len__(self):
        """Number of examples (rows of the tokenized input matrix)."""
        return len(self.inputs["input_ids"])

# Tokenize every split once up front; the splits do not depend on each other.
train_dataset, val_dataset, test_dataset = (
    T5NewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained checkpoint and place it on the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

def compute_metrics(eval_preds):
    """Score generated label words against the reference label words.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of token-id arrays;
            ``predictions`` may be a tuple whose first element holds the ids.

    Returns:
        dict with ``accuracy`` and weighted ``precision``, ``recall``, ``f1``
        computed on the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id in BOTH arrays (the original only did labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Normalize predictions the same way classify_news does (strip + lower)
    # so validation and test scores use the same matching rule.
    decoded_preds = [
        text.strip().lower()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    return {
        "accuracy": accuracy_score(decoded_labels, decoded_preds),
        "precision": precision_score(decoded_labels, decoded_preds, average="weighted"),
        "recall": recall_score(decoded_labels, decoded_preds, average="weighted"),
        "f1": f1_score(decoded_labels, decoded_preds, average="weighted"),
    }

# Hyper-parameters of experiment two, collected in one mapping. Evaluation
# and checkpointing are both per epoch; the best checkpoint by validation
# accuracy is reloaded at the end, and evaluation runs generate().
training_config = dict(
    output_dir="/drive/My Drive/results_t5_experiment_two",
    logging_dir="/drive/My Drive/logs_t5_experiment_two",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Wire model, data and metric function together.
trainer_parts = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_parts)

# Run fine-tuning.
trainer.train()

def classify_news(text):
    """Generate the label word for one input text.

    The text is tokenized (truncated to 512 tokens), moved to the model's
    device, passed through ``generate`` with at most 10 output tokens, and
    the first generated sequence is decoded, stripped and lower-cased.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Tensors have to live on the same device as the model weights.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)

    generated = model.generate(**on_device, max_length=10)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip().lower()

# Class names as they should appear in the report rows.
report_labels = ["fals", "real"]

y_true = list(test_labels)
y_pred = [classify_news(t) for t in test_texts]
print(y_pred)
y_pred = [pred.lower() for pred in y_pred]

# The model generates free text, so a prediction is not guaranteed to be one
# of the known label words. Surface such cases instead of hiding them.
unexpected = sorted(set(y_pred) - set(report_labels))
if unexpected:
    print("Predictions outside the label set:", unexpected)

# Passing labels= fixes the row order and keeps classification_report from
# raising when an off-label prediction makes the number of observed classes
# differ from len(target_names).
print(classification_report(y_true, y_pred, labels=report_labels, target_names=report_labels))

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="weighted"))
print("Recall:", recall_score(y_true, y_pred, average="weighted"))
print("F1 Score:", f1_score(y_true, y_pred, average="weighted"))

def plot_confusion(y_true, y_pred, labels=["real", "fals"]):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    ``labels`` sets which classes are counted and the order of the rows
    (true label) and columns (predicted label).
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

# Visualize the test-set errors.
plot_confusion(y_true, y_pred)

# Store the fine-tuned weights and the tokenizer files in the same folder.
export_dir = "/drive/My Drive/t5_experiment_two"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt


#EXPERIMENT THREE

# Make the Drive folder that holds the dataset visible to this runtime.
drive.mount('/drive')

# Read the corpus, then add the short target word ("label") and the
# prefixed model input ("input_text") next to the raw columns.
df = pd.read_csv("/drive/My Drive/dataset_experiment_three.csv")
tag_to_label = {"real_news": "real", "fake_news": "fals"}
df["label"] = df["tag"].map(tag_to_label)
df["input_text"] = "clasifică știre: " + df["content"]

# First cut: 80 % for training, 20 % held out, stratified on the label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["input_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Second cut: the held-out part is halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

# Tokenizer of the pretrained checkpoint used throughout this experiment.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

class T5NewsDataset(Dataset):
    """Pre-tokenized (input text, target word) pairs for seq2seq training.

    Args:
        texts: iterable of input strings.
        labels: iterable of target strings, aligned with ``texts``.
        tokenizer: callable tokenizer that returns a mapping of tensors and
            exposes ``pad_token_id``.
        max_len: length the inputs are padded/truncated to. Targets are
            always padded/truncated to 10 tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.inputs = tokenizer(list(texts), max_length=max_len, truncation=True, padding="max_length", return_tensors="pt")
        self.targets = tokenizer(list(labels), max_length=10, truncation=True, padding="max_length", return_tensors="pt")
        # Remember the pad id of the tokenizer that built the tensors, so that
        # __getitem__ does not depend on a module-level ``tokenizer``.
        self.pad_token_id = tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return the encoder tensors for ``idx`` plus a ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.inputs.items()}
        # Indexing yields a view of the cached tensor; clone it so the -100
        # masking below does not overwrite the stored target ids.
        labels = self.targets["input_ids"][idx].clone()
        # -100 marks the padded positions so they are ignored in the loss.
        labels[labels == self.pad_token_id] = -100
        item["labels"] = labels
        return item

    def __len__(self):
        """Number of examples (rows of the tokenized input matrix)."""
        return len(self.inputs["input_ids"])

# Tokenize every split once up front; the splits do not depend on each other.
train_dataset, val_dataset, test_dataset = (
    T5NewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained checkpoint and place it on the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

def compute_metrics(eval_preds):
    """Score generated label words against the reference label words.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of token-id arrays;
            ``predictions`` may be a tuple whose first element holds the ids.

    Returns:
        dict with ``accuracy`` and weighted ``precision``, ``recall``, ``f1``
        computed on the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id in BOTH arrays (the original only did labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Normalize predictions the same way classify_news does (strip + lower)
    # so validation and test scores use the same matching rule.
    decoded_preds = [
        text.strip().lower()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    return {
        "accuracy": accuracy_score(decoded_labels, decoded_preds),
        "precision": precision_score(decoded_labels, decoded_preds, average="weighted"),
        "recall": recall_score(decoded_labels, decoded_preds, average="weighted"),
        "f1": f1_score(decoded_labels, decoded_preds, average="weighted"),
    }

# Hyper-parameters of experiment three, collected in one mapping. Evaluation
# and checkpointing are both per epoch; the best checkpoint by validation
# accuracy is reloaded at the end, and evaluation runs generate().
training_config = dict(
    output_dir="/drive/My Drive/results_t5_experiment_three",
    logging_dir="/drive/My Drive/logs_t5_experiment_three",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.0,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Wire model, data and metric function together.
trainer_parts = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_parts)

# Run fine-tuning.
trainer.train()

def classify_news(text):
    """Generate the label word for one input text.

    The text is tokenized (truncated to 512 tokens), moved to the model's
    device, passed through ``generate`` with at most 10 output tokens, and
    the first generated sequence is decoded, stripped and lower-cased.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Tensors have to live on the same device as the model weights.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)

    generated = model.generate(**on_device, max_length=10)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip().lower()

# Class names as they should appear in the report rows.
report_labels = ["fals", "real"]

y_true = list(test_labels)
y_pred = [classify_news(t) for t in test_texts]
print(y_pred)
y_pred = [pred.lower() for pred in y_pred]

# The model generates free text, so a prediction is not guaranteed to be one
# of the known label words. Surface such cases instead of hiding them.
unexpected = sorted(set(y_pred) - set(report_labels))
if unexpected:
    print("Predictions outside the label set:", unexpected)

# Passing labels= fixes the row order and keeps classification_report from
# raising when an off-label prediction makes the number of observed classes
# differ from len(target_names).
print(classification_report(y_true, y_pred, labels=report_labels, target_names=report_labels))

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="weighted"))
print("Recall:", recall_score(y_true, y_pred, average="weighted"))
print("F1 Score:", f1_score(y_true, y_pred, average="weighted"))

def plot_confusion(y_true, y_pred, labels=["real", "fals"]):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    ``labels`` sets which classes are counted and the order of the rows
    (true label) and columns (predicted label).
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(5, 4))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

# Visualize the test-set errors.
plot_confusion(y_true, y_pred)

# Store the fine-tuned weights and the tokenizer files in the same folder.
export_dir = "/drive/My Drive/t5_experiment_three"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import torch
import seaborn as sns
import matplotlib.pyplot as plt

#EXPERIMENT FOUR
# Drive is mounted because results and the final model are written there.
drive.mount('/drive')

# The five-class corpus is read straight from the Hugging Face Hub.
df = pd.read_json("hf://datasets/mihalca/FakeRO_updated/combined_balanced.json")

# Map each dataset tag to the Romanian word the model has to generate.
# NOTE(review): confirm the t5-base vocabulary covers the diacritics used in
# these labels and in the prefix - TODO verify.
tag_to_label = {
    "real_news": "real",
    "fake_news": "fals",
    "propaganda": "propagandă",
    "satire": "satiră",
    "misinformation": "dezinformare",
}
df["label"] = df["tag"].map(tag_to_label)
df["input_text"] = "clasifică știre: " + df["content"]

# First cut: 80 % for training, 20 % held out, stratified on the label.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["input_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Second cut: the held-out part is halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

# Tokenizer of the pretrained checkpoint used throughout this experiment.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

class T5NewsDataset(Dataset):
    """Pre-tokenized (input text, target word) pairs for seq2seq training.

    Args:
        texts: iterable of input strings.
        labels: iterable of target strings, aligned with ``texts``.
        tokenizer: callable tokenizer that returns a mapping of tensors and
            exposes ``pad_token_id``.
        max_len: length the inputs are padded/truncated to. Targets are
            always padded/truncated to 10 tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.inputs = tokenizer(list(texts), max_length=max_len, truncation=True, padding="max_length", return_tensors="pt")
        self.targets = tokenizer(list(labels), max_length=10, truncation=True, padding="max_length", return_tensors="pt")
        # Remember the pad id of the tokenizer that built the tensors, so that
        # __getitem__ does not depend on a module-level ``tokenizer``.
        self.pad_token_id = tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return the encoder tensors for ``idx`` plus a ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.inputs.items()}
        # Indexing yields a view of the cached tensor; clone it so the -100
        # masking below does not overwrite the stored target ids.
        labels = self.targets["input_ids"][idx].clone()
        # -100 marks the padded positions so they are ignored in the loss.
        labels[labels == self.pad_token_id] = -100
        item["labels"] = labels
        return item

    def __len__(self):
        """Number of examples (rows of the tokenized input matrix)."""
        return len(self.inputs["input_ids"])

# Tokenize every split once up front; the splits do not depend on each other.
train_dataset, val_dataset, test_dataset = (
    T5NewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Load the pretrained checkpoint; no explicit device move is done here.
model = T5ForConditionalGeneration.from_pretrained(model_name)

def compute_metrics(eval_preds):
    """Score generated label words against the reference label words.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of token-id arrays;
            ``predictions`` may be a tuple whose first element holds the ids.

    Returns:
        dict with ``accuracy`` and macro-averaged ``precision``, ``recall``,
        ``f1`` computed on the decoded strings.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking value, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id in BOTH arrays (the original only did labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Normalize predictions the same way classify_news does (strip + lower)
    # so validation and test scores use the same matching rule.
    decoded_preds = [
        text.strip().lower()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    return {
        "accuracy": accuracy_score(decoded_labels, decoded_preds),
        "precision": precision_score(decoded_labels, decoded_preds, average="macro"),
        "recall": recall_score(decoded_labels, decoded_preds, average="macro"),
        "f1": f1_score(decoded_labels, decoded_preds, average="macro"),
    }


# Hyper-parameters of experiment four, collected in one mapping. Evaluation
# and checkpointing are both per epoch; the best checkpoint by validation
# accuracy is reloaded at the end, and evaluation runs generate().
training_config = dict(
    output_dir="/drive/My Drive/results_t5_experiment_four",
    logging_dir="/drive/My Drive/logs_t5_experiment_four",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Wire model, data and metric function together.
trainer_parts = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_parts)

# Run fine-tuning.
trainer.train()

def classify_news(text):
    """Generate the label word for one input text.

    The text is tokenized (truncated to 512 tokens), moved to the device of
    ``model``, passed through ``trainer.model.generate`` with at most 10
    output tokens, and the first generated sequence is decoded, stripped and
    lower-cased.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Tensors have to live on the same device as the model weights.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)

    generated = trainer.model.generate(**on_device, max_length=10)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip().lower()

# Class names in the order the report rows should appear.
report_labels = ["real", "fals", "propagandă", "satiră", "dezinformare"]

y_true = list(test_labels)
y_pred = [classify_news(t) for t in test_texts]
y_pred = [pred.lower() for pred in y_pred]

print(y_pred)

# The model generates free text, so a prediction is not guaranteed to be one
# of the known label words. Surface such cases instead of hiding them.
unexpected = sorted(set(y_pred) - set(report_labels))
if unexpected:
    print("Predictions outside the label set:", unexpected)

# Without labels=, scikit-learn orders the classes by sorted label value and
# applies target_names positionally, so the previous unsorted target_names
# attached the wrong name to every row. Passing labels= ties each name to its
# class and also avoids an error when an off-label prediction appears.
print(classification_report(y_true, y_pred, labels=report_labels, target_names=report_labels))
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="macro"))
print("Recall:", recall_score(y_true, y_pred, average="macro"))
print("F1 Score:", f1_score(y_true, y_pred, average="macro"))

def plot_confusion(y_true, y_pred, labels=["fals", "dezinformare", "propagandă", "real", "satiră"]):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap.

    ``labels`` sets which classes are counted and the order of the rows
    (true label) and columns (predicted label).
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(8, 6))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

# Visualize the test-set errors.
plot_confusion(y_true, y_pred)

# Store the fine-tuned weights and the tokenizer files in the same folder.
export_dir = "/drive/My Drive/t5_experiment_four"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)
