In [None]:
from google.colab import drive
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

#EXPERIMENT ONE
# Mount Google Drive: the dataset is read from it and all outputs are written to it.
drive.mount('/drive')
df = pd.read_csv("/drive/My Drive/dataset_experiment_one.csv")

# Integer class ids for the two tags: fake_news -> 0, real_news -> 1.
label_map = {"real_news": 1, "fake_news": 0}
df["label"] = df["tag"].map(label_map)

# Series.map leaves NaN for every tag that is not a key of label_map. That would
# silently turn the label column into floats and only break much later, during
# training, so stop here and report the offending tag values instead.
unmapped_tags = df.loc[df["label"].isna(), "tag"].unique().tolist()
if unmapped_tags:
    raise ValueError(f"Tags without an entry in label_map: {unmapped_tags}")
df["label"] = df["label"].astype("int64")

# First stratified split: hold out 10% of all rows as the test set.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df["content"], df["label"], test_size=0.1, stratify=df["label"], random_state=42
)

# Second stratified split: 10% of the remaining rows become the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.1, stratify=train_val_labels, random_state=42
)

model_name = "racai/distilbert-base-romanian-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The input is materialised into a list first, and every sequence is
    truncated and padded to exactly 512 tokens.
    """
    batch = list(texts)
    return tokenizer(
        batch,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Tokenize the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = (
    tokenize(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with class labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is any iterable with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Copy into a plain list so lookups in __getitem__ are purely positional.
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, then the label under the key "labels".
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# Seed torch before any randomly initialised layer is created, so the
# classification head starts from the same weights on every run. The data
# splits are already pinned with random_state=42; the head was not.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Replace the stock output layer with a small MLP: hidden_size -> 256 -> 2 logits.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 2)
)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model on: {device}")

# Overwritten by compute_metrics on every evaluation pass; read afterwards to
# draw the confusion matrix.
stored_preds = []
stored_labels = []

def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy / precision / recall / F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; if the logits arrive as a
    tuple, its first element is used. Precision, recall and F1 are
    weighted averages. As a side effect the predicted class ids and the labels
    are stored in the module-level ``stored_preds`` / ``stored_labels``.
    """
    global stored_preds, stored_labels

    raw_output, labels = eval_pred
    if isinstance(raw_output, tuple):
        raw_output = raw_output[0]
    preds = np.argmax(raw_output, axis=1)

    # Keep the latest predictions around for plotting after evaluation.
    stored_preds, stored_labels = preds, labels

    scores = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[metric_name] = metric_fn(labels, preds, average="weighted")
    return scores

def plot_confusion_matrix(preds, labels, class_names=["fake_news", "real_news"]):
    """Draw a confusion-matrix heatmap of true vs. predicted class ids.

    Args:
        preds: Predicted class ids.
        labels: True class ids.
        class_names: Display name for each class id, ordered by id (entry 0
            names class 0, entry 1 names class 1). Not modified.
    """
    # Pin the matrix to one row/column per entry of class_names. Without an
    # explicit label list only the classes that occur in the data are included,
    # so a class missing from both arrays would shrink the matrix and put the
    # tick names on the wrong rows and columns.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.show()

# Trainer configuration gathered in one mapping. Evaluation and checkpointing
# both run once per epoch, and the checkpoint with the best validation accuracy
# is restored when training ends.
trainer_settings = dict(
    output_dir="/drive/My Drive/results_distilbert_experiment_one",
    logging_dir="/drive/My Drive/logs_distilbert_experiment_one",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final scores on the held-out test split.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)
# compute_metrics overwrites stored_preds / stored_labels on each evaluation, so
# after the call above they hold the test-set values.
plot_confusion_matrix(stored_preds, stored_labels)

# NOTE(review): model.classifier was replaced by an nn.Sequential head, so a plain
# from_pretrained() on this folder presumably will not restore that head unless
# the same replacement is applied before loading the weights -- verify.
export_dir = "/drive/My Drive/distilbert_experiment_one"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

#EXPERIMENT TWO

# Mount Google Drive: both CSV files are read from it and all outputs are written to it.
drive.mount('/drive')
train_df = pd.read_csv("/drive/My Drive/dataset_experiment_two_training.csv")
test_df  = pd.read_csv("/drive/My Drive/dataset_experiment_two_testing.csv")

# Integer class ids for the two tags: fake_news -> 0, real_news -> 1.
label_map = {"real_news": 1, "fake_news": 0}

# Series.map leaves NaN for every tag that is not a key of label_map. That would
# silently turn the label column into floats and only break much later, during
# training, so each frame is checked right after mapping.
for split_name, frame in (("training", train_df), ("testing", test_df)):
    frame["label"] = frame["tag"].map(label_map)
    unmapped_tags = frame.loc[frame["label"].isna(), "tag"].unique().tolist()
    if unmapped_tags:
        raise ValueError(
            f"Tags without an entry in label_map in the {split_name} file: {unmapped_tags}"
        )
    frame["label"] = frame["label"].astype("int64")

# Stratified split of the training file: 10% of its rows become the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["content"], train_df["label"], test_size=0.1, stratify=train_df["label"], random_state=42
)

# The testing file is used as the test set in full.
test_texts = test_df["content"]
test_labels = test_df["label"]

model_name = "racai/distilbert-base-romanian-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(texts):
    """Run the module-level ``tokenizer`` over ``texts``.

    ``texts`` is copied into a list; each sequence is truncated and padded to
    a fixed length of 512 tokens.
    """
    options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(list(texts), **options)

# Tokenize the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = map(
    tokenize, (train_texts, val_texts, test_texts)
)

class NewsDataset(Dataset):
    """Map-style dataset that serves one tokenized sample plus its label.

    ``encodings`` maps field names to per-sample sequences; ``labels`` is an
    iterable holding one label per sample.
    """

    def __init__(self, encodings, labels):
        # A list copy makes label lookup positional.
        self.labels = list(labels)
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoding fields first, the label last under the key "labels".
        sample = dict(
            (field, torch.tensor(column[idx]))
            for field, column in self.encodings.items()
        )
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset   = NewsDataset(val_encodings, val_labels)
test_dataset  = NewsDataset(test_encodings, test_labels)

# Seed torch before any randomly initialised layer is created, so the
# classification head starts from the same weights on every run. The
# train/validation split is already pinned with random_state=42; the head was not.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Replace the stock output layer with a small MLP: hidden_size -> 256 -> 2 logits.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 2)
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model on: {device}")

# Overwritten by compute_metrics on every evaluation pass; read afterwards to
# draw the confusion matrix.
stored_preds = []
stored_labels = []

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation.

    ``eval_pred`` unpacks into ``(logits, labels)``; a tuple of logits is
    reduced to its first element. The predicted class ids and the labels are
    also written to the module-level ``stored_preds`` / ``stored_labels``.
    """
    global stored_preds, stored_labels

    model_output, labels = eval_pred
    if isinstance(model_output, tuple):
        model_output = model_output[0]
    preds = np.argmax(model_output, axis=1)

    # Remember the latest predictions for the confusion-matrix plot.
    stored_preds = preds
    stored_labels = labels

    weighted = {"average": "weighted"}
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, **weighted)
    recall = recall_score(labels, preds, **weighted)
    f1 = f1_score(labels, preds, **weighted)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

def plot_confusion_matrix(preds, labels, class_names=["fake_news", "real_news"]):
    """Draw a confusion-matrix heatmap of true vs. predicted class ids.

    Args:
        preds: Predicted class ids.
        labels: True class ids.
        class_names: Display name for each class id, ordered by id (entry 0
            names class 0, entry 1 names class 1). Not modified.
    """
    # Pin the matrix to one row/column per entry of class_names. Without an
    # explicit label list only the classes that occur in the data are included,
    # so a class missing from both arrays would shrink the matrix and put the
    # tick names on the wrong rows and columns.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.show()

# Trainer configuration gathered in one mapping. Evaluation and checkpointing
# both run once per epoch, and the checkpoint with the best validation accuracy
# is restored when training ends.
trainer_settings = dict(
    output_dir="/drive/My Drive/results_distilbert_experiment_two",
    logging_dir="/drive/My Drive/logs_distilbert_experiment_two",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final scores on the separate testing file.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)
# compute_metrics overwrites stored_preds / stored_labels on each evaluation, so
# after the call above they hold the test-set values.
plot_confusion_matrix(stored_preds, stored_labels)

# NOTE(review): model.classifier was replaced by an nn.Sequential head, so a plain
# from_pretrained() on this folder presumably will not restore that head unless
# the same replacement is applied before loading the weights -- verify.
export_dir = "/drive/My Drive/distilbert_experiment_two"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

#EXPERIMENT THREE
# Mount Google Drive: the dataset is read from it and all outputs are written to it.
drive.mount('/drive')
df = pd.read_csv("/drive/My Drive/preprocessed_dataset_original.csv")

# Integer class ids for the two tags: fake_news -> 0, real_news -> 1.
label_map = {"real_news": 1, "fake_news": 0}
df["label"] = df["tag"].map(label_map)

# Series.map leaves NaN for every tag that is not a key of label_map. That would
# silently turn the label column into floats and only break much later, during
# training, so stop here and report the offending tag values instead.
unmapped_tags = df.loc[df["label"].isna(), "tag"].unique().tolist()
if unmapped_tags:
    raise ValueError(f"Tags without an entry in label_map: {unmapped_tags}")
df["label"] = df["label"].astype("int64")

# First stratified split: hold out 10% of all rows as the test set.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df["content"], df["label"], test_size=0.1, stratify=df["label"], random_state=42
)

# Second stratified split: 10% of the remaining rows become the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.1, stratify=train_val_labels, random_state=42
)

model_name = "racai/distilbert-base-romanian-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The input is turned into a list, then truncated and padded so that every
    sequence is exactly 512 tokens long.
    """
    text_list = list(texts)
    encoded = tokenizer(text_list, truncation=True, padding="max_length", max_length=512)
    return encoded

# Tokenize the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = [
    tokenize(split_texts) for split_texts in (train_texts, val_texts, test_texts)
]

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer output with class labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is any iterable with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Copy into a plain list so lookups in __getitem__ are purely positional.
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, then the label under the key "labels".
        sample = {}
        for field in self.encodings.keys():
            sample[field] = torch.tensor(self.encodings[field][idx])
        label = self.labels[idx]
        sample["labels"] = torch.tensor(label)
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# Seed torch before any randomly initialised layer is created, so the
# classification head starts from the same weights on every run. The data
# splits are already pinned with random_state=42; the head was not.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Replace the stock output layer with a small MLP: hidden_size -> 256 -> 2 logits.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 2)
)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model on: {device}")

# Overwritten by compute_metrics on every evaluation pass; read afterwards to
# draw the confusion matrix.
stored_preds = []
stored_labels = []

def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy / precision / recall / F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; if the logits arrive as a
    tuple, its first element is used. Precision, recall and F1 are
    weighted averages. As a side effect the predicted class ids and the labels
    are stored in the module-level ``stored_preds`` / ``stored_labels``.
    """
    global stored_preds, stored_labels

    raw_output, labels = eval_pred
    if isinstance(raw_output, tuple):
        raw_output = raw_output[0]
    preds = np.argmax(raw_output, axis=1)

    # Keep the latest predictions around for plotting after evaluation.
    stored_preds, stored_labels = preds, labels

    scores = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[metric_name] = metric_fn(labels, preds, average="weighted")
    return scores

def plot_confusion_matrix(preds, labels, class_names=["fake_news", "real_news"]):
    """Draw a confusion-matrix heatmap of true vs. predicted class ids.

    Args:
        preds: Predicted class ids.
        labels: True class ids.
        class_names: Display name for each class id, ordered by id (entry 0
            names class 0, entry 1 names class 1). Not modified.
    """
    # Pin the matrix to one row/column per entry of class_names. Without an
    # explicit label list only the classes that occur in the data are included,
    # so a class missing from both arrays would shrink the matrix and put the
    # tick names on the wrong rows and columns.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.show()

# Trainer configuration gathered in one mapping. Evaluation and checkpointing
# both run once per epoch, and the checkpoint with the best validation accuracy
# is restored when training ends.
trainer_settings = dict(
    output_dir="/drive/My Drive/results_distilbert_experiment_three",
    logging_dir="/drive/My Drive/logs_distilbert_experiment_three",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final scores on the held-out test split.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)
# compute_metrics overwrites stored_preds / stored_labels on each evaluation, so
# after the call above they hold the test-set values.
plot_confusion_matrix(stored_preds, stored_labels)

# NOTE(review): model.classifier was replaced by an nn.Sequential head, so a plain
# from_pretrained() on this folder presumably will not restore that head unless
# the same replacement is applied before loading the weights -- verify.
export_dir = "/drive/My Drive/distilbert_experiment_three"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from google.colab import drive
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

#EXPERIMENT FOUR
# Drive is mounted for the output folders; the dataset itself is read from the
# Hugging Face hub path below.
drive.mount('/drive')
df = pd.read_json("hf://datasets/mihalca/FakeRO_updated/combined_balanced.json")

# Integer class ids for the five tags.
label_map = {"real_news": 1, "fake_news": 0, "propaganda": 2, "satire": 3, "misinformation": 4}
df["label"] = df["tag"].map(label_map)

# Series.map leaves NaN for every tag that is not a key of label_map. That would
# silently turn the label column into floats and only break much later, during
# training, so stop here and report the offending tag values instead.
unmapped_tags = df.loc[df["label"].isna(), "tag"].unique().tolist()
if unmapped_tags:
    raise ValueError(f"Tags without an entry in label_map: {unmapped_tags}")
df["label"] = df["label"].astype("int64")

# First stratified split: 80% of the rows for training, 20% held back.
train_texts, test_val_texts, train_labels, test_val_labels = train_test_split(
    df["content"], df["label"], test_size=0.2, stratify=df["label"], random_state=42
)

# Second stratified split: the held-back rows are halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_val_texts, test_val_labels, test_size=0.5, stratify=test_val_labels, random_state=42
)


model_name = "racai/distilbert-base-romanian-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    The input is materialised into a list first, and every sequence is
    truncated and padded to exactly 512 tokens.
    """
    batch = list(texts)
    return tokenizer(
        batch,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Tokenize the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = (
    tokenize(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

class NewsDataset(Dataset):
    """Map-style dataset that serves one tokenized sample plus its label.

    ``encodings`` maps field names to per-sample sequences; ``labels`` is an
    iterable holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # A list copy makes label lookup positional.
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoding fields first, the label last under the key "labels".
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

# Seed torch before any randomly initialised layer is created, so the
# classification head starts from the same weights on every run. The data
# splits are already pinned with random_state=42; the head was not.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Replace the stock output layer with a small MLP: hidden_size -> 256 -> 5 logits.
model.classifier = nn.Sequential(
    nn.Linear(model.config.hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 5)
)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model on: {device}")

# Overwritten by compute_metrics on every evaluation pass; read afterwards to
# draw the confusion matrix.
stored_preds = []
stored_labels = []

def compute_metrics(eval_pred):
    """Compute accuracy and macro precision / recall / F1 for one evaluation.

    ``eval_pred`` unpacks into ``(logits, labels)``; a tuple of logits is
    reduced to its first element. The predicted class ids and the labels are
    also written to the module-level ``stored_preds`` / ``stored_labels``.
    """
    global stored_preds, stored_labels

    raw_output, labels = eval_pred
    if isinstance(raw_output, tuple):
        raw_output = raw_output[0]
    preds = np.argmax(raw_output, axis=1)

    # Keep the latest predictions around for plotting after evaluation.
    stored_preds, stored_labels = preds, labels

    scores = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[metric_name] = metric_fn(labels, preds, average="macro")
    return scores

def plot_confusion_matrix(preds, labels, class_names=["fake_news", "real_news", "propaganda", "satire", "misinformation"]):
    """Draw a confusion-matrix heatmap of true vs. predicted class ids.

    Args:
        preds: Predicted class ids.
        labels: True class ids.
        class_names: Display name for each class id, ordered by id. The
            default follows this experiment's label_map (fake_news=0,
            real_news=1, propaganda=2, satire=3, misinformation=4); an
            alphabetical list would attach the wrong name to four of the five
            rows and columns. Not modified.
    """
    # Matrix rows/columns are ordered by numeric class id. Pinning the id list
    # to class_names also keeps the matrix at full size when a class is missing
    # from both arrays, so tick names can never shift.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    plt.show()

# Trainer configuration gathered in one mapping. Evaluation and checkpointing
# both run once per epoch, and the checkpoint with the best validation accuracy
# is restored when training ends.
trainer_settings = dict(
    output_dir="/drive/My Drive/results_distilbert_experiment_four",
    logging_dir="/drive/My Drive/logs_distilbert_experiment_four",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.0,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final scores on the held-out test split.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)
# compute_metrics overwrites stored_preds / stored_labels on each evaluation, so
# after the call above they hold the test-set values.
plot_confusion_matrix(stored_preds, stored_labels)

# NOTE(review): model.classifier was replaced by an nn.Sequential head, so a plain
# from_pretrained() on this folder presumably will not restore that head unless
# the same replacement is applied before loading the weights -- verify.
export_dir = "/drive/My Drive/distilbert_experiment_four"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)
