## Imports

Here import all crucial packages etc.

In [None]:
import json
import os
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score
import torch
from transformers import EvalPrediction, pipeline
from sklearn.metrics import precision_recall_fscore_support

## Utils

Helper functions that you will use

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:
class DisinformationDataset(torch.utils.data.Dataset):
    """
    This class wraps our tokenized data and labels so PyTorch can easily loop through them during training. It converts each input into tensors and returns them with the label — all in the format the model expects.
    """
    # When we create an instance of dataset, we pass in encodings and labels
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # This method tells PyTorch how to get one item (input + label).
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    # Returns how many examples are in the dataset (needed by DataLoader).
    def __len__(self):
        return len(self.labels)


def load_and_process_data(file_path: str, label_column: str = "label") -> pd.DataFrame:
    """
    Loads the data from a CSV file and processes the labels.
    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
        text_column (str): The column name containing the text content.
    Returns:
        pd.DataFrame: Processed dataframe with labels and text content.
    """
    data = pd.read_csv(file_path, encoding='utf-8')
    data[label_column] = data[label_column].apply(lambda x: 1 if "fake" in x.lower() else 0)
    return data


def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics.
    """
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w') as output_file:
        json.dump(metrics, output_file, indent=4)

In [None]:
def compute_metrics(pred=None, y_true=None, y_pred=None):
    """
    Computes F1 scores (micro, macro, weighted) for both training and testing data.

    If `pred` is provided, it computes metrics for the trainer using `EvalPrediction`.
    If `y_true` and `y_pred` are provided, it computes metrics for test data predictions.

    Parameters:
        - pred (EvalPrediction, optional): The evaluation prediction object for Trainer.
        - y_true (list, optional): The ground truth labels for the test data.
        - y_pred (list, optional): The predicted labels for the test data.

    Returns:
        - dict: A dictionary containing F1 metrics.
    """
    if pred is not None:
        # When working with the Trainer, pred is an EvalPrediction object
        labels = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
    elif y_true is not None and y_pred is not None:
        # If y_true and y_pred are provided, use them for test evaluation
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

        # Compute F1 scores
    #f1 = f1_score(y_true=labels, y_pred=y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, y_pred, average='macro')

    return {
        'f1': f1,
        'recall' : recall,
        'precision' : precision
    }

def compute_metrics_for_trainer(pred: EvalPrediction):
    return compute_metrics(pred=pred)

# Assignment

# Fine-Tuning BERT Model to Fake News detection

## Import Train, Validation and Test data

Import all datasets and load and preprocess train and validation

Link to direcotry with data: https://github.com/ArkadiusDS/NLP-Labs/tree/master/data/CoAID/

In [None]:
fake_test = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/refs/heads/master/data/CoAID/test.csv'
fake_train = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/refs/heads/master/data/CoAID/train.csv'
fake_valid = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/refs/heads/master/data/CoAID/validation.csv'

!wget -O test.csv {fake_test}
!wget -O train.csv {fake_train}
!wget -O validation.csv {fake_valid}

--2025-05-15 13:07:48--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/refs/heads/master/data/CoAID/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221757 (217K) [text/plain]
Saving to: ‘test.csv’


2025-05-15 13:07:48 (66.1 MB/s) - ‘test.csv’ saved [221757/221757]

--2025-05-15 13:07:48--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/refs/heads/master/data/CoAID/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1556530 (1.5M) [text/plain]
Saving to: ‘train.csv’


2025-05-15 13:07:48 (134 MB/s) - 

In [None]:
# Load and process the training data
train_data = load_and_process_data('train.csv')

# Load and process the validation data
validation_data = load_and_process_data('validation.csv')

## Load model and tokenizer

Firstly create two dicts id2label and label2id and then load model and tokenizer
Then use well-known distilled version of BERT model for faster fine-tuning: 'distilbert/distilbert-base-uncased' or any other model you wish.

In [None]:
id2label = {0: "Credible", 1: "Fake"}
label2id = {"Credible": 0, "Fake": 1}

model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Tokenize datasets and prepare it for fine-tuning

You may use DisinformationDataset class for data preparation.

In [None]:
train_encodings = tokenizer(
        train_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

val_encodings = tokenizer(
        validation_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

In [None]:
train_dataset = DisinformationDataset(train_encodings, train_data['label'].tolist())

val_dataset = DisinformationDataset(val_encodings, validation_data['label'].tolist())

## Fine-tune BERT model on at least 3 sets of hyperparameters

Check F1 score, precision and recall for each fine-tuned model and at the end choose set of hyperparameters that gives you best results. For each set of hyperparameters write down the final metrics. You need to acheive at least below result on validation dataset:

"f1": 0.91,
"recall": 0.91,
"precision": 0.91

Remember you need to achieve these minimum results on VALIDATION dataset and the best model on validation dataset will have to be used for predictions on test dataset.


In [None]:
training_args = TrainingArguments(
    output_dir='output/training/',
    eval_strategy='steps',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    warmup_ratio=0.06,
    weight_decay=0.001,
    fp16=True,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=2,
    greater_is_better=True,
    save_strategy='steps',
    eval_steps=100,
    save_on_each_node=True,
    report_to=[]
)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer
    )

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,F1,Recall,Precision
100,No log,0.29406,0.894654,0.853031,0.961529
200,No log,0.070984,0.968766,0.960278,0.977886
300,No log,0.097952,0.972794,0.979649,0.966315
400,No log,0.138049,0.966636,0.950592,0.985057
500,0.030700,0.104932,0.971889,0.963344,0.981069
600,0.030700,0.164254,0.966506,0.94878,0.987164
700,0.030700,0.156984,0.968163,0.95122,0.98776
800,0.030700,0.174802,0.964843,0.946341,0.986569
900,0.030700,0.175986,0.964843,0.946341,0.986569
1000,0.000700,0.175299,0.964843,0.946341,0.986569


TrainOutput(global_step=1100, training_loss=0.014248377861328085, metrics={'train_runtime': 290.6967, 'train_samples_per_second': 60.321, 'train_steps_per_second': 3.784, 'total_flos': 2306826177868800.0, 'train_loss': 0.014248377861328085, 'epoch': 5.0})

In [None]:
model_saved_path='output/experiment_2/'
trainer.save_model(model_saved_path)
tokenizer.save_pretrained(model_saved_path)

('output/experiment_2/tokenizer_config.json',
 'output/experiment_2/special_tokens_map.json',
 'output/experiment_2/vocab.txt',
 'output/experiment_2/added_tokens.json',
 'output/experiment_2/tokenizer.json')

In [None]:
test_data = load_and_process_data('validation.csv')
model_path = 'output/experiment_2/'
classifier = pipeline(
    task="text-classification",
    model= model_path,
    tokenizer= model_path,
    device=0,
    truncation=True,
    padding=True,
    max_length=256
)

results = classifier(test_data["content"].tolist(), batch_size=32)

test_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]

evaluation_results = compute_metrics(y_true=test_data["label"], y_pred=test_data["predictions"])
output_file_path = "output/experiment_2/results_2.json"
save_metrics_to_json(evaluation_results, output_file_path)

Device set to use cuda:0


## Final prediction on test dataset

Take best model and hyperparameters on validation and predict on test dataset. Compute evaluation metrics f1, precision and recall.

In [None]:
test_data = load_and_process_data('test.csv')

In [None]:
best_model_path = 'output/experiment_0/'

In [None]:
classifier = pipeline(
    task="text-classification",
    model=best_model_path,
    tokenizer=best_model_path,
    device=0,
    truncation=True,
    padding=True,
    max_length=256
)

results = classifier(test_data["content"].tolist(), batch_size=32)

test_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]

Device set to use cuda:0


In [None]:
evaluation_results = compute_metrics(y_true=test_data["label"], y_pred=test_data["predictions"])
output_file_path = "metrics/results.json"
save_metrics_to_json(evaluation_results, output_file_path)

# Final file with results and description

In [1]:
import json

All keys in your dictionary have to be the same as below. The only changes you should do in terms of keys is changing names of hyperparameters, e.g. instead of key "name_of_hyperparameter_0" if you used learning rate then write "learning_rate". Other important information in the dictionary below and comments. Each value says what is expected.

Example dictionary provided under the template.

Template for your structured resulting file

In [2]:
data = {
    # Everything in experiment_0 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_0": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "0.00001",
            "per_device_train_batch_size": "16",
            "per_device_eval_batch_size": "16",
            "weight_decay": "0.1"
        },
        "f1_score": "0.9782982673267326",
        "precision": "0.982372309868145",
        "recall": "0.9731003458089788",
        "description": "Experiment using a low learning rate and standard batch size. High F1 and balanced precision/recall indicate strong generalization on the validation set."
    },
    # Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_1": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "5e-5",
            "per_device_train_batch_size": "32",
            "per_device_eval_batch_size": "32",
            "weight_decay": "0.001"
        },
        "f1": "0.9730952753256477",
        "recall": "0.9585365853658536",
        "precision": "0.9895577395577395",
        "description": "Increased learning rate and batch size. Achieved very high precision, but slightly lower recall suggests more false negatives."
    },
    # Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_2": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "5e-5",
            "per_device_train_batch_size": "16",
            "per_device_eval_batch_size": "16",
            "weight_decay": "0.001"
        },
        "f1": "0.9648425538430132",
        "recall": "0.9463414634146341",
        "precision": "0.9865689865689866",
        "description": "Same learning rate as experiment_1 but smaller batch size. Slightly lower F1 suggests this configuration less balanced compared to other setups."
    },
    # Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc.
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": "0.00001",
            "per_device_train_batch_size": "16",
            "per_device_eval_batch_size": "16",
            "weight_decay": "0.1"
        },
        "f1": "0.9696801314263626",
        "recall": "0.9730637272793634",
        "precision": "0.9663907880532565",
        "description": "Final evaluation on the test set using the best-performing configuration from experiment_0. Maintains high F1 and balanced performance, confirming strong generalization."
    }
}


In [3]:
with open("experiments_Davide_Volpi_2140728.json", "w") as f:
    json.dump(data, f, indent=4)

## Example final file

In [None]:
data = {
    "experiment_0": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "This experiment fine-tuned the google-bert/bert-base-uncased model for binary classification using a learning rate of 1e-5 and a warmup ratio of 0.06. The model achieved an F1-score of 0.76, with a strong recall of 0.85, indicating high sensitivity to positive cases. Precision was moderate at 0.65, suggesting some trade-off in false positives. The setup demonstrates effective recall-oriented performance in identifying relevant instances."
    },
    "experiment_1": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description two of the approach - it has to be different for each experiment. Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "experiment_2": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "num_train_epochs": "int",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description three of the approach - it has to be different for each experiment. Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset. Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc."
    }
}

In [None]:
with open("experiments_Arkadiusz_Modzelewski_29580.json", "w") as f:
    json.dump(data, f, indent=4)