In [1]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime

# Seed every RNG the notebook relies on. NumPy's seed alone does not affect
# torch, and the classification head is randomly initialised by torch when the
# pretrained checkpoint is loaded, so torch must be seeded before that happens.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
import os
# Switch off Weights & Biases reporting for every Trainer created below.
# NOTE(review): the training log later in this notebook reports this variable
# as deprecated in favour of the `report_to` argument — consider migrating.
os.environ.update({"WANDB_DISABLED": "true"})


In [4]:
df = pd.read_csv("FakeNewsNet_combined.csv", header = 0)

# Keep only the text before the first space of each raw date string
# (drops any trailing time component).
df['date'] = df['date'].str.split(' ').str[0]

# Parse to real datetimes BEFORE filtering. Comparing the raw strings against
# "2018-12-31" is a lexicographic comparison and is only correct when every
# value is a zero-padded YYYY-MM-DD string; unparseable values become NaT and
# are dropped.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Excluding dates after 2018
df = df[df['date'] <= pd.Timestamp("2018-12-31")].copy()

# Later cells compare this column against `split_date`, so store plain
# datetime.date objects exactly as before.
df['date'] = df['date'].dt.date

# Find split date: the 80th percentile of the dates, used as the temporal
# boundary between baseline data and newer data.
split_date = df['date'].quantile(0.8)
print("Split Date: ", split_date)

def filter_by_date_range(df, start_date, end_date):
    """Return the rows of `df` whose 'date' lies in [start_date, end_date].

    Both bounds are inclusive. The 'date' column and the bounds are normalised
    to pandas datetimes before comparing, so the column may hold
    `datetime.date` objects, datetime64 values or date strings, and the bounds
    may be strings such as "2017-01-01", `datetime.date` objects or Timestamps.
    Comparing a column of `datetime.date` objects directly against string
    bounds raises TypeError, which is why the normalisation is needed.

    Args:
        df: DataFrame with a 'date' column.
        start_date: Lower bound (inclusive).
        end_date: Upper bound (inclusive).

    Returns:
        The subset of `df` (original columns and index labels) inside the
        range. Rows whose date cannot be parsed are excluded.
    """
    dates = pd.to_datetime(df['date'], errors='coerce')
    lower = pd.Timestamp(start_date)
    upper = pd.Timestamp(end_date)
    # NaT compares False on both sides, so unparseable rows drop out here.
    return df[(dates >= lower) & (dates <= upper)]


# start_date = "2017-01-01"
# end_date = "2018-12-31"
# filtered_df = filter_by_date_range(df, start_date, end_date)


# label_map = {1: 1, 0: 0}
# filtered_df['label'] = filtered_df['label'].map(label_map)

# value_counts = filtered_df['label'].value_counts()
# print(value_counts)

# train_texts, test_texts, train_labels, test_labels = train_test_split(
#     filtered_df['title'].tolist(),
#     filtered_df['label'].tolist(),
#     test_size=0.2,
#     random_state=42
# )


Split Date:  2018-06-04


In [5]:
# Temporal split around `split_date`:
#   - rows dated on or before it form the baseline training set
#   - rows dated strictly after it are the "newer" data
on_or_before_split = df['date'] <= split_date
baseline_train = df.loc[on_or_before_split].copy()

after_split = df['date'] > split_date
newer_data = df.loc[after_split].copy()

print("Baseline data:", len(baseline_train))
print("Newer data:", len(newer_data))

# Halve the newer data at random (fixed random_state) into two disjoint parts:
#   - cl_data: feeds the continual-learning update of the secondary model
#   - holdout_data: kept aside for the final evaluation
cl_data, holdout_data = train_test_split(
    newer_data,
    random_state=42,
    test_size=0.5,
)
print("Update data:", len(cl_data))
print("test data:", len(holdout_data))


Baseline data: 8460
Newer data: 2099
Update data: 1049
test data: 1050


In [6]:
# Apply the same label mapping to the full frame and to every split.
# Any label value that is not a key of `label_map` becomes NaN after `.map`.
label_map = {0: 0, 1: 1}
for frame in (df, baseline_train, newer_data, cl_data, holdout_data):
    frame['label'] = frame['label'].map(label_map)

# Plain Python lists of titles and labels: baseline rows for training,
# holdout rows for evaluation.
train_texts, train_labels = (baseline_train[column].tolist() for column in ('title', 'label'))
eval_texts, eval_labels = (holdout_data[column].tolist() for column in ('title', 'label'))

# Class balance of each period.
print("Baseline distribution:")
print(baseline_train['label'].value_counts())
print("Newer data dist:")
print(newer_data['label'].value_counts())

Baseline distribution:
label
1    6145
0    2315
Name: count, dtype: int64
Newer data dist:
label
1    1607
0     492
Name: count, dtype: int64


In [7]:
# Load the BERT tokenizer that matches the checkpoint fine-tuned below.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenization function
def tokenize_function(texts, max_length=128):
    """Tokenize `texts` into fixed-length inputs with the notebook's tokenizer.

    Args:
        texts: The text(s) to encode; passed straight to the module-level
            `tokenizer`.
        max_length: Every sequence is truncated and padded to exactly this
            many tokens. Defaults to 128, the value this notebook has always
            used, so existing calls behave the same.

    Returns:
        The tokenizer's output; the notebook reads its "input_ids" and
        "attention_mask" entries.
    """
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

train_encodings = tokenize_function(train_texts)
eval_encodings = tokenize_function(eval_texts)


def _as_dataset(encodings, labels):
    """Pack token ids, attention masks and labels into a Hugging Face Dataset."""
    columns = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = labels
    return Dataset.from_dict(columns)


# Convert to Hugging Face Dataset format
train_dataset = _as_dataset(train_encodings, train_labels)
test_dataset = _as_dataset(eval_encodings, eval_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Pretrained BERT encoder with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Hyper-parameters of the baseline run: evaluate and checkpoint once per epoch.
# NOTE(review): the log for this cell reports WANDB_DISABLED as deprecated in
# favour of `report_to`; decide which integrations should stay on before
# migrating.
baseline_run_config = dict(
    output_dir="./bert-fake-news",
    run_name="initialTestRun_BASE_BERT",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
training_args = TrainingArguments(**baseline_run_config)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metrics dict the Trainer logs.

    The predicted class of each example is the index of its largest logit
    along the last axis; the result holds the accuracy of those predictions
    under the key "accuracy".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# Baseline fine-tuning: train on the pre-split data and score on the holdout
# dataset at the interval configured in `training_args`.
baseline_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**baseline_trainer_kwargs)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4557,0.439308,0.800952
2,0.4367,0.543893,0.840952
3,0.1927,0.764589,0.814286


TrainOutput(global_step=3174, training_loss=0.33278883815224847, metrics={'train_runtime': 728.5392, 'train_samples_per_second': 34.837, 'train_steps_per_second': 4.357, 'total_flos': 1669439646259200.0, 'train_loss': 0.33278883815224847, 'epoch': 3.0})

In [10]:

# Score the baseline model on the holdout set in a single inference pass.
# `predict` returns both the logits and the metrics, so calling `evaluate()`
# first only repeats the same forward pass over `test_dataset` (which is also
# this trainer's eval_dataset). `metric_key_prefix="eval"` keeps the metric
# names unchanged, e.g. 'eval_accuracy'.
baseline_output = trainer.predict(test_dataset, metric_key_prefix="eval")
baseline_results = baseline_output.metrics

print(f"Baseline Test Accuracy: {baseline_results['eval_accuracy']:.4f}")


# Predicted class = index of the largest logit for each example.
baseline_predictions = baseline_output.predictions
baseline_pred_labels = np.argmax(baseline_predictions, axis=1)

# Per-class precision/recall/F1; label 0 is reported as "Fake", 1 as "Real".
print(classification_report(eval_labels, baseline_pred_labels, target_names=["Fake", "Real"]))


Baseline Test Accuracy: 0.8143
              precision    recall  f1-score   support

        Fake       0.62      0.62      0.62       256
        Real       0.88      0.88      0.88       794

    accuracy                           0.81      1050
   macro avg       0.75      0.75      0.75      1050
weighted avg       0.81      0.81      0.81      1050



In [11]:
from transformers import BertForSequenceClassification

# Write the baseline model and its tokenizer into one directory, before the
# same `model` object is trained further in the continual-learning section.
baseline_dir = "fine_tuned_bert"
model.save_pretrained(baseline_dir)
tokenizer.save_pretrained(baseline_dir)


('fine_tuned_bert/tokenizer_config.json',
 'fine_tuned_bert/special_tokens_map.json',
 'fine_tuned_bert/vocab.txt',
 'fine_tuned_bert/added_tokens.json')

In [12]:
import shutil
# Zip the saved baseline directory; the archive path is the cell's output.
shutil.make_archive(base_name='fine_tuned_bert_initial', format='zip', root_dir="fine_tuned_bert")

'/content/fine_tuned_bert_initial.zip'

Secondary Model Training and Evaluation


In [13]:
# Titles and labels of the continual-learning (update) split as plain lists.
cl_texts, cl_labels = (cl_data[column].tolist() for column in ('title', 'label'))

cl_encodings = tokenize_function(cl_texts)

# Same three columns as the baseline datasets: token ids, attention mask, labels.
cl_columns = {key: cl_encodings[key] for key in ("input_ids", "attention_mask")}
cl_columns["labels"] = cl_labels
cl_dataset = Dataset.from_dict(cl_columns)


In [14]:
# Hyper-parameters of the continual-learning update. Compared with the
# baseline run this uses its own output_dir and run_name and 2 epochs instead
# of 3; every other value is the same.
cl_run_config = dict(
    output_dir="./bert-fake-news-continual",
    run_name="continualLearningUpdate",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
cl_training_args = TrainingArguments(**cl_run_config)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
# Continual-learning update: keep training the SAME `model` object on the
# update split and score it on the same holdout dataset as the baseline.
# From this point `model` no longer holds the baseline weights in memory.
cl_trainer_kwargs = {
    "model": model,  # continue training same model
    "args": cl_training_args,
    "train_dataset": cl_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
cl_trainer = Trainer(**cl_trainer_kwargs)

cl_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3775,0.386201,0.835238
2,0.2601,0.523178,0.835238


TrainOutput(global_step=264, training_loss=0.31415867195888, metrics={'train_runtime': 92.9182, 'train_samples_per_second': 22.579, 'train_steps_per_second': 2.841, 'total_flos': 138001748536320.0, 'train_loss': 0.31415867195888, 'epoch': 2.0})

In [17]:
# Evaluate updated model on the holdout set in a single inference pass.
# `predict` yields both logits and metrics, so a separate `evaluate()` call
# would only repeat the same forward pass over `test_dataset` (which is also
# this trainer's eval_dataset). `metric_key_prefix="eval"` keeps the metric
# names unchanged, e.g. 'eval_accuracy'.
updated_output = cl_trainer.predict(test_dataset, metric_key_prefix="eval")
updated_results = updated_output.metrics
print(f"Updated Model Accuracy: {updated_results['eval_accuracy']:.4f}")

# Predicted class = index of the largest logit for each example.
updated_predictions = updated_output.predictions
updated_pred_labels = np.argmax(updated_predictions, axis=1)
print(classification_report(eval_labels, updated_pred_labels, target_names=["Fake", "Real"]))

Updated Model Accuracy: 0.8352
              precision    recall  f1-score   support

        Fake       0.71      0.55      0.62       256
        Real       0.86      0.93      0.89       794

    accuracy                           0.84      1050
   macro avg       0.79      0.74      0.76      1050
weighted avg       0.83      0.84      0.83      1050



In [18]:
# Save the continually-updated model and its tokenizer side by side, then zip
# the directory; the archive path is the cell's output.
continual_dir = "fine_tuned_bert_continual"
for artifact in (model, tokenizer):
    artifact.save_pretrained(continual_dir)

shutil.make_archive(base_name='fine_tuned_bert_continual', format='zip', root_dir=continual_dir)

'/content/fine_tuned_bert_continual.zip'