In [1]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime

np.random.seed(42)

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
!pip install -qU faiss-cpu sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [4]:
df = pd.read_csv("Liar2_combined.csv", header = 0)

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.dropna(subset=['date'])

print(df.head())


   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12


In [5]:
#Defining our Date Ranges
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'

# Baseline training set: entries w/ date <= split_date
baseline_df = df[(df['date'] >= baseline_start) & (df['date'] <= baseline_end)].copy()
update1_df = df[(df['date'] >= update1_start) & (df['date'] <= update1_end)].copy()
update2_df = df[(df['date'] >= update2_start) & (df['date'] <= update2_end)].copy()
update3_df = df[(df['date'] >= update3_start) & (df['date'] <= update3_end)].copy()
update4_df = df[(df['date'] >= update4_start) & (df['date'] <= update4_end)].copy()
test_df = df[(df['date'] >= test_start) & (df['date'] <= test_end)].copy()

# Display sample sizes for each block
print("Baseline samples:", len(baseline_df))
print("Update 1 samples:", len(update1_df))
print("Update 2 samples:", len(update2_df))
print("Update 3 samples:", len(update3_df))
print("Update 4 samples:", len(update4_df))
print("Test samples:", len(test_df))

Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807


In [6]:
print("Baseline distribution:")
print(baseline_df['label'].value_counts())

Baseline distribution:
label
1    6147
0    4785
Name: count, dtype: int64


In [7]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
#Baseline Processing

baseline_texts = baseline_df['title'].tolist()
baseline_labels = baseline_df['label'].tolist()


baseline_encodings = tokenize_function(baseline_texts)

# Convert to Hugging Face Dataset format
baseline_dataset = Dataset.from_dict({
    "input_ids": baseline_encodings["input_ids"],
    "attention_mask": baseline_encodings["attention_mask"],
    "labels": baseline_labels,
})


In [9]:
# Test Dataset Processing

test_texts = test_df['title'].tolist()
test_labels = test_df['label'].tolist()


test_encodings = tokenize_function(test_texts)

# Convert to Hugging Face Dataset format
test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": test_labels,
})


In [None]:
# Trainining Baseline Model

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./bert_baseline",
    run_name="baseline_training",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Training

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=baseline_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6847,0.55747,0.812887
2,0.5587,0.461265,0.790582
3,0.4562,0.510776,0.76456


TrainOutput(global_step=4101, training_loss=0.5561170930747316, metrics={'train_runtime': 907.9283, 'train_samples_per_second': 36.122, 'train_steps_per_second': 4.517, 'total_flos': 2157247542896640.0, 'train_loss': 0.5561170930747316, 'epoch': 3.0})

In [None]:
# Evaluate Baseline Model
baseline_results = trainer.evaluate()
print(f"Baseline Test Accuracy: {baseline_results['eval_accuracy']:.4f}")

baseline_predictions = trainer.predict(test_dataset).predictions
baseline_pred_labels = np.argmax(baseline_predictions, axis=1)

print(classification_report(test_labels, baseline_pred_labels, target_names=["Fake", "Real"]))


Baseline Test Accuracy: 0.7646
              precision    recall  f1-score   support

        Fake       0.95      0.77      0.85       705
        Real       0.32      0.75      0.44       102

    accuracy                           0.76       807
   macro avg       0.64      0.76      0.65       807
weighted avg       0.87      0.76      0.80       807



In [None]:
from transformers import BertForSequenceClassification

model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")


('fine_tuned_bert/tokenizer_config.json',
 'fine_tuned_bert/special_tokens_map.json',
 'fine_tuned_bert/vocab.txt',
 'fine_tuned_bert/added_tokens.json')

In [None]:
import shutil
shutil.make_archive('fine_tuned_bert_initial', 'zip', "fine_tuned_bert")

'/content/fine_tuned_bert_initial.zip'

# Continual Model Training and Evaluation

In [None]:
def prepare_update_dataset(df_subset):
    texts = df_subset['title'].tolist()
    labels = df_subset['label'].tolist()
    encodings = tokenize_function(texts)
    dataset = Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels,
    })
    return dataset

In [None]:
# Training args for continual updates

update_training_args = TrainingArguments(
    output_dir="./bert_continual",
    run_name="continual_update",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Function to fine-tune the model on a given update dataset

def fine_tune_on_update(model, update_dataset, update_name):
    print(f"\n--- Fine-tuning on {update_name} ---")

    update_trainer = Trainer(
        model=model,
        args=update_training_args,
        train_dataset=update_dataset,
        eval_dataset=test_dataset,  # Evaluate on the unified test set
        compute_metrics=compute_metrics,
    )

    update_trainer.train()


    results = update_trainer.evaluate()
    print(f"{update_name} - Test Accuracy: {results['eval_accuracy']:.4f}")

    preds = update_trainer.predict(test_dataset).predictions
    pred_labels = np.argmax(preds, axis=1)
    print(classification_report(test_labels, pred_labels, target_names=["Fake", "Real"]))

    # Save model
    model_save_path = f"fine_tuned_bert_{update_name.replace(' ', '_').lower()}"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    shutil.make_archive(model_save_path, 'zip', model_save_path)

    return model


In [None]:
# Updates

update1_dataset = prepare_update_dataset(update1_df)
update2_dataset = prepare_update_dataset(update2_df)
update3_dataset = prepare_update_dataset(update3_df)
update4_dataset = prepare_update_dataset(update4_df)

# Sequentially fine-tune
model = fine_tune_on_update(model, update1_dataset, "Update 1 (2016-2017)")
model = fine_tune_on_update(model, update2_dataset, "Update 2 (2018-2019)")
model = fine_tune_on_update(model, update3_dataset, "Update 3 (2020-2021)")
model = fine_tune_on_update(model, update4_dataset, "Update 4 (2022)")


--- Fine-tuning on Update 1 (2016-2017) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6585,0.383663,0.815366
2,0.5723,0.390861,0.811648


Update 1 (2016-2017) - Test Accuracy: 0.8116
              precision    recall  f1-score   support

        Fake       0.96      0.81      0.88       705
        Real       0.38      0.79      0.52       102

    accuracy                           0.81       807
   macro avg       0.67      0.80      0.70       807
weighted avg       0.89      0.81      0.84       807


--- Fine-tuning on Update 2 (2018-2019) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6182,0.285984,0.889715
2,0.4757,0.295047,0.864932


Update 2 (2018-2019) - Test Accuracy: 0.8649
              precision    recall  f1-score   support

        Fake       0.95      0.89      0.92       705
        Real       0.48      0.67      0.56       102

    accuracy                           0.86       807
   macro avg       0.71      0.78      0.74       807
weighted avg       0.89      0.86      0.87       807


--- Fine-tuning on Update 3 (2020-2021) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.451,0.275917,0.876084
2,0.2973,0.280066,0.89715


Update 3 (2020-2021) - Test Accuracy: 0.8971
              precision    recall  f1-score   support

        Fake       0.94      0.94      0.94       705
        Real       0.60      0.58      0.59       102

    accuracy                           0.90       807
   macro avg       0.77      0.76      0.76       807
weighted avg       0.90      0.90      0.90       807


--- Fine-tuning on Update 4 (2022) ---


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4247,0.228538,0.903346
2,0.1609,0.310626,0.908302


Update 4 (2022) - Test Accuracy: 0.9083
              precision    recall  f1-score   support

        Fake       0.93      0.96      0.95       705
        Real       0.68      0.53      0.59       102

    accuracy                           0.91       807
   macro avg       0.80      0.75      0.77       807
weighted avg       0.90      0.91      0.90       807



In [None]:
# Final eval on the test set

final_results = trainer.evaluate()
print(f"\nFinal Updated Model Test Accuracy: {final_results['eval_accuracy']:.4f}")
final_preds = trainer.predict(test_dataset).predictions
final_pred_labels = np.argmax(final_preds, axis=1)
print(classification_report(test_labels, final_pred_labels, target_names=["Fake", "Real"]))


model.save_pretrained("fine_tuned_bert_continual")
tokenizer.save_pretrained("fine_tuned_bert_continual")



Final Updated Model Test Accuracy: 0.9083
              precision    recall  f1-score   support

        Fake       0.93      0.96      0.95       705
        Real       0.68      0.53      0.59       102

    accuracy                           0.91       807
   macro avg       0.80      0.75      0.77       807
weighted avg       0.90      0.91      0.90       807



('fine_tuned_bert_continual/tokenizer_config.json',
 'fine_tuned_bert_continual/special_tokens_map.json',
 'fine_tuned_bert_continual/vocab.txt',
 'fine_tuned_bert_continual/added_tokens.json')

In [10]:
import faiss
import json
from datetime import datetime

index = faiss.read_index('faiss_index.index')

file_path = 'News_Category_Dataset_v3.json'

articles = []
headlines = []
metadata = []
with open(file_path, 'r') as f:
    for line in f:
        art = json.loads(line)
        articles.append(art)

        headline = art['headline'].strip().lower()
        headlines.append(headline)
        # Convert the date to a datetime object for further filtering if needed.
        art_date = art.get('date', None)
        if art_date:
            try:
                art_date = datetime.strptime(art_date, '%Y-%m-%d')
            except Exception as e:
                print(f"Error parsing date for article: {art_date} | {e}")
        metadata.append({
            'link': art.get('link', ''),
            'date': art_date,
            'category': art.get('category', ''),
            'short_description': art.get('short_description', ''),
            'authors': art.get('authors', '')
        })


In [11]:
from sentence_transformers import SentenceTransformer

search_model = SentenceTransformer("all-MiniLM-L6-v2") #embedding model
def search_similar_articles(query_headline, model, k=3):
    query = query_headline.strip().lower()
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Search in FAISS
    distances, indices = index.search(query_embedding, k)

    results = []
    for rank, idx in enumerate(indices[0]):
        if idx == -1:  # If FAISS returns an empty result
            continue
        results.append(headlines[idx] + ' ' + metadata[idx]['short_description'])
    return results


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

RAG

In [12]:
test_query = "Over 4 million Americans get Omicron boosters"
results = search_similar_articles(test_query, search_model, k=3)

print(results)


['over 4 million americans roll up sleeves for omicron-targeted covid boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'u.s. added 678,000 jobs in february as omicron eases Unemployment fell to 3.8% as the economy continues its pandemic recovery.', 'more countries scramble to curb omicron The new COVID-19 variant was identified in South Africa two weeks ago.']


In [13]:
def prepare_input(article, facts, tokenizer, max_length=512):

    article_tokens = tokenizer.encode(article, add_special_tokens=False)

    fact_tokens_list = [tokenizer.encode(fact, add_special_tokens=False) for fact in facts]

    # Start with [CLS], then article, then [SEP]
    input_ids = [tokenizer.cls_token_id] + article_tokens + [tokenizer.sep_token_id]
    token_type_ids = [0] * (len(article_tokens) + 2) # 0 for article, 1 for facts

    for fact_tokens in fact_tokens_list:
        input_ids += fact_tokens + [tokenizer.sep_token_id]
        token_type_ids += [1] * (len(fact_tokens) + 1)
    # Format: [CLS] article [SEP] fact1 [SEP] fact 2 [SEP] ...

    attention_mask = [1] * len(input_ids)

    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        token_type_ids = token_type_ids[:max_length]
        attention_mask = attention_mask[:max_length]
    else:
        pad_length = max_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_length
        token_type_ids = token_type_ids + [0] * pad_length
        attention_mask = attention_mask + [0] * pad_length

    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask)
    }


In [21]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article = row['title']
        label = row['label']

        # Retrieve the top 3 supporting facts for this article
        facts = search_similar_articles(article,search_model, k=3)
        print(facts)
        encoding = prepare_input(article, facts, self.tokenizer, self.max_length)
        encoding['labels'] = torch.tensor(label, dtype=torch.long)
        return encoding


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:

# Create dataset objects for training and testing
train_dataset = FakeNewsDataset(baseline_df, tokenizer, max_length=512)
test_dataset = FakeNewsDataset(test_df, tokenizer, max_length=512)

# =============================================================================
# Set up training arguments and the Trainer
# =============================================================================
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss
500,0.6785,0.552653
1000,0.6635,0.437285
1500,0.6366,0.5907
2000,0.5969,0.446227
2500,0.5728,0.537476
3000,0.4739,0.480744
3500,0.4003,0.496669
4000,0.3956,0.502628


TrainOutput(global_step=4101, training_loss=0.5488913088884333, metrics={'train_runtime': 2187.4204, 'train_samples_per_second': 14.993, 'train_steps_per_second': 1.875, 'total_flos': 8628990171586560.0, 'train_loss': 0.5488913088884333, 'epoch': 3.0})

In [17]:
# Evaluate Baseline Model
baseline_results = trainer.evaluate()


baseline_predictions = trainer.predict(test_dataset).predictions
baseline_pred_labels = np.argmax(baseline_predictions, axis=1)

print(classification_report(test_labels, baseline_pred_labels, target_names=["Fake", "Real"]))

              precision    recall  f1-score   support

        Fake       0.95      0.78      0.86       705
        Real       0.32      0.72      0.44       102

    accuracy                           0.77       807
   macro avg       0.64      0.75      0.65       807
weighted avg       0.87      0.77      0.80       807



In [27]:
# Create dataset objects for training and testing
train_dataset = FakeNewsDataset(baseline_df, tokenizer, max_length=512)
test_dataset = FakeNewsDataset(test_df, tokenizer, max_length=512)

value_counts = test_df['label'].value_counts()

print(value_counts)

label
0    705
1    102
Name: count, dtype: int64
