In [None]:
!pip install transformers datasets torch scikit-learn pandas matplotlib faiss-cpu sentence-transformers

import os
import json
import shutil
import pickle
import zipfile
import numpy as np
import pandas as pd
import torch
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer



In [None]:
os.environ["WANDB_DISABLED"] = "true"

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7c8191587130>

In [None]:
# Data Loading & Splitting
df = pd.read_csv("Liar2_combined.csv", header=0)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.dropna(subset=['date'])
print(df.head())

# Define date ranges
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'

# Create splits
baseline_df = df[(df['date'] >= baseline_start) & (df['date'] <= baseline_end)].copy()
update1_df  = df[(df['date'] >= update1_start)  & (df['date'] <= update1_end)].copy()
update2_df  = df[(df['date'] >= update2_start)  & (df['date'] <= update2_end)].copy()
update3_df  = df[(df['date'] >= update3_start)  & (df['date'] <= update3_end)].copy()
update4_df  = df[(df['date'] >= update4_start)  & (df['date'] <= update4_end)].copy()
test_df     = df[(df['date'] >= test_start)     & (df['date'] <= test_end)].copy()

print("Baseline samples:", len(baseline_df))
print("Update 1 samples:", len(update1_df))
print("Update 2 samples:", len(update2_df))
print("Update 3 samples:", len(update3_df))
print("Update 4 samples:", len(update4_df))
print("Test samples:", len(test_df))

   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12
Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

def prepare_dataset(df_subset):
    texts = df_subset['title'].tolist()
    labels = df_subset['label'].tolist()
    encodings = tokenize_function(texts)
    dataset = Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels,
    })
    return dataset

In [None]:
# Continual Learning Training Functions
baseline_training_args = TrainingArguments(
    output_dir="./bert_baseline",
    run_name="baseline_training",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
)

update_training_args = TrainingArguments(
    output_dir="./bert_continual",
    run_name="continual_update",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

def fine_tune_on_update(model, update_dataset, update_name, eval_dataset):
    print(f"\n--- Fine-tuning on {update_name} ---")
    update_trainer = Trainer(
        model=model,
        args=update_training_args,
        train_dataset=update_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    update_trainer.train()
    results = update_trainer.evaluate()
    print(f"{update_name} - Test Accuracy: {results['eval_accuracy']:.4f}")

    preds = update_trainer.predict(eval_dataset).predictions
    pred_labels = np.argmax(preds, axis=1)
    print(classification_report(eval_dataset["labels"], pred_labels, target_names=["Fake", "Real"]))

    model_save_path = f"fine_tuned_bert_{update_name.replace(' ', '_').lower()}"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    shutil.make_archive(model_save_path, 'zip', model_save_path)
    return model

def evaluate_on_period(model, period_name, df_period):
    print(f"\n--- Evaluating on {period_name} ---")
    dataset = prepare_dataset(df_period)
    trainer = Trainer(
        model=model,
        args=baseline_training_args,
        eval_dataset=dataset,
        compute_metrics=compute_metrics,
    )
    results = trainer.evaluate()
    preds = trainer.predict(dataset).predictions
    pred_labels = np.argmax(preds, axis=1)
    report = classification_report(dataset["labels"], pred_labels, target_names=["Fake", "Real"])
    print(f"{period_name} Accuracy: {results['eval_accuracy']:.4f}")
    print(report)
    return results, report

def rolling_evaluation_CL(model, model_version):
    # Define the rolling evaluation order for the Continual Learning pipeline.
    # For a given model version, we evaluate on all subsequent time periods.
    rolling_mapping = {
        "Baseline": [("Update 1", update1_df), ("Update 2", update2_df), ("Update 3", update3_df), ("Update 4", update4_df), ("Test", test_df)],
        "Update 1": [("Update 1", update1_df), ("Update 2", update2_df), ("Update 3", update3_df), ("Update 4", update4_df), ("Test", test_df)],
        "Update 2": [("Update 2", update2_df), ("Update 3", update3_df), ("Update 4", update4_df), ("Test", test_df)],
        "Update 3": [("Update 3", update3_df), ("Update 4", update4_df), ("Test", test_df)],
        "Update 4": [("Update 4", update4_df), ("Test", test_df)],
    }
    evaluations = {}
    if model_version in rolling_mapping:
        for period_name, df_period in rolling_mapping[model_version]:
            evaluations[period_name] = evaluate_on_period(model, period_name, df_period)
    else:
        print(f"No rolling evaluation defined for model version: {model_version}")
    return evaluations

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Continual Learning Experiments

In [None]:
baseline_dataset = prepare_dataset(baseline_df)
test_dataset_CL = prepare_dataset(test_df)

In [None]:
# Train Baseline Model
model_CL = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
trainer_CL = Trainer(
    model=model_CL,
    args=baseline_training_args,
    train_dataset=baseline_dataset,
    eval_dataset=test_dataset_CL,
    compute_metrics=compute_metrics,
)
trainer_CL.train()
baseline_results = trainer_CL.evaluate()
print(f"Baseline Test Accuracy: {baseline_results['eval_accuracy']:.4f}")
baseline_preds = trainer_CL.predict(test_dataset_CL).predictions
baseline_pred_labels = np.argmax(baseline_preds, axis=1)
print(classification_report(test_dataset_CL["labels"], baseline_pred_labels, target_names=["Fake", "Real"]))

# Save baseline model
model_CL.save_pretrained("fine_tuned_bert_baseline_CL")
tokenizer.save_pretrained("fine_tuned_bert_baseline_CL")
shutil.make_archive('fine_tuned_bert_baseline_CL', 'zip', "fine_tuned_bert_baseline_CL")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6779
1000,0.6673
1500,0.6347
2000,0.6018
2500,0.5814
3000,0.4946
3500,0.4329
4000,0.4121


Baseline Test Accuracy: 0.7695
              precision    recall  f1-score   support

        Fake       0.96      0.76      0.85       705
        Real       0.33      0.80      0.47       102

    accuracy                           0.77       807
   macro avg       0.65      0.78      0.66       807
weighted avg       0.88      0.77      0.80       807



'/content/fine_tuned_bert_baseline_CL.zip'

In [None]:
# Rolling evaluation for baseline model (evaluate on later periods)
print("\nRolling Evaluation for Baseline Model:")
rolling_evaluation_CL(model_CL, "Baseline")


Rolling Evaluation for Baseline Model:

--- Evaluating on Update 1 ---


Update 1 Accuracy: 0.6358
              precision    recall  f1-score   support

        Fake       0.69      0.52      0.60      1555
        Real       0.60      0.76      0.67      1476

    accuracy                           0.64      3031
   macro avg       0.65      0.64      0.63      3031
weighted avg       0.65      0.64      0.63      3031


--- Evaluating on Update 2 ---


Update 2 Accuracy: 0.6487
              precision    recall  f1-score   support

        Fake       0.79      0.62      0.69      1759
        Real       0.50      0.70      0.59       971

    accuracy                           0.65      2730
   macro avg       0.65      0.66      0.64      2730
weighted avg       0.69      0.65      0.66      2730


--- Evaluating on Update 3 ---


Update 3 Accuracy: 0.6972
              precision    recall  f1-score   support

        Fake       0.88      0.72      0.79      3006
        Real       0.36      0.62      0.45       766

    accuracy                           0.70      3772
   macro avg       0.62      0.67      0.62      3772
weighted avg       0.77      0.70      0.72      3772


--- Evaluating on Update 4 ---


Update 4 Accuracy: 0.7068
              precision    recall  f1-score   support

        Fake       0.92      0.71      0.80      1427
        Real       0.30      0.67      0.42       261

    accuracy                           0.71      1688
   macro avg       0.61      0.69      0.61      1688
weighted avg       0.83      0.71      0.74      1688


--- Evaluating on Test ---


Test Accuracy: 0.7695
              precision    recall  f1-score   support

        Fake       0.96      0.76      0.85       705
        Real       0.33      0.80      0.47       102

    accuracy                           0.77       807
   macro avg       0.65      0.78      0.66       807
weighted avg       0.88      0.77      0.80       807



{'Update 1': ({'eval_loss': 0.8237868547439575,
   'eval_model_preparation_time': 0.0043,
   'eval_accuracy': 0.6357637743319037,
   'eval_runtime': 20.6322,
   'eval_samples_per_second': 146.906,
   'eval_steps_per_second': 18.369},
  '              precision    recall  f1-score   support\n\n        Fake       0.69      0.52      0.60      1555\n        Real       0.60      0.76      0.67      1476\n\n    accuracy                           0.64      3031\n   macro avg       0.65      0.64      0.63      3031\nweighted avg       0.65      0.64      0.63      3031\n'),
 'Update 2': ({'eval_loss': 0.7852499485015869,
   'eval_model_preparation_time': 0.0028,
   'eval_accuracy': 0.6487179487179487,
   'eval_runtime': 19.0533,
   'eval_samples_per_second': 143.282,
   'eval_steps_per_second': 17.95},
  '              precision    recall  f1-score   support\n\n        Fake       0.79      0.62      0.69      1759\n        Real       0.50      0.70      0.59       971\n\n    accuracy        

In [None]:
# Update 1
update1_dataset = prepare_dataset(update1_df)
model_CL = fine_tune_on_update(model_CL, update1_dataset, "Update 1", test_dataset_CL)
print("\nRolling Evaluation for Update 1 Model:")
rolling_evaluation_CL(model_CL, "Update 1")


--- Fine-tuning on Update 1 ---


Step,Training Loss
500,0.5808


Update 1 - Test Accuracy: 0.8116
              precision    recall  f1-score   support

        Fake       0.96      0.82      0.88       705
        Real       0.38      0.75      0.50       102

    accuracy                           0.81       807
   macro avg       0.67      0.78      0.69       807
weighted avg       0.88      0.81      0.84       807


Rolling Evaluation for Update 1 Model:

--- Evaluating on Update 1 ---


Update 1 Accuracy: 0.8720
              precision    recall  f1-score   support

        Fake       0.94      0.80      0.86      1555
        Real       0.82      0.95      0.88      1476

    accuracy                           0.87      3031
   macro avg       0.88      0.87      0.87      3031
weighted avg       0.88      0.87      0.87      3031


--- Evaluating on Update 2 ---


Update 2 Accuracy: 0.6769
              precision    recall  f1-score   support

        Fake       0.81      0.65      0.72      1759
        Real       0.53      0.72      0.61       971

    accuracy                           0.68      2730
   macro avg       0.67      0.69      0.67      2730
weighted avg       0.71      0.68      0.68      2730


--- Evaluating on Update 3 ---


Update 3 Accuracy: 0.7452
              precision    recall  f1-score   support

        Fake       0.88      0.78      0.83      3006
        Real       0.41      0.60      0.49       766

    accuracy                           0.75      3772
   macro avg       0.65      0.69      0.66      3772
weighted avg       0.79      0.75      0.76      3772


--- Evaluating on Update 4 ---


Update 4 Accuracy: 0.7648
              precision    recall  f1-score   support

        Fake       0.93      0.78      0.85      1427
        Real       0.36      0.67      0.47       261

    accuracy                           0.76      1688
   macro avg       0.64      0.73      0.66      1688
weighted avg       0.84      0.76      0.79      1688


--- Evaluating on Test ---


Test Accuracy: 0.8116
              precision    recall  f1-score   support

        Fake       0.96      0.82      0.88       705
        Real       0.38      0.75      0.50       102

    accuracy                           0.81       807
   macro avg       0.67      0.78      0.69       807
weighted avg       0.88      0.81      0.84       807



{'Update 1': ({'eval_loss': 0.3140810430049896,
   'eval_model_preparation_time': 0.0027,
   'eval_accuracy': 0.8719894424282415,
   'eval_runtime': 20.3502,
   'eval_samples_per_second': 148.942,
   'eval_steps_per_second': 18.624},
  '              precision    recall  f1-score   support\n\n        Fake       0.94      0.80      0.86      1555\n        Real       0.82      0.95      0.88      1476\n\n    accuracy                           0.87      3031\n   macro avg       0.88      0.87      0.87      3031\nweighted avg       0.88      0.87      0.87      3031\n'),
 'Update 2': ({'eval_loss': 0.6788209080696106,
   'eval_model_preparation_time': 0.0027,
   'eval_accuracy': 0.676923076923077,
   'eval_runtime': 18.6608,
   'eval_samples_per_second': 146.296,
   'eval_steps_per_second': 18.327},
  '              precision    recall  f1-score   support\n\n        Fake       0.81      0.65      0.72      1759\n        Real       0.53      0.72      0.61       971\n\n    accuracy        

In [None]:
# Update 2
update2_dataset = prepare_dataset(update2_df)
model_CL = fine_tune_on_update(model_CL, update2_dataset, "Update 2", test_dataset_CL)
print("\nRolling Evaluation for Update 2 Model:")
rolling_evaluation_CL(model_CL, "Update 2")


--- Fine-tuning on Update 2 ---


Step,Training Loss
500,0.5111


Update 2 - Test Accuracy: 0.8625
              precision    recall  f1-score   support

        Fake       0.95      0.89      0.92       705
        Real       0.47      0.67      0.55       102

    accuracy                           0.86       807
   macro avg       0.71      0.78      0.73       807
weighted avg       0.89      0.86      0.87       807


Rolling Evaluation for Update 2 Model:

--- Evaluating on Update 2 ---


Update 2 Accuracy: 0.8897
              precision    recall  f1-score   support

        Fake       0.94      0.89      0.91      1759
        Real       0.81      0.90      0.85       971

    accuracy                           0.89      2730
   macro avg       0.88      0.89      0.88      2730
weighted avg       0.89      0.89      0.89      2730


--- Evaluating on Update 3 ---


Update 3 Accuracy: 0.7895
              precision    recall  f1-score   support

        Fake       0.88      0.85      0.87      3006
        Real       0.48      0.56      0.52       766

    accuracy                           0.79      3772
   macro avg       0.68      0.70      0.69      3772
weighted avg       0.80      0.79      0.79      3772


--- Evaluating on Update 4 ---


Update 4 Accuracy: 0.8081
              precision    recall  f1-score   support

        Fake       0.92      0.84      0.88      1427
        Real       0.42      0.62      0.50       261

    accuracy                           0.81      1688
   macro avg       0.67      0.73      0.69      1688
weighted avg       0.85      0.81      0.82      1688


--- Evaluating on Test ---


Test Accuracy: 0.8625
              precision    recall  f1-score   support

        Fake       0.95      0.89      0.92       705
        Real       0.47      0.67      0.55       102

    accuracy                           0.86       807
   macro avg       0.71      0.78      0.73       807
weighted avg       0.89      0.86      0.87       807



{'Update 2': ({'eval_loss': 0.27958858013153076,
   'eval_model_preparation_time': 0.0038,
   'eval_accuracy': 0.8897435897435897,
   'eval_runtime': 18.4703,
   'eval_samples_per_second': 147.805,
   'eval_steps_per_second': 18.516},
  '              precision    recall  f1-score   support\n\n        Fake       0.94      0.89      0.91      1759\n        Real       0.81      0.90      0.85       971\n\n    accuracy                           0.89      2730\n   macro avg       0.88      0.89      0.88      2730\nweighted avg       0.89      0.89      0.89      2730\n'),
 'Update 3': ({'eval_loss': 0.48752087354660034,
   'eval_model_preparation_time': 0.0041,
   'eval_accuracy': 0.7895015906680806,
   'eval_runtime': 25.9039,
   'eval_samples_per_second': 145.615,
   'eval_steps_per_second': 18.221},
  '              precision    recall  f1-score   support\n\n        Fake       0.88      0.85      0.87      3006\n        Real       0.48      0.56      0.52       766\n\n    accuracy     

In [None]:
# Update 3
update3_dataset = prepare_dataset(update3_df)
model_CL = fine_tune_on_update(model_CL, update3_dataset, "Update 3", test_dataset_CL)
print("\nRolling Evaluation for Update 3 Model:")
rolling_evaluation_CL(model_CL, "Update 3")


--- Fine-tuning on Update 3 ---


Step,Training Loss
500,0.4259


Update 3 - Test Accuracy: 0.9009
              precision    recall  f1-score   support

        Fake       0.94      0.94      0.94       705
        Real       0.61      0.60      0.60       102

    accuracy                           0.90       807
   macro avg       0.78      0.77      0.77       807
weighted avg       0.90      0.90      0.90       807


Rolling Evaluation for Update 3 Model:

--- Evaluating on Update 3 ---


Update 3 Accuracy: 0.9380
              precision    recall  f1-score   support

        Fake       0.95      0.98      0.96      3006
        Real       0.89      0.79      0.84       766

    accuracy                           0.94      3772
   macro avg       0.92      0.88      0.90      3772
weighted avg       0.94      0.94      0.94      3772


--- Evaluating on Update 4 ---


Update 4 Accuracy: 0.8472
              precision    recall  f1-score   support

        Fake       0.92      0.90      0.91      1427
        Real       0.51      0.57      0.53       261

    accuracy                           0.85      1688
   macro avg       0.71      0.73      0.72      1688
weighted avg       0.86      0.85      0.85      1688


--- Evaluating on Test ---


Test Accuracy: 0.9009
              precision    recall  f1-score   support

        Fake       0.94      0.94      0.94       705
        Real       0.61      0.60      0.60       102

    accuracy                           0.90       807
   macro avg       0.78      0.77      0.77       807
weighted avg       0.90      0.90      0.90       807



{'Update 3': ({'eval_loss': 0.19403164088726044,
   'eval_model_preparation_time': 0.0026,
   'eval_accuracy': 0.9379639448568399,
   'eval_runtime': 25.6986,
   'eval_samples_per_second': 146.778,
   'eval_steps_per_second': 18.367},
  '              precision    recall  f1-score   support\n\n        Fake       0.95      0.98      0.96      3006\n        Real       0.89      0.79      0.84       766\n\n    accuracy                           0.94      3772\n   macro avg       0.92      0.88      0.90      3772\nweighted avg       0.94      0.94      0.94      3772\n'),
 'Update 4': ({'eval_loss': 0.4446048140525818,
   'eval_model_preparation_time': 0.003,
   'eval_accuracy': 0.8471563981042654,
   'eval_runtime': 11.7106,
   'eval_samples_per_second': 144.143,
   'eval_steps_per_second': 18.018},
  '              precision    recall  f1-score   support\n\n        Fake       0.92      0.90      0.91      1427\n        Real       0.51      0.57      0.53       261\n\n    accuracy       

In [None]:
# Update 4
update4_dataset = prepare_dataset(update4_df)
model_CL = fine_tune_on_update(model_CL, update4_dataset, "Update 4", test_dataset_CL)
print("\nRolling Evaluation for Update 4 Model:")
rolling_evaluation_CL(model_CL, "Update 4")


--- Fine-tuning on Update 4 ---


Step,Training Loss


Update 4 - Test Accuracy: 0.8996
              precision    recall  f1-score   support

        Fake       0.93      0.96      0.94       705
        Real       0.63      0.49      0.55       102

    accuracy                           0.90       807
   macro avg       0.78      0.72      0.75       807
weighted avg       0.89      0.90      0.89       807


Rolling Evaluation for Update 4 Model:

--- Evaluating on Update 4 ---


Update 4 Accuracy: 0.9532
              precision    recall  f1-score   support

        Fake       0.96      0.98      0.97      1427
        Real       0.89      0.80      0.84       261

    accuracy                           0.95      1688
   macro avg       0.93      0.89      0.91      1688
weighted avg       0.95      0.95      0.95      1688


--- Evaluating on Test ---


Test Accuracy: 0.8996
              precision    recall  f1-score   support

        Fake       0.93      0.96      0.94       705
        Real       0.63      0.49      0.55       102

    accuracy                           0.90       807
   macro avg       0.78      0.72      0.75       807
weighted avg       0.89      0.90      0.89       807



{'Update 4': ({'eval_loss': 0.15085896849632263,
   'eval_model_preparation_time': 0.0027,
   'eval_accuracy': 0.9531990521327014,
   'eval_runtime': 11.4398,
   'eval_samples_per_second': 147.555,
   'eval_steps_per_second': 18.444},
  '              precision    recall  f1-score   support\n\n        Fake       0.96      0.98      0.97      1427\n        Real       0.89      0.80      0.84       261\n\n    accuracy                           0.95      1688\n   macro avg       0.93      0.89      0.91      1688\nweighted avg       0.95      0.95      0.95      1688\n'),
 'Test': ({'eval_loss': 0.302641361951828,
   'eval_model_preparation_time': 0.0028,
   'eval_accuracy': 0.8996282527881041,
   'eval_runtime': 5.5091,
   'eval_samples_per_second': 146.484,
   'eval_steps_per_second': 18.333},
  '              precision    recall  f1-score   support\n\n        Fake       0.93      0.96      0.94       705\n        Real       0.63      0.49      0.55       102\n\n    accuracy            

# RAG Model Experiments

In [None]:
zip_filename = "faiss_indexes_new.zip"
extract_dir = "faiss_indexes"
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted {zip_filename} to {extract_dir}")

import faiss
index_baseline = faiss.read_index(os.path.join(extract_dir, "faiss_index_baseline.index"))
index_update1 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update1.index"))
index_update2 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update2.index"))
index_update3 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update3.index"))
index_update4 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update4.index"))
print("FAISS indexes loaded successfully!")

Extracted faiss_indexes_new.zip to faiss_indexes
FAISS indexes loaded successfully!


In [None]:
with open("faiss_headlines_new.pkl", "rb") as f:
    headlines_data = pickle.load(f)
headlines_baseline = headlines_data["baseline"]
headlines_update1  = headlines_data["update1"]
headlines_update2  = headlines_data["update2"]
headlines_update3  = headlines_data["update3"]
headlines_update4  = headlines_data["update4"]
print("Headlines loaded successfully!")

Headlines loaded successfully!


In [None]:
# RAG Dataset Class and Input Preparation

def prepare_input(article, facts, tokenizer, max_length=512):
    article_tokens = tokenizer.encode(article, add_special_tokens=False)
    fact_tokens_list = [tokenizer.encode(fact, add_special_tokens=False) for fact in facts]
    # Format: [CLS] article [SEP] fact1 [SEP] fact2 [SEP] ...
    input_ids = [tokenizer.cls_token_id] + article_tokens + [tokenizer.sep_token_id]
    token_type_ids = [0] * (len(article_tokens) + 2)
    for fact_tokens in fact_tokens_list:
        input_ids += fact_tokens + [tokenizer.sep_token_id]
        token_type_ids += [1] * (len(fact_tokens) + 1)
    attention_mask = [1] * len(input_ids)
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        token_type_ids = token_type_ids[:max_length]
        attention_mask = attention_mask[:max_length]
    else:
        pad_length = max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * pad_length
        token_type_ids += [0] * pad_length
        attention_mask += [0] * pad_length
    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask)
    }

def search_similar_articles(query_headline, model, faiss_index, headlines, k=3):
    query = query_headline.strip().lower()
    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    distances, indices = faiss_index.search(query_embedding, k)
    results = []
    for rank, idx in enumerate(indices[0]):
        if idx == -1:
            continue
        # Combine the retrieved headline with its distance value
        hybrid_fact = f"{headlines[idx]} (dist: {distances[0][rank]:.4f})"
        results.append(hybrid_fact)
    return results

class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, retrieval_model, faiss_index, headlines, max_length=512):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.retrieval_model = retrieval_model
        self.faiss_index = faiss_index
        self.headlines = headlines

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article = row['title']
        label = row['label']
        # Retrieve top 3 facts
        facts = search_similar_articles(article, self.retrieval_model, self.faiss_index, self.headlines, k=3)
        encoding = prepare_input(article, facts, self.tokenizer, self.max_length)
        encoding['labels'] = torch.tensor(label, dtype=torch.long)
        return encoding

In [None]:
# RAG Training & Evaluation Functions

rag_training_args = TrainingArguments(
    output_dir="./rag_results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./rag_logs",
    logging_steps=250,
    report_to="none",
)

def compute_metrics_rag(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None)
    report = classification_report(labels, predictions, output_dict=True)
    conf_matrix = confusion_matrix(labels, predictions)
    print("\nClassification Report:")
    print(classification_report(labels, predictions, digits=4))
    print("Confusion Matrix:")
    print(conf_matrix)
    return {
        "accuracy": acc,
        "f1_per_class": f1.tolist(),
        "precision_per_class": precision.tolist(),
        "recall_per_class": recall.tolist(),
        "confusion_matrix": conf_matrix.tolist(),
        "report": report,
    }

def train_rag_model(model, train_df, retrieval_model, faiss_index, headlines, period_name, eval_dataset):
    print(f"\nTraining RAG model for {period_name}...")
    train_dataset = FakeNewsDataset(train_df, tokenizer, retrieval_model, faiss_index, headlines, max_length=512)
    trainer = Trainer(
        model=model,
        args=rag_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_rag,
    )
    trainer.train()
    results = trainer.evaluate()
    print(f"{period_name} RAG Model Test Accuracy after training: {results['eval_accuracy']:.4f}")
    return model

def evaluate_rag_on_period(model, period_name, df_period, retrieval_model, faiss_index, headlines):
    print(f"\nEvaluating RAG model on {period_name}...")
    dataset = FakeNewsDataset(df_period, tokenizer, retrieval_model, faiss_index, headlines, max_length=512)
    trainer = Trainer(
        model=model,
        args=rag_training_args,
        eval_dataset=dataset,
        compute_metrics=compute_metrics_rag,
    )
    results = trainer.evaluate()
    preds = trainer.predict(dataset).predictions
    pred_labels = np.argmax(preds, axis=1)
    true_labels = [item["labels"].item() for item in dataset]
    report = classification_report(true_labels, pred_labels, target_names=["Fake", "Real"])
    print(f"{period_name} RAG Model Accuracy: {results['eval_accuracy']:.4f}")
    print(report)
    return results, report

def rolling_evaluation_RAG(model, model_version, retrieval_model):
    mapping = {
        "Baseline": [("Update 1", update1_df, index_update1, headlines_update1),
                     ("Update 2", update2_df, index_update2, headlines_update2),
                     ("Update 3", update3_df, index_update3, headlines_update3),
                     ("Update 4", update4_df, index_update4, headlines_update4),
                     ("Test", test_df, index_update4, headlines_update4)],
        "Update 1": [("Update 1", update1_df, index_update1, headlines_update1),
                     ("Update 2", update2_df, index_update2, headlines_update2),
                     ("Update 3", update3_df, index_update3, headlines_update3),
                     ("Update 4", update4_df, index_update4, headlines_update4),
                     ("Test", test_df, index_update4, headlines_update4)],
        "Update 2": [("Update 2", update2_df, index_update2, headlines_update2),
                     ("Update 3", update3_df, index_update3, headlines_update3),
                     ("Update 4", update4_df, index_update4, headlines_update4),
                     ("Test", test_df, index_update4, headlines_update4)],
        "Update 3": [("Update 3", update3_df, index_update3, headlines_update3),
                     ("Update 4", update4_df, index_update4, headlines_update4),
                     ("Test", test_df, index_update4, headlines_update4)],
        "Update 4": [("Update 4", update4_df, index_update4, headlines_update4),
                     ("Test", test_df, index_update4, headlines_update4)],
    }
    evaluations = {}
    if model_version in mapping:
        for period_name, df_period, faiss_idx, headlines_period in mapping[model_version]:
            evaluations[period_name] = evaluate_rag_on_period(model, period_name, df_period, retrieval_model, faiss_idx, headlines_period)
    else:
        print(f"No rolling evaluation defined for model version: {model_version}")
    return evaluations

def save_and_zip_model(model, tokenizer, model_dir):
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    print(f"Model and tokenizer saved to {model_dir}")
    shutil.make_archive(model_dir, 'zip', model_dir)
    print(f"Created {model_dir}.zip")


In [None]:
# RAG Experiment Runs

test_dataset_RAG = FakeNewsDataset(test_df, tokenizer, SentenceTransformer("all-MiniLM-L6-v2"), index_update4, headlines_update4, max_length=512)

# Initialize the RAG model (a separate copy from the CL pipeline) and the retrieval model (SentenceTransformer)
model_RAG = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
retrieval_model = SentenceTransformer("all-MiniLM-L6-v2")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train on Baseline (2007-2015) using baseline FAISS info, then evaluate on all future periods
print("Training RAG Baseline Model (2007-2015)...")
model_RAG = train_rag_model(model_RAG, baseline_df, retrieval_model, index_baseline, headlines_baseline, "Baseline (2007-2015)", test_dataset_RAG)
save_and_zip_model(model_RAG, tokenizer, "fine_tuned_bert_baseline_RAG")

Training RAG Baseline Model (2007-2015)...

Training RAG model for Baseline (2007-2015)...


Step,Training Loss
250,0.6792
500,0.6781
750,0.6579
1000,0.6715
1250,0.6375
1500,0.6274
1750,0.596
2000,0.5893
2250,0.579
2500,0.5544



Classification Report:
              precision    recall  f1-score   support

           0     0.9641    0.8383    0.8968       705
           1     0.4124    0.7843    0.5405       102

    accuracy                         0.8315       807
   macro avg     0.6882    0.8113    0.7187       807
weighted avg     0.8944    0.8315    0.8518       807

Confusion Matrix:
[[591 114]
 [ 22  80]]
Baseline (2007-2015) RAG Model Test Accuracy after training: 0.8315
Model and tokenizer saved to fine_tuned_bert_baseline_RAG
Created fine_tuned_bert_baseline_RAG.zip

Rolling Evaluation for RAG Baseline Model:

Evaluating RAG model on Update 1...



Classification Report:
              precision    recall  f1-score   support

           0     0.6974    0.5441    0.6113      1555
           1     0.6100    0.7514    0.6733      1476

    accuracy                         0.6450      3031
   macro avg     0.6537    0.6477    0.6423      3031
weighted avg     0.6549    0.6450    0.6415      3031

Confusion Matrix:
[[ 846  709]
 [ 367 1109]]

Classification Report:
              precision    recall  f1-score   support

           0     0.6974    0.5441    0.6113      1555
           1     0.6100    0.7514    0.6733      1476

    accuracy                         0.6450      3031
   macro avg     0.6537    0.6477    0.6423      3031
weighted avg     0.6549    0.6450    0.6415      3031

Confusion Matrix:
[[ 846  709]
 [ 367 1109]]


TypeError: Cannot index by location index with a non-integer key

In [None]:
print("\nRolling Evaluation for RAG Baseline Model:")
rolling_evaluation_RAG(model_RAG, "Baseline", retrieval_model)


Rolling Evaluation for RAG Baseline Model:

Evaluating RAG model on Update 1...



Classification Report:
              precision    recall  f1-score   support

           0     0.6974    0.5441    0.6113      1555
           1     0.6100    0.7514    0.6733      1476

    accuracy                         0.6450      3031
   macro avg     0.6537    0.6477    0.6423      3031
weighted avg     0.6549    0.6450    0.6415      3031

Confusion Matrix:
[[ 846  709]
 [ 367 1109]]

Classification Report:
              precision    recall  f1-score   support

           0     0.6974    0.5441    0.6113      1555
           1     0.6100    0.7514    0.6733      1476

    accuracy                         0.6450      3031
   macro avg     0.6537    0.6477    0.6423      3031
weighted avg     0.6549    0.6450    0.6415      3031

Confusion Matrix:
[[ 846  709]
 [ 367 1109]]
Update 1 RAG Model Accuracy: 0.6450
              precision    recall  f1-score   support

        Fake       0.70      0.54      0.61      1555
        Real       0.61      0.75      0.67      1476

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.7993    0.6202    0.6985      1759
           1     0.5106    0.7178    0.5967       971

    accuracy                         0.6549      2730
   macro avg     0.6549    0.6690    0.6476      2730
weighted avg     0.6966    0.6549    0.6623      2730

Confusion Matrix:
[[1091  668]
 [ 274  697]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7993    0.6202    0.6985      1759
           1     0.5106    0.7178    0.5967       971

    accuracy                         0.6549      2730
   macro avg     0.6549    0.6690    0.6476      2730
weighted avg     0.6966    0.6549    0.6623      2730

Confusion Matrix:
[[1091  668]
 [ 274  697]]
Update 2 RAG Model Accuracy: 0.6549
              precision    recall  f1-score   support

        Fake       0.80      0.62      0.70      1759
        Real       0.51      0.72      0.60       971

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.8980    0.7325    0.8069      3006
           1     0.3909    0.6736    0.4947       766

    accuracy                         0.7206      3772
   macro avg     0.6445    0.7031    0.6508      3772
weighted avg     0.7951    0.7206    0.7435      3772

Confusion Matrix:
[[2202  804]
 [ 250  516]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8980    0.7325    0.8069      3006
           1     0.3909    0.6736    0.4947       766

    accuracy                         0.7206      3772
   macro avg     0.6445    0.7031    0.6508      3772
weighted avg     0.7951    0.7206    0.7435      3772

Confusion Matrix:
[[2202  804]
 [ 250  516]]
Update 3 RAG Model Accuracy: 0.7206
              precision    recall  f1-score   support

        Fake       0.90      0.73      0.81      3006
        Real       0.39      0.67      0.49       766

    accu

Token indices sequence length is longer than the specified maximum sequence length for this model (758 > 512). Running this sequence through the model will result in indexing errors



Classification Report:
              precision    recall  f1-score   support

           0     0.9274    0.7610    0.8360      1427
           1     0.3404    0.6743    0.4524       261

    accuracy                         0.7476      1688
   macro avg     0.6339    0.7177    0.6442      1688
weighted avg     0.8367    0.7476    0.7767      1688

Confusion Matrix:
[[1086  341]
 [  85  176]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9274    0.7610    0.8360      1427
           1     0.3404    0.6743    0.4524       261

    accuracy                         0.7476      1688
   macro avg     0.6339    0.7177    0.6442      1688
weighted avg     0.8367    0.7476    0.7767      1688

Confusion Matrix:
[[1086  341]
 [  85  176]]
Update 4 RAG Model Accuracy: 0.7476
              precision    recall  f1-score   support

        Fake       0.93      0.76      0.84      1427
        Real       0.34      0.67      0.45       261

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9641    0.8383    0.8968       705
           1     0.4124    0.7843    0.5405       102

    accuracy                         0.8315       807
   macro avg     0.6882    0.8113    0.7187       807
weighted avg     0.8944    0.8315    0.8518       807

Confusion Matrix:
[[591 114]
 [ 22  80]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9641    0.8383    0.8968       705
           1     0.4124    0.7843    0.5405       102

    accuracy                         0.8315       807
   macro avg     0.6882    0.8113    0.7187       807
weighted avg     0.8944    0.8315    0.8518       807

Confusion Matrix:
[[591 114]
 [ 22  80]]
Test RAG Model Accuracy: 0.8315
              precision    recall  f1-score   support

        Fake       0.96      0.84      0.90       705
        Real       0.41      0.78      0.54       102

    accuracy        

{'Update 1': ({'eval_loss': 0.6616322994232178,
   'eval_model_preparation_time': 0.0027,
   'eval_accuracy': 0.6450016496205873,
   'eval_f1_per_class': [0.611271676300578, 0.6733454766241651],
   'eval_precision_per_class': [0.6974443528441879, 0.61001100110011],
   'eval_recall_per_class': [0.5440514469453376, 0.7513550135501355],
   'eval_confusion_matrix': [[846, 709], [367, 1109]],
   'eval_report': {'0': {'precision': 0.6974443528441879,
     'recall': 0.5440514469453376,
     'f1-score': 0.611271676300578,
     'support': 1555.0},
    '1': {'precision': 0.61001100110011,
     'recall': 0.7513550135501355,
     'f1-score': 0.6733454766241651,
     'support': 1476.0},
    'accuracy': 0.6450016496205873,
    'macro avg': {'precision': 0.653727676972149,
     'recall': 0.6477032302477366,
     'f1-score': 0.6423085764623715,
     'support': 3031.0},
    'weighted avg': {'precision': 0.6548671086428487,
     'recall': 0.6450016496205873,
     'f1-score': 0.6414996305327175,
     'su

In [None]:
# Update 1 (2016-2017)
print("\nTraining RAG Update 1 Model (2016-2017)...")
model_RAG = train_rag_model(model_RAG, update1_df, retrieval_model, index_update1, headlines_update1, "Update 1 (2016-2017)", test_dataset_RAG)
save_and_zip_model(model_RAG, tokenizer, "fine_tuned_bert_update_1_RAG")
print("\nRolling Evaluation for RAG Update 1 Model:")
rolling_evaluation_RAG(model_RAG, "Update 1", retrieval_model)


Training RAG Update 1 Model (2016-2017)...

Training RAG model for Update 1 (2016-2017)...


Step,Training Loss
250,0.6317
500,0.5311
750,0.4714



Classification Report:
              precision    recall  f1-score   support

           0     0.9587    0.8227    0.8855       705
           1     0.3812    0.7549    0.5066       102

    accuracy                         0.8141       807
   macro avg     0.6699    0.7888    0.6960       807
weighted avg     0.8857    0.8141    0.8376       807

Confusion Matrix:
[[580 125]
 [ 25  77]]
Update 1 (2016-2017) RAG Model Test Accuracy after training: 0.8141
Model and tokenizer saved to fine_tuned_bert_update_1_RAG
Created fine_tuned_bert_update_1_RAG.zip

Rolling Evaluation for RAG Update 1 Model:

Evaluating RAG model on Update 1...



Classification Report:
              precision    recall  f1-score   support

           0     0.9295    0.7627    0.8379      1555
           1     0.7897    0.9390    0.8579      1476

    accuracy                         0.8486      3031
   macro avg     0.8596    0.8509    0.8479      3031
weighted avg     0.8614    0.8486    0.8476      3031

Confusion Matrix:
[[1186  369]
 [  90 1386]]



Classification Report:
              precision    recall  f1-score   support

           0     0.9295    0.7627    0.8379      1555
           1     0.7897    0.9390    0.8579      1476

    accuracy                         0.8486      3031
   macro avg     0.8596    0.8509    0.8479      3031
weighted avg     0.8614    0.8486    0.8476      3031

Confusion Matrix:
[[1186  369]
 [  90 1386]]
Update 1 RAG Model Accuracy: 0.8486
              precision    recall  f1-score   support

        Fake       0.93      0.76      0.84      1555
        Real       0.79      0.94      0.86      1476

    accuracy                           0.85      3031
   macro avg       0.86      0.85      0.85      3031
weighted avg       0.86      0.85      0.85      3031


Evaluating RAG model on Update 2...



Classification Report:
              precision    recall  f1-score   support

           0     0.8191    0.6333    0.7143      1759
           1     0.5292    0.7467    0.6194       971

    accuracy                         0.6736      2730
   macro avg     0.6742    0.6900    0.6669      2730
weighted avg     0.7160    0.6736    0.6806      2730

Confusion Matrix:
[[1114  645]
 [ 246  725]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8191    0.6333    0.7143      1759
           1     0.5292    0.7467    0.6194       971

    accuracy                         0.6736      2730
   macro avg     0.6742    0.6900    0.6669      2730
weighted avg     0.7160    0.6736    0.6806      2730

Confusion Matrix:
[[1114  645]
 [ 246  725]]
Update 2 RAG Model Accuracy: 0.6736
              precision    recall  f1-score   support

        Fake       0.82      0.63      0.71      1759
        Real       0.53      0.75      0.62       971

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9008    0.7552    0.8216      3006
           1     0.4121    0.6736    0.5114       766

    accuracy                         0.7386      3772
   macro avg     0.6565    0.7144    0.6665      3772
weighted avg     0.8016    0.7386    0.7586      3772

Confusion Matrix:
[[2270  736]
 [ 250  516]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9008    0.7552    0.8216      3006
           1     0.4121    0.6736    0.5114       766

    accuracy                         0.7386      3772
   macro avg     0.6565    0.7144    0.6665      3772
weighted avg     0.8016    0.7386    0.7586      3772

Confusion Matrix:
[[2270  736]
 [ 250  516]]
Update 3 RAG Model Accuracy: 0.7386
              precision    recall  f1-score   support

        Fake       0.90      0.76      0.82      3006
        Real       0.41      0.67      0.51       766

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9406    0.7540    0.8370      1427
           1     0.3548    0.7395    0.4795       261

    accuracy                         0.7518      1688
   macro avg     0.6477    0.7467    0.6583      1688
weighted avg     0.8500    0.7518    0.7817      1688

Confusion Matrix:
[[1076  351]
 [  68  193]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9406    0.7540    0.8370      1427
           1     0.3548    0.7395    0.4795       261

    accuracy                         0.7518      1688
   macro avg     0.6477    0.7467    0.6583      1688
weighted avg     0.8500    0.7518    0.7817      1688

Confusion Matrix:
[[1076  351]
 [  68  193]]
Update 4 RAG Model Accuracy: 0.7518
              precision    recall  f1-score   support

        Fake       0.94      0.75      0.84      1427
        Real       0.35      0.74      0.48       261

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9587    0.8227    0.8855       705
           1     0.3812    0.7549    0.5066       102

    accuracy                         0.8141       807
   macro avg     0.6699    0.7888    0.6960       807
weighted avg     0.8857    0.8141    0.8376       807

Confusion Matrix:
[[580 125]
 [ 25  77]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9587    0.8227    0.8855       705
           1     0.3812    0.7549    0.5066       102

    accuracy                         0.8141       807
   macro avg     0.6699    0.7888    0.6960       807
weighted avg     0.8857    0.8141    0.8376       807

Confusion Matrix:
[[580 125]
 [ 25  77]]
Test RAG Model Accuracy: 0.8141
              precision    recall  f1-score   support

        Fake       0.96      0.82      0.89       705
        Real       0.38      0.75      0.51       102

    accuracy        

{'Update 1': ({'eval_loss': 0.3466470241546631,
   'eval_model_preparation_time': 0.0031,
   'eval_accuracy': 0.8485648300890795,
   'eval_f1_per_class': [0.8378664782762275, 0.8579387186629527],
   'eval_precision_per_class': [0.9294670846394985, 0.7897435897435897],
   'eval_recall_per_class': [0.7627009646302251, 0.9390243902439024],
   'eval_confusion_matrix': [[1186, 369], [90, 1386]],
   'eval_report': {'0': {'precision': 0.9294670846394985,
     'recall': 0.7627009646302251,
     'f1-score': 0.8378664782762275,
     'support': 1555.0},
    '1': {'precision': 0.7897435897435897,
     'recall': 0.9390243902439024,
     'f1-score': 0.8579387186629527,
     'support': 1476.0},
    'accuracy': 0.8485648300890795,
    'macro avg': {'precision': 0.8596053371915441,
     'recall': 0.8508626774370638,
     'f1-score': 0.84790259846959,
     'support': 3031.0},
    'weighted avg': {'precision': 0.8614262141458128,
     'recall': 0.8485648300890795,
     'f1-score': 0.8476410169798918,
   

In [None]:
# Update 2 (2018-2019)
print("\nTraining RAG Update 2 Model (2018-2019)...")
model_RAG = train_rag_model(model_RAG, update2_df, retrieval_model, index_update2, headlines_update2, "Update 2 (2018-2019)", test_dataset_RAG)
save_and_zip_model(model_RAG, tokenizer, "fine_tuned_bert_update_2_RAG")
print("\nRolling Evaluation for RAG Update 2 Model:")
rolling_evaluation_RAG(model_RAG, "Update 2", retrieval_model)


Training RAG Update 2 Model (2018-2019)...

Training RAG model for Update 2 (2018-2019)...


Step,Training Loss
250,0.5716
500,0.4662



Classification Report:
              precision    recall  f1-score   support

           0     0.9476    0.9234    0.9353       705
           1     0.5500    0.6471    0.5946       102

    accuracy                         0.8885       807
   macro avg     0.7488    0.7852    0.7650       807
weighted avg     0.8973    0.8885    0.8923       807

Confusion Matrix:
[[651  54]
 [ 36  66]]
Update 2 (2018-2019) RAG Model Test Accuracy after training: 0.8885
Model and tokenizer saved to fine_tuned_bert_update_2_RAG
Created fine_tuned_bert_update_2_RAG.zip

Rolling Evaluation for RAG Update 2 Model:

Evaluating RAG model on Update 2...



Classification Report:
              precision    recall  f1-score   support

           0     0.9313    0.8783    0.9040      1759
           1     0.8002    0.8826    0.8394       971

    accuracy                         0.8799      2730
   macro avg     0.8657    0.8805    0.8717      2730
weighted avg     0.8847    0.8799    0.8810      2730

Confusion Matrix:
[[1545  214]
 [ 114  857]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9313    0.8783    0.9040      1759
           1     0.8002    0.8826    0.8394       971

    accuracy                         0.8799      2730
   macro avg     0.8657    0.8805    0.8717      2730
weighted avg     0.8847    0.8799    0.8810      2730

Confusion Matrix:
[[1545  214]
 [ 114  857]]
Update 2 RAG Model Accuracy: 0.8799
              precision    recall  f1-score   support

        Fake       0.93      0.88      0.90      1759
        Real       0.80      0.88      0.84       971

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.8772    0.8599    0.8685      3006
           1     0.4897    0.5274    0.5079       766

    accuracy                         0.7924      3772
   macro avg     0.6834    0.6937    0.6882      3772
weighted avg     0.7985    0.7924    0.7952      3772

Confusion Matrix:
[[2585  421]
 [ 362  404]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8772    0.8599    0.8685      3006
           1     0.4897    0.5274    0.5079       766

    accuracy                         0.7924      3772
   macro avg     0.6834    0.6937    0.6882      3772
weighted avg     0.7985    0.7924    0.7952      3772

Confusion Matrix:
[[2585  421]
 [ 362  404]]
Update 3 RAG Model Accuracy: 0.7924
              precision    recall  f1-score   support

        Fake       0.88      0.86      0.87      3006
        Real       0.49      0.53      0.51       766

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9230    0.8655    0.8933      1427
           1     0.4514    0.6054    0.5172       261

    accuracy                         0.8252      1688
   macro avg     0.6872    0.7354    0.7052      1688
weighted avg     0.8501    0.8252    0.8352      1688

Confusion Matrix:
[[1235  192]
 [ 103  158]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9230    0.8655    0.8933      1427
           1     0.4514    0.6054    0.5172       261

    accuracy                         0.8252      1688
   macro avg     0.6872    0.7354    0.7052      1688
weighted avg     0.8501    0.8252    0.8352      1688

Confusion Matrix:
[[1235  192]
 [ 103  158]]
Update 4 RAG Model Accuracy: 0.8252
              precision    recall  f1-score   support

        Fake       0.92      0.87      0.89      1427
        Real       0.45      0.61      0.52       261

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9476    0.9234    0.9353       705
           1     0.5500    0.6471    0.5946       102

    accuracy                         0.8885       807
   macro avg     0.7488    0.7852    0.7650       807
weighted avg     0.8973    0.8885    0.8923       807

Confusion Matrix:
[[651  54]
 [ 36  66]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9476    0.9234    0.9353       705
           1     0.5500    0.6471    0.5946       102

    accuracy                         0.8885       807
   macro avg     0.7488    0.7852    0.7650       807
weighted avg     0.8973    0.8885    0.8923       807

Confusion Matrix:
[[651  54]
 [ 36  66]]
Test RAG Model Accuracy: 0.8885
              precision    recall  f1-score   support

        Fake       0.95      0.92      0.94       705
        Real       0.55      0.65      0.59       102

    accuracy        

{'Update 2': ({'eval_loss': 0.2869799733161926,
   'eval_model_preparation_time': 0.0027,
   'eval_accuracy': 0.8798534798534798,
   'eval_f1_per_class': [0.9040374488004681, 0.8393731635651323],
   'eval_precision_per_class': [0.9312839059674503, 0.800186741363212],
   'eval_recall_per_class': [0.8783399658897101, 0.88259526261586],
   'eval_confusion_matrix': [[1545, 214], [114, 857]],
   'eval_report': {'0': {'precision': 0.9312839059674503,
     'recall': 0.8783399658897101,
     'f1-score': 0.9040374488004681,
     'support': 1759.0},
    '1': {'precision': 0.800186741363212,
     'recall': 0.88259526261586,
     'f1-score': 0.8393731635651323,
     'support': 971.0},
    'accuracy': 0.8798534798534798,
    'macro avg': {'precision': 0.8657353236653311,
     'recall': 0.880467614252785,
     'f1-score': 0.8717053061828002,
     'support': 2730.0},
    'weighted avg': {'precision': 0.884655573795027,
     'recall': 0.8798534798534798,
     'f1-score': 0.8810378074218926,
     'supp

In [48]:
# Update 3 (2020-2021)
print("\nTraining RAG Update 3 Model (2020-2021)...")
model_RAG = train_rag_model(model_RAG, update3_df, retrieval_model, index_update3, headlines_update3, "Update 3 (2020-2021)", test_dataset_RAG)
save_and_zip_model(model_RAG, tokenizer, "fine_tuned_bert_update_3_RAG")
print("\nRolling Evaluation for RAG Update 3 Model:")
rolling_evaluation_RAG(model_RAG, "Update 3", retrieval_model)


Training RAG Update 3 Model (2020-2021)...

Training RAG model for Update 3 (2020-2021)...


Step,Training Loss
250,0.4293
500,0.4023


Step,Training Loss
250,0.4293
500,0.4023
750,0.2883



Classification Report:
              precision    recall  f1-score   support

           0     0.9425    0.9532    0.9478       705
           1     0.6489    0.5980    0.6224       102

    accuracy                         0.9083       807
   macro avg     0.7957    0.7756    0.7851       807
weighted avg     0.9054    0.9083    0.9067       807

Confusion Matrix:
[[672  33]
 [ 41  61]]
Update 3 (2020-2021) RAG Model Test Accuracy after training: 0.9083
Model and tokenizer saved to fine_tuned_bert_update_3_RAG
Created fine_tuned_bert_update_3_RAG.zip

Rolling Evaluation for RAG Update 3 Model:

Evaluating RAG model on Update 3...



Classification Report:
              precision    recall  f1-score   support

           0     0.9537    0.9731    0.9633      3006
           1     0.8851    0.8146    0.8484       766

    accuracy                         0.9409      3772
   macro avg     0.9194    0.8938    0.9058      3772
weighted avg     0.9398    0.9409    0.9400      3772

Confusion Matrix:
[[2925   81]
 [ 142  624]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9537    0.9731    0.9633      3006
           1     0.8851    0.8146    0.8484       766

    accuracy                         0.9409      3772
   macro avg     0.9194    0.8938    0.9058      3772
weighted avg     0.9398    0.9409    0.9400      3772

Confusion Matrix:
[[2925   81]
 [ 142  624]]
Update 3 RAG Model Accuracy: 0.9409
              precision    recall  f1-score   support

        Fake       0.95      0.97      0.96      3006
        Real       0.89      0.81      0.85       766

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9170    0.8984    0.9076      1427
           1     0.5000    0.5556    0.5263       261

    accuracy                         0.8454      1688
   macro avg     0.7085    0.7270    0.7170      1688
weighted avg     0.8525    0.8454    0.8487      1688

Confusion Matrix:
[[1282  145]
 [ 116  145]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9170    0.8984    0.9076      1427
           1     0.5000    0.5556    0.5263       261

    accuracy                         0.8454      1688
   macro avg     0.7085    0.7270    0.7170      1688
weighted avg     0.8525    0.8454    0.8487      1688

Confusion Matrix:
[[1282  145]
 [ 116  145]]
Update 4 RAG Model Accuracy: 0.8454
              precision    recall  f1-score   support

        Fake       0.92      0.90      0.91      1427
        Real       0.50      0.56      0.53       261

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9425    0.9532    0.9478       705
           1     0.6489    0.5980    0.6224       102

    accuracy                         0.9083       807
   macro avg     0.7957    0.7756    0.7851       807
weighted avg     0.9054    0.9083    0.9067       807

Confusion Matrix:
[[672  33]
 [ 41  61]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9425    0.9532    0.9478       705
           1     0.6489    0.5980    0.6224       102

    accuracy                         0.9083       807
   macro avg     0.7957    0.7756    0.7851       807
weighted avg     0.9054    0.9083    0.9067       807

Confusion Matrix:
[[672  33]
 [ 41  61]]
Test RAG Model Accuracy: 0.9083
              precision    recall  f1-score   support

        Fake       0.94      0.95      0.95       705
        Real       0.65      0.60      0.62       102

    accuracy        

{'Update 3': ({'eval_loss': 0.1904534548521042,
   'eval_model_preparation_time': 0.0028,
   'eval_accuracy': 0.940880169671262,
   'eval_f1_per_class': [0.9632800922114276, 0.8484024473147519],
   'eval_precision_per_class': [0.9537006847081839, 0.8851063829787233],
   'eval_recall_per_class': [0.9730538922155688, 0.814621409921671],
   'eval_confusion_matrix': [[2925, 81], [142, 624]],
   'eval_report': {'0': {'precision': 0.9537006847081839,
     'recall': 0.9730538922155688,
     'f1-score': 0.9632800922114276,
     'support': 3006.0},
    '1': {'precision': 0.8851063829787233,
     'recall': 0.814621409921671,
     'f1-score': 0.8484024473147519,
     'support': 766.0},
    'accuracy': 0.940880169671262,
    'macro avg': {'precision': 0.9194035338434536,
     'recall': 0.8938376510686199,
     'f1-score': 0.9058412697630898,
     'support': 3772.0},
    'weighted avg': {'precision': 0.9397708768808332,
     'recall': 0.940880169671262,
     'f1-score': 0.9399512809731313,
     'su

In [49]:
# Update 4 (2022)
print("\nTraining RAG Update 4 Model (2022)...")
model_RAG = train_rag_model(model_RAG, update4_df, retrieval_model, index_update4, headlines_update4, "Update 4 (2022)", test_dataset_RAG)
save_and_zip_model(model_RAG, tokenizer, "fine_tuned_bert_update_4_RAG")
print("\nRolling Evaluation for RAG Update 4 Model:")
rolling_evaluation_RAG(model_RAG, "Update 4", retrieval_model)


Training RAG Update 4 Model (2022)...

Training RAG model for Update 4 (2022)...


Step,Training Loss
250,0.3333



Classification Report:
              precision    recall  f1-score   support

           0     0.9291    0.9660    0.9471       705
           1     0.6757    0.4902    0.5682       102

    accuracy                         0.9058       807
   macro avg     0.8024    0.7281    0.7577       807
weighted avg     0.8970    0.9058    0.8992       807

Confusion Matrix:
[[681  24]
 [ 52  50]]
Update 4 (2022) RAG Model Test Accuracy after training: 0.9058
Model and tokenizer saved to fine_tuned_bert_update_4_RAG
Created fine_tuned_bert_update_4_RAG.zip

Rolling Evaluation for RAG Update 4 Model:

Evaluating RAG model on Update 4...



Classification Report:
              precision    recall  f1-score   support

           0     0.9676    0.9832    0.9753      1427
           1     0.8992    0.8199    0.8577       261

    accuracy                         0.9579      1688
   macro avg     0.9334    0.9016    0.9165      1688
weighted avg     0.9570    0.9579    0.9571      1688

Confusion Matrix:
[[1403   24]
 [  47  214]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9676    0.9832    0.9753      1427
           1     0.8992    0.8199    0.8577       261

    accuracy                         0.9579      1688
   macro avg     0.9334    0.9016    0.9165      1688
weighted avg     0.9570    0.9579    0.9571      1688

Confusion Matrix:
[[1403   24]
 [  47  214]]
Update 4 RAG Model Accuracy: 0.9579
              precision    recall  f1-score   support

        Fake       0.97      0.98      0.98      1427
        Real       0.90      0.82      0.86       261

    accu


Classification Report:
              precision    recall  f1-score   support

           0     0.9291    0.9660    0.9471       705
           1     0.6757    0.4902    0.5682       102

    accuracy                         0.9058       807
   macro avg     0.8024    0.7281    0.7577       807
weighted avg     0.8970    0.9058    0.8992       807

Confusion Matrix:
[[681  24]
 [ 52  50]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9291    0.9660    0.9471       705
           1     0.6757    0.4902    0.5682       102

    accuracy                         0.9058       807
   macro avg     0.8024    0.7281    0.7577       807
weighted avg     0.8970    0.9058    0.8992       807

Confusion Matrix:
[[681  24]
 [ 52  50]]
Test RAG Model Accuracy: 0.9058
              precision    recall  f1-score   support

        Fake       0.93      0.97      0.95       705
        Real       0.68      0.49      0.57       102

    accuracy        

{'Update 4': ({'eval_loss': 0.13088315725326538,
   'eval_model_preparation_time': 0.0029,
   'eval_accuracy': 0.9579383886255924,
   'eval_f1_per_class': [0.9753215154675009, 0.8577154308617234],
   'eval_precision_per_class': [0.9675862068965517, 0.8991596638655462],
   'eval_recall_per_class': [0.9831814996496145, 0.8199233716475096],
   'eval_confusion_matrix': [[1403, 24], [47, 214]],
   'eval_report': {'0': {'precision': 0.9675862068965517,
     'recall': 0.9831814996496145,
     'f1-score': 0.9753215154675009,
     'support': 1427.0},
    '1': {'precision': 0.8991596638655462,
     'recall': 0.8199233716475096,
     'f1-score': 0.8577154308617234,
     'support': 261.0},
    'accuracy': 0.9579383886255924,
    'macro avg': {'precision': 0.9333729353810489,
     'recall': 0.901552435648562,
     'f1-score': 0.9165184731646121,
     'support': 1688.0},
    'weighted avg': {'precision': 0.9570060364397434,
     'recall': 0.9579383886255924,
     'f1-score': 0.95713716233829,
     '