In [27]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib faiss-cpu sentence-transformers
import os
import json
import faiss
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer

np.random.seed(42)



In [28]:
# Read the claims CSV; the first row holds the column names.
claims_csv = "Liar2_combined.csv"
df = pd.read_csv(claims_csv, header=0)

# Parse 'date' as YYYY-MM-DD. Values that do not parse become NaT and the
# corresponding rows are dropped.
parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.assign(date=parsed_dates).dropna(subset=['date'])

print(df.head())


   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12


In [29]:
# Inclusive [start, end] date bounds for each chronological block
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'


def _rows_between(frame, start, end):
    """Return a copy of the rows of `frame` whose 'date' lies in [start, end], both ends included."""
    return frame[frame['date'].between(start, end)].copy()


# One independent DataFrame per period
baseline_df = _rows_between(df, baseline_start, baseline_end)
update1_df = _rows_between(df, update1_start, update1_end)
update2_df = _rows_between(df, update2_start, update2_end)
update3_df = _rows_between(df, update3_start, update3_end)
update4_df = _rows_between(df, update4_start, update4_end)
test_df = _rows_between(df, test_start, test_end)

# Display sample sizes for each block
for caption, block in (
    ("Baseline samples:", baseline_df),
    ("Update 1 samples:", update1_df),
    ("Update 2 samples:", update2_df),
    ("Update 3 samples:", update3_df),
    ("Update 4 samples:", update4_df),
    ("Test samples:", test_df),
):
    print(caption, len(block))

Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807


In [30]:
# Class-balance check: count how many baseline rows carry each label value.
print("Baseline distribution:")
print(baseline_df['label'].value_counts())

Baseline distribution:
label
1    6147
0    4785
Name: count, dtype: int64


# FAISS Index Creation


In [None]:
real_articles_file = "News_Category_Dataset_v3.json"

# The file is consumed as JSON Lines: one JSON object per line. Every record
# that parses is kept and gains a 'parsed_date' entry (a datetime, or None when
# the record has no usable 'date').
real_articles = []
# Pass the encoding explicitly so the result does not depend on the platform's
# default text encoding.
with open(real_articles_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # blank lines carry no record
        try:
            art = json.loads(line)
        except json.JSONDecodeError as e:
            print("Error parsing line:", e)
            continue
        if not isinstance(art, dict):
            # A valid JSON value that is not an object cannot hold article fields.
            print("Error parsing line:", f"expected a JSON object, got {type(art).__name__}")
            continue
        art_date = None
        if 'date' in art:
            try:
                art_date = datetime.strptime(art['date'], '%Y-%m-%d')
            except (TypeError, ValueError) as e:
                # Non-string or wrongly formatted date: keep the article, undated.
                print(f"Error parsing date for article: {art.get('date')}, {e}")
        art['parsed_date'] = art_date
        real_articles.append(art)

# Helper function to filter articles up to a given end_date
def filter_articles_by_date(articles, end_date_str):
    """Keep the articles whose 'parsed_date' is on or before ``end_date_str``.

    Args:
        articles: Iterable of dicts, each with a 'parsed_date' entry
            (a datetime or None).
        end_date_str: Cutoff in 'YYYY-MM-DD' form; a 'parsed_date' equal to
            the parsed cutoff is kept.

    Returns:
        List of the matching articles in their original order. Entries whose
        'parsed_date' is None are left out.
    """
    cutoff = datetime.strptime(end_date_str, '%Y-%m-%d')
    kept = []
    for entry in articles:
        stamp = entry['parsed_date']
        if stamp is None:
            continue
        if stamp <= cutoff:
            kept.append(entry)
    return kept

In [None]:
#Get Embeddings

# Sentence-embedding model used to encode headlines for the FAISS indexes built below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to build a FAISS index given a list of articles
def build_faiss_index(articles):
    """Embed article headlines and index them for inner-product search.

    Headlines are stripped and lower-cased, encoded with the notebook-level
    ``embedding_model``, L2-normalised, and added to a ``faiss.IndexFlatIP``.

    Args:
        articles: Non-empty list of dicts, each with a string 'headline'.

    Returns:
        (index, headlines): the FAISS index and the list of cleaned headlines.
        Position i in ``headlines`` corresponds to vector i in the index.

    Raises:
        ValueError: if ``articles`` is empty (there would be no embedding
            dimension to size the index with).
    """
    if len(articles) == 0:
        raise ValueError("build_faiss_index needs at least one article")

    headlines = [art['headline'].strip().lower() for art in articles]
    embeddings = embedding_model.encode(headlines, convert_to_numpy=True)
    # FAISS works on C-contiguous float32 matrices; enforce that layout before
    # the in-place normalisation.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    return index, headlines

# Build cumulative FAISS indexes for each time period: every index covers all
# dated articles up to and including that period's end date.
period_cutoffs = (
    ("Baseline", baseline_end),   # up to 2015-12-31
    ("Update 1", update1_end),
    ("Update 2", update2_end),
    ("Update 3", update3_end),
    ("Update 4", update4_end),
)

built_periods = []
for period_label, cutoff in period_cutoffs:
    period_articles = filter_articles_by_date(real_articles, cutoff)
    period_index, period_headlines = build_faiss_index(period_articles)
    print(period_label, "FAISS index built with", len(period_articles), "articles.")
    built_periods.append((period_articles, period_index, period_headlines))

# Bind the per-period results to the module-level names the rest of the notebook uses.
(
    (articles_baseline, index_baseline, headlines_baseline),
    (articles_update1, index_update1, headlines_update1),
    (articles_update2, index_update2, headlines_update2),
    (articles_update3, index_update3, headlines_update3),
    (articles_update4, index_update4, headlines_update4),
) = built_periods


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Baseline FAISS index built with 130283 articles.
Update 1 FAISS index built with 192270 articles.
Update 2 FAISS index built with 204009 articles.
Update 3 FAISS index built with 208129 articles.
Update 4 FAISS index built with 209527 articles.


In [31]:
def search_similar_articles(query_headline, model, faiss_index, headlines, k=3):
    """Return up to ``k`` indexed headlines closest to ``query_headline``.

    The query is stripped, lower-cased, embedded with ``model`` and
    L2-normalised before the index lookup.

    Args:
        query_headline: Text to search for.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        faiss_index: Index exposing ``search(vectors, k)``.
        headlines: Sequence mapping index positions to headline text.
        k: Number of neighbours to request.

    Returns:
        List of strings of the form "<headline> (dist: <score>)", where
        <score> is the value returned by ``faiss_index.search`` formatted to
        four decimals. Result slots whose index is -1 are skipped.
    """
    cleaned_query = query_headline.strip().lower()
    vector = model.encode([cleaned_query], convert_to_numpy=True)
    faiss.normalize_L2(vector)
    scores, hits = faiss_index.search(vector, k)
    # Only one query was submitted, so row 0 holds all of its results.
    return [
        f"{headlines[hit]} (dist: {score:.4f})"
        for score, hit in zip(scores[0], hits[0])
        if hit != -1
    ]


In [None]:
# Smoke test: look up the 3 nearest headlines for a sample claim in the Update 4 index.
test_query = "Over 4 million Americans get Omicron boosters"
results = search_similar_articles(test_query, embedding_model, index_update4, headlines_update4, k=3)

print(results)


['over 4 million americans roll up sleeves for omicron-targeted covid boosters (dist: 0.7471)', 'u.s. added 678,000 jobs in february as omicron eases (dist: 0.5421)', 'more countries scramble to curb omicron (dist: 0.5338)']


In [None]:
import zipfile

# Save each FAISS index to disk
index_files = {
    "faiss_index_baseline.index": index_baseline,
    "faiss_index_update1.index": index_update1,
    "faiss_index_update2.index": index_update2,
    "faiss_index_update3.index": index_update3,
    "faiss_index_update4.index": index_update4,
}
for index_path, index_obj in index_files.items():
    faiss.write_index(index_obj, index_path)

# Compress the index files into a single zip archive.
# Done with the standard library so the cell does not depend on a `zip` binary;
# mode "w" recreates the archive on every run instead of updating an old one.
with zipfile.ZipFile("faiss_indexes.zip", "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for index_path in index_files:
        archive.write(index_path)
        print(f"  adding: {index_path}")

  adding: faiss_index_baseline.index (deflated 8%)
  adding: faiss_index_update1.index (deflated 7%)
  adding: faiss_index_update2.index (deflated 7%)
  adding: faiss_index_update3.index (deflated 7%)
  adding: faiss_index_update4.index (deflated 7%)


In [None]:
import pickle

# To save the headlines: one list per period, keyed by period name.
period_names = ("baseline", "update1", "update2", "update3", "update4")
period_headline_lists = (
    headlines_baseline,
    headlines_update1,
    headlines_update2,
    headlines_update3,
    headlines_update4,
)
headlines_data = dict(zip(period_names, period_headline_lists))

with open("faiss_headlines.pkl", "wb") as out_file:
    pickle.dump(headlines_data, out_file)

print("Headlines saved to faiss_headlines.pkl")


Headlines saved to faiss_headlines.pkl


# Load FAISS Indexes


In [5]:
import zipfile
import os
from google.colab import files
import faiss

zip_filename = "faiss_indexes.zip"
extract_dir = "faiss_indexes"

# Unpack the archive of saved indexes into extract_dir.
with zipfile.ZipFile(zip_filename, 'r') as archive:
    archive.extractall(extract_dir)
print(f"Extracted {zip_filename} to {extract_dir}")


def _read_period_index(period):
    """Load faiss_index_<period>.index from the extracted folder."""
    return faiss.read_index(os.path.join(extract_dir, f"faiss_index_{period}.index"))


# Load the FAISS indexes from the extracted folder.
index_baseline = _read_period_index("baseline")
index_update1 = _read_period_index("update1")
index_update2 = _read_period_index("update2")
index_update3 = _read_period_index("update3")
index_update4 = _read_period_index("update4")

print("FAISS indexes loaded successfully!")


Extracted faiss_indexes.zip to faiss_indexes
FAISS indexes loaded successfully!


In [6]:
import pickle

# Read back the dict of per-period headline lists from faiss_headlines.pkl.
with open("faiss_headlines.pkl", "rb") as in_file:
    headlines_data = pickle.load(in_file)

# Unpack one list per period into its own module-level name.
(
    headlines_baseline,
    headlines_update1,
    headlines_update2,
    headlines_update3,
    headlines_update4,
) = (
    headlines_data[period]
    for period in ("baseline", "update1", "update2", "update3", "update4")
)

print("Headlines loaded successfully!")


Headlines loaded successfully!


# RAG Models


In [32]:
def prepare_input(article, facts, tokenizer, max_length=512):
    """Encode a claim and its retrieved facts as one fixed-length BERT input.

    Layout before padding: [CLS] article [SEP] fact1 [SEP] fact2 [SEP] ...
    Segment ids are 0 for [CLS], the article and its [SEP], and 1 for every
    fact token and fact [SEP].

    When the sequence is longer than ``max_length`` it is cut to
    ``max_length`` tokens and the last kept position is overwritten with
    [SEP], so a truncated input still ends with the separator instead of
    stopping mid-segment.

    Args:
        article: Text of the claim.
        facts: Iterable of fact strings (may be empty).
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            plus ``cls_token_id``, ``sep_token_id`` and ``pad_token_id``.
        max_length: Exact length of every returned tensor.

    Returns:
        Dict with 'input_ids', 'token_type_ids' and 'attention_mask', each a
        1-D tensor of length ``max_length``. Padding positions use
        ``pad_token_id``, segment id 0 and attention mask 0.
    """
    sep_id = tokenizer.sep_token_id

    # Segment A: [CLS] article [SEP]
    article_tokens = tokenizer.encode(article, add_special_tokens=False)
    input_ids = [tokenizer.cls_token_id] + article_tokens + [sep_id]
    token_type_ids = [0] * len(input_ids)

    # Segment B: each fact followed by its own [SEP]
    for fact in facts:
        fact_tokens = tokenizer.encode(fact, add_special_tokens=False)
        input_ids.extend(fact_tokens)
        input_ids.append(sep_id)
        token_type_ids.extend([1] * (len(fact_tokens) + 1))

    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        token_type_ids = token_type_ids[:max_length]
        if input_ids:
            # Re-terminate the truncated sequence; the segment id of that
            # position is left as it was.
            input_ids[-1] = sep_id

    real_length = len(input_ids)
    pad_length = max_length - real_length
    attention_mask = [1] * real_length + [0] * pad_length
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_length
    token_type_ids = token_type_ids + [0] * pad_length

    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask)
    }


In [33]:
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each claim with retrieved headlines.

    Each item is the output of ``prepare_input`` for the row's 'title' and the
    top-3 results of ``search_similar_articles``, plus a 'labels' tensor built
    from the row's 'label'.

    The retrieved facts for a row depend only on the row's title and on the
    retrieval model / index / headlines fixed at construction, so they are
    cached after the first lookup. Later epochs and repeated evaluation passes
    then skip the embedding and FAISS search.

    Args:
        dataframe: DataFrame with 'title' and 'label' columns; its index is
            reset so rows are addressed by position.
        tokenizer: Tokenizer handed to ``prepare_input``.
        retrieval_model: Encoder handed to ``search_similar_articles``.
        faiss_index: Index handed to ``search_similar_articles``.
        headlines: Headline list handed to ``search_similar_articles``.
        max_length: Fixed sequence length handed to ``prepare_input``.
    """

    def __init__(self, dataframe, tokenizer, retrieval_model, faiss_index, headlines, max_length=512):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.retrieval_model = retrieval_model
        self.faiss_index = faiss_index
        self.headlines = headlines
        # Row position -> list of retrieved fact strings
        self._facts_cache = {}

    def __len__(self):
        return len(self.dataframe)

    def _retrieve_facts(self, idx, article):
        """Return the top 3 hybrid facts for row ``idx``, searching only on first access."""
        key = int(idx)
        facts = self._facts_cache.get(key)
        if facts is None:
            facts = search_similar_articles(article, self.retrieval_model, self.faiss_index, self.headlines, k=3)
            self._facts_cache[key] = facts
        return facts

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        article = row['title']
        label = row['label']
        # Retrieve the top 3 hybrid facts (headline with distance)
        facts = self._retrieve_facts(idx, article)
        encoding = prepare_input(article, facts, self.tokenizer, self.max_length)
        encoding['labels'] = torch.tensor(label, dtype=torch.long)
        return encoding

In [34]:
# Tokenizer and classifier both start from the pretrained "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# num_labels=2 requests a two-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
import inspect

# Newer transformers releases name this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever the installed version accepts
# so the cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Shared hyper-parameters for every training stage: evaluate and checkpoint
# once per epoch, log every 10 steps.
rag_training_args = TrainingArguments(
    output_dir="./rag_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./rag_logs",
    logging_steps=10,
    # Turn off external logging integrations here rather than through the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
    **{_eval_strategy_arg: "epoch"},
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [41]:
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        Dict with a single 'accuracy' entry: the accuracy of the arg-max over
        the last logits axis against ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

def train_rag_model(model, train_df, retrieval_model, faiss_index, headlines, test_dataset, period_name):
    """Fine-tune ``model`` on one period's data and report accuracy on ``test_dataset``.

    The training rows are wrapped in a ``FakeNewsDataset`` (max_length=512)
    that retrieves facts via ``retrieval_model``, ``faiss_index`` and
    ``headlines``. Uses the notebook-level ``tokenizer``,
    ``rag_training_args`` and ``compute_metrics``.

    Args:
        model: Model to train.
        train_df: DataFrame of training rows for this period.
        retrieval_model: Encoder used for fact retrieval.
        faiss_index: FAISS index searched for facts.
        headlines: Headline list aligned with ``faiss_index``.
        test_dataset: Dataset used as the Trainer's eval dataset.
        period_name: Label used in the printed messages.

    Returns:
        The ``model`` object that was passed in, after ``Trainer.train()`` has
        run on it.
    """
    print(f"\nTraining RAG model for {period_name}...")

    trainer_kwargs = dict(
        model=model,
        args=rag_training_args,
        train_dataset=FakeNewsDataset(
            train_df, tokenizer, retrieval_model, faiss_index, headlines, max_length=512
        ),
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.train()

    # One more evaluation pass after training gives the reported accuracy.
    eval_accuracy = trainer.evaluate()['eval_accuracy']
    print(f"{period_name} RAG Model Test Accuracy after training: {eval_accuracy:.4f}")
    return model

def evaluate_model(model, test_dataset, period_name):
    """Evaluate ``model`` on ``test_dataset`` without training and print its accuracy.

    Uses the notebook-level ``rag_training_args`` and ``compute_metrics``.

    Args:
        model: Model to evaluate.
        test_dataset: Dataset used as the Trainer's eval dataset.
        period_name: Label used in the printed message.
    """
    evaluator = Trainer(
        model=model,
        args=rag_training_args,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    eval_accuracy = evaluator.evaluate()['eval_accuracy']
    print(f"{period_name} RAG Model Test Accuracy: {eval_accuracy:.4f}")

In [37]:
# Encoder for retrieval queries. It is the same model name used when the FAISS
# indexes were built, so query vectors and indexed vectors come from one model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [38]:
# Prepare the test dataset using the cumulative FAISS index from Update 4 (up to 2022).
# This one dataset is passed as the evaluation set to every training stage below.
test_dataset_rag = FakeNewsDataset(test_df, tokenizer, embedding_model, index_update4, headlines_update4, max_length=512)

In [39]:
# Disable Weights & Biases logging. NOTE(review): transformers reports this
# environment variable as deprecated (to be removed in v5) and points to
# `report_to` as the replacement.
os.environ["WANDB_DISABLED"] = "true"

In [43]:
import shutil
from google.colab import files

def save_and_zip_model(model, tokenizer, model_dir):
    """
    Write the model and tokenizer into ``model_dir``, pack that directory as
    ``<model_dir>.zip``, then start a browser download of the zip.

    Args:
        model: The trained model; must provide ``save_pretrained``.
        tokenizer: The tokenizer associated with the model; must provide ``save_pretrained``.
        model_dir (str): Directory to save into. The same string, with ".zip"
            appended, is the archive name.
    """
    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(model_dir)
    print(f"Model and tokenizer saved to {model_dir}")

    # make_archive appends the ".zip" suffix to its base name itself.
    archive_name = f"{model_dir}.zip"
    shutil.make_archive(model_dir, 'zip', model_dir)
    print(f"Created {archive_name}")

    # Hand the archive to Colab's download helper.
    files.download(archive_name)

In [42]:
# Train/evaluate the RAG models stage by stage. `model` is reassigned after
# each call, so every stage continues from the weights left by the previous one.

# Baseline (2007-2015): train on baseline_df with facts retrieved from the baseline index.
print("Evaluating Baseline RAG Model (2007-2015)...")
model = train_rag_model(model, baseline_df, embedding_model, index_baseline, headlines_baseline, test_dataset_rag, "Baseline (2007-2015)")


Evaluating Baseline RAG Model (2007-2015)...

Training RAG model for Baseline (2007-2015)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6993,0.61026,0.767038
2,0.6216,0.421618,0.816605
3,0.4091,0.453199,0.80917


Baseline (2007-2015) RAG Model Test Accuracy after training: 0.8092


In [44]:
# Save, zip and download the model as it stands after the baseline stage.
save_and_zip_model(model, tokenizer, "fine_tuned_bert_baseline_RAG")

Model and tokenizer saved to fine_tuned_bert_baseline_RAG
Created fine_tuned_bert_baseline_RAG.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [45]:
# Update 1 (2016-2017): continue training the current model on update1_df,
# retrieving facts from the Update 1 index.
print("Evaluating Update 1 RAG Model (2016-2017)...")
model = train_rag_model(model, update1_df, embedding_model, index_update1, headlines_update1, test_dataset_rag, "Update 1 (2016-2017)")

Evaluating Update 1 RAG Model (2016-2017)...

Training RAG model for Update 1 (2016-2017)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6142,0.378794,0.833953
2,0.4601,0.3471,0.841388
3,0.2723,0.555126,0.807931


Update 1 (2016-2017) RAG Model Test Accuracy after training: 0.8079


In [46]:
# Save, zip and download the model as it stands after the Update 1 stage.
save_and_zip_model(model, tokenizer, "fine_tuned_bert_update_1_RAG")

Model and tokenizer saved to fine_tuned_bert_update_1_RAG
Created fine_tuned_bert_update_1_RAG.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
# Update 2 (2018-2019): continue training the current model on update2_df,
# retrieving facts from the Update 2 index.
print("Evaluating Update 2 RAG Model (2018-2019)...")
model = train_rag_model(model, update2_df, embedding_model, index_update2, headlines_update2, test_dataset_rag, "Update 2 (2018-2019)")

Evaluating Update 2 RAG Model (2018-2019)...

Training RAG model for Update 2 (2018-2019)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.618,0.279365,0.899628
2,0.4176,0.309359,0.873606
3,0.2789,0.429683,0.873606


Update 2 (2018-2019) RAG Model Test Accuracy after training: 0.8736


In [48]:
# Save, zip and download the model as it stands after the Update 2 stage.
save_and_zip_model(model, tokenizer, "fine_tuned_bert_update_2_RAG")

Model and tokenizer saved to fine_tuned_bert_update_2_RAG
Created fine_tuned_bert_update_2_RAG.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
# Update 3 (2020-2021): continue training the current model on update3_df,
# retrieving facts from the Update 3 index.
print("Evaluating Update 3 RAG Model (2020-2021)...")
model = train_rag_model(model, update3_df, embedding_model, index_update3, headlines_update3, test_dataset_rag, "Update 3 (2020-2021)")

Evaluating Update 3 RAG Model (2020-2021)...

Training RAG model for Update 3 (2020-2021)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4483,0.265744,0.899628
2,0.2513,0.322785,0.903346
3,0.2423,0.421403,0.900867


Update 3 (2020-2021) RAG Model Test Accuracy after training: 0.9009


In [50]:
# Save, zip and download the model as it stands after the Update 3 stage.
save_and_zip_model(model, tokenizer, "fine_tuned_bert_update_3_RAG")

Model and tokenizer saved to fine_tuned_bert_update_3_RAG
Created fine_tuned_bert_update_3_RAG.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [51]:
# Update 4 (2022): continue training the current model on update4_df,
# retrieving facts from the Update 4 index.
print("Evaluating Update 4 RAG Model (2022)...")
model = train_rag_model(model, update4_df, embedding_model, index_update4, headlines_update4, test_dataset_rag, "Update 4 (2022)")

Evaluating Update 4 RAG Model (2022)...

Training RAG model for Update 4 (2022)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4315,0.244957,0.890954
2,0.285,0.330838,0.899628
3,0.096,0.458483,0.890954


Update 4 (2022) RAG Model Test Accuracy after training: 0.8910


In [52]:
# Save, zip and download the model as it stands after the Update 4 stage.
save_and_zip_model(model, tokenizer, "fine_tuned_bert_update_4_RAG")

Model and tokenizer saved to fine_tuned_bert_update_4_RAG
Created fine_tuned_bert_update_4_RAG.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>