In [1]:
!pip install transformers datasets torch scikit-learn pandas matplotlib faiss-cpu sentence-transformers
import json
import os
import pickle
from datetime import datetime

import faiss
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Seed NumPy's global random generator so any NumPy-based randomness is repeatable across runs.
np.random.seed(42)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [2]:
# Read Liar2_combined.csv, parse the 'date' column as YYYY-MM-DD, and discard rows
# whose date could not be parsed (errors='coerce' turns those into NaT first).
df = (
    pd.read_csv("Liar2_combined.csv", header=0)
    .assign(date=lambda claims: pd.to_datetime(claims['date'], format='%Y-%m-%d', errors='coerce'))
    .dropna(subset=['date'])
)

print(df.head())

   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12


In [3]:

# Time blocks for the experiment. Every (start, end) pair is an inclusive date range.
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'


def _rows_in_range(frame, start, end):
    """Return an independent copy of the rows whose 'date' falls in [start, end] (both ends inclusive)."""
    dates = frame['date']
    return frame[(dates >= start) & (dates <= end)].copy()


# One frame per time block, cut from the full claims frame.
baseline_df = _rows_in_range(df, baseline_start, baseline_end)
update1_df = _rows_in_range(df, update1_start, update1_end)
update2_df = _rows_in_range(df, update2_start, update2_end)
update3_df = _rows_in_range(df, update3_start, update3_end)
update4_df = _rows_in_range(df, update4_start, update4_end)
test_df = _rows_in_range(df, test_start, test_end)

# Report how many rows landed in each block.
for _label, _block in (
    ("Baseline samples:", baseline_df),
    ("Update 1 samples:", update1_df),
    ("Update 2 samples:", update2_df),
    ("Update 3 samples:", update3_df),
    ("Update 4 samples:", update4_df),
    ("Test samples:", test_df),
):
    print(_label, len(_block))

print(baseline_df.head())


Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807
    label                                              title       date
2       0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
4       0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12
6       0  Says Jeff Reardon cut elementary school music ... 2012-05-08
11      0  Says PolitiFact "listed Governor Scott Walker ... 2012-06-04
12      1  Guantanamo has "never been a key component of ... 2015-12-27


In [4]:
# Read headlines.csv, derive 'parsed_date' from the YYYYMMDD 'Date' column, and
# discard rows whose date could not be parsed (errors='coerce' yields NaT for those).
df_headlines = (
    pd.read_csv("headlines.csv", header=0)
    .assign(parsed_date=lambda raw: pd.to_datetime(raw['Date'], format='%Y%m%d', errors='coerce'))
    .dropna(subset=['parsed_date'])
)

print("Columns:", df_headlines.columns.tolist())
print("CSV sample:")
print(df_headlines.head())

Columns: ['Date', 'Publication', 'Headline', 'URL', 'parsed_date']
CSV sample:
       Date     Publication  \
0  20070101  New York Times   
1  20070101  New York Times   
2  20070101  New York Times   
3  20070101  New York Times   
4  20070101  New York Times   

                                            Headline  \
0               Rush to Hang Hussein Was  Questioned   
1  News Analysis: For Sunnis, Dictators End Is O...   
2                            Hard Choices Over Video   
3     States Take Lead on Ethics Rules for Lawmakers   
4  Spitzer Arrives With Mandate, but Faces Challe...   

                                                 URL parsed_date  
0  http://www.nytimes.com/2007/01/01/world/middle...  2007-01-01  
1  http://www.nytimes.com/2007/01/01/world/middle...  2007-01-01  
2  http://www.nytimes.com/2007/01/01/world/middle...  2007-01-01  
3  http://www.nytimes.com/2007/01/01/us/01ethics....  2007-01-01  
4  http://www.nytimes.com/2007/01/01/nyregion/01e...  2007-01-

In [5]:
# Convert the filtered dataframes into a list of articles (dictionaries)
def df_to_articles(df, headline_col="Headline", date_col="parsed_date"):
    """Turn a headlines frame into a list of article dicts.

    Args:
        df: DataFrame containing at least ``headline_col`` and ``date_col``.
        headline_col: Name of the column holding the headline text.
        date_col: Name of the column holding the (already parsed) date.

    Returns:
        A list of ``{"headline": <stripped text>, "parsed_date": <date value>}``
        dicts in row order. Rows whose headline is not a ``str`` (e.g. NaN/None)
        are skipped, so the result can be shorter than ``df``.
    """
    # Walk the two columns in lock-step instead of using DataFrame.iterrows():
    # iterrows builds a full Series object for every row, which is very slow
    # on frames with hundreds of thousands of rows.
    return [
        {"headline": headline.strip(), "parsed_date": parsed_date}
        for headline, parsed_date in zip(df[headline_col], df[date_col])
        if isinstance(headline, str)
    ]

In [14]:
# Upper bound on the number of headlines kept per time period, and the seed
# that makes the down-sampling repeatable.
MAX_HEADLINES_PER_PERIOD = 200000
HEADLINE_SAMPLE_SEED = 42


def sample_headlines(frame, end, start=None,
                     max_rows=MAX_HEADLINES_PER_PERIOD, seed=HEADLINE_SAMPLE_SEED):
    """Select the headlines of one time period and cap their number.

    Args:
        frame: Headlines DataFrame with a 'parsed_date' column.
        end: Inclusive upper bound on 'parsed_date'.
        start: Exclusive lower bound on 'parsed_date'; ``None`` means no lower bound.
        max_rows: Maximum number of rows to keep.
        seed: ``random_state`` used when down-sampling.

    Returns:
        The rows with ``start < parsed_date <= end``. If there are more than
        ``max_rows`` of them, a random sample of exactly ``max_rows`` rows.
    """
    in_period = frame['parsed_date'] <= end
    if start is not None:
        in_period &= frame['parsed_date'] > start
    period_df = frame[in_period]
    if len(period_df) > max_rows:
        period_df = period_df.sample(n=max_rows, random_state=seed)
    return period_df


# The baseline period has no lower bound: it takes everything up to baseline_end.
baseline_df_sample = sample_headlines(df_headlines, end=baseline_end)
# Each update period starts right after the previous period's end date.
update1_df_sample = sample_headlines(df_headlines, start=baseline_end, end=update1_end)
update2_df_sample = sample_headlines(df_headlines, start=update1_end, end=update2_end)
update3_df_sample = sample_headlines(df_headlines, start=update2_end, end=update3_end)
update4_df_sample = sample_headlines(df_headlines, start=update3_end, end=update4_end)

# Report the size of each period's sample.
for _period_sample in (
    baseline_df_sample,
    update1_df_sample,
    update2_df_sample,
    update3_df_sample,
    update4_df_sample,
):
    print(len(_period_sample))

200000
200000
200000
200000
200000


In [21]:
# Convert every sampled headline frame into a list of article dicts
# (df_to_articles skips rows whose headline is not a string).
(
    articles_baseline,
    articles_update1,
    articles_update2,
    articles_update3,
    articles_update4,
) = map(
    df_to_articles,
    (
        baseline_df_sample,
        update1_df_sample,
        update2_df_sample,
        update3_df_sample,
        update4_df_sample,
    ),
)

# Show how many articles each period ended up with.
for _period_articles in (
    articles_baseline,
    articles_update1,
    articles_update2,
    articles_update3,
    articles_update4,
):
    print(len(_period_articles))

197827
198134
198202
198273
198288


In [22]:
# Spot-check one converted article to confirm the dict layout (headline text plus parsed date).
print(articles_baseline[1])

{'headline': 'Hillary: How she needs not to beat  herself', 'parsed_date': Timestamp('2015-04-10 00:00:00')}


In [23]:
# Shared sentence encoder ("all-MiniLM-L6-v2" checkpoint); used to embed indexed headlines and search queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def build_faiss_index(articles, model=None):
    """Embed article headlines and index them for similarity search.

    Args:
        articles: Iterable of dicts that each carry a string under "headline".
        model: Optional encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
            When omitted, the notebook-level ``embedding_model`` is used.

    Returns:
        ``(index, headlines)`` where ``index`` is a ``faiss.IndexFlatIP`` holding
        one L2-normalised embedding per headline and ``headlines`` is the list of
        lower-cased headline strings; ``headlines[i]`` is the text of vector ``i``.

    Raises:
        ValueError: If ``articles`` is empty (there is nothing to embed and the
            embedding dimension cannot be derived).
    """
    headlines = [art['headline'].lower() for art in articles]
    if not headlines:
        raise ValueError("build_faiss_index() needs at least one article, got an empty collection")

    encoder = embedding_model if model is None else model
    embeddings = encoder.encode(headlines, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix; this is a no-op when the
    # encoder already returns one.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Unit-length vectors make the inner product equal to cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index, headlines

In [24]:
# Build the FAISS index for the baseline period; the returned lower-cased headline list
# is kept alongside it so search hits (row positions) can be mapped back to text.
index_baseline, headlines_baseline = build_faiss_index(articles_baseline)
print("Baseline FAISS index built with", len(articles_baseline), "articles.")

Baseline FAISS index built with 197827 articles.


In [25]:
# Index the update-1 period's headlines, keeping the parallel list of lower-cased texts.
index_update1, headlines_update1   = build_faiss_index(articles_update1)

print("Update 1 FAISS index built with", len(articles_update1), "articles.")

Update 1 FAISS index built with 198134 articles.


In [None]:
# Index the update-2 period's headlines, keeping the parallel list of lower-cased texts.
index_update2, headlines_update2   = build_faiss_index(articles_update2)

print("Update 2 FAISS index built with", len(articles_update2), "articles.")

In [None]:
# Index the update-3 period's headlines, keeping the parallel list of lower-cased texts.
index_update3, headlines_update3   = build_faiss_index(articles_update3)
print("Update 3 FAISS index built with", len(articles_update3), "articles.")

In [None]:
# Index the update-4 period's headlines, keeping the parallel list of lower-cased texts.
index_update4, headlines_update4   = build_faiss_index(articles_update4)
print("Update 4 FAISS index built with", len(articles_update4), "articles.")

In [29]:
def search_similar_articles(query_headline, model, faiss_index, headlines, k=3):
    """Retrieve the indexed headlines most similar to a query headline.

    Args:
        query_headline: Text to search for. It is stripped and lower-cased
            before encoding.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        faiss_index: FAISS index to search.
        headlines: Sequence where ``headlines[i]`` is the text stored at
            position ``i`` of ``faiss_index``.
        k: Maximum number of results to return.

    Returns:
        A list of at most ``k`` strings of the form ``"<headline> (dist: <score>)"``,
        in the order FAISS returned them. The list is empty when the query is not
        a string, is blank, or ``k`` is not positive.

    Notes:
        - The "dist" label is kept for output compatibility. For an inner-product
          index over L2-normalised vectors (what ``build_faiss_index`` creates)
          the number is a cosine similarity, so larger means more similar.
        - If the index contains the same headline more than once, the same text
          can appear several times in the results.
    """
    # Claims with a missing title arrive as NaN/None; treat them like a blank
    # query instead of crashing on .strip().
    if not isinstance(query_headline, str) or k <= 0:
        return []
    query = query_headline.strip().lower()
    if not query:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    scores, indices = faiss_index.search(query_embedding, k)

    # FAISS pads with -1 when fewer than k vectors are available; skip those slots.
    return [
        f"{headlines[idx]} (dist: {score:.4f})"
        for score, idx in zip(scores[0], indices[0])
        if idx != -1
    ]

In [30]:
# Smoke test: look up one claim against the update-4 index and show the top three hits.
test_query = "Over 4 million Americans get Omicron boosters"
results = search_similar_articles(
    query_headline=test_query,
    model=embedding_model,
    faiss_index=index_update4,
    headlines=headlines_update4,
    k=3,
)
print("Search results:", results)

Search results: ["new omicron boosters are now available, but it's unclear how effective they will be (dist: 0.7497)", "new omicron boosters are now available, but it's unclear how effective they will be (dist: 0.7497)", 'boosters 90% effective at preventing omicron… (dist: 0.7166)']


In [31]:
# Save each FAISS index to disk
faiss.write_index(index_baseline, "faiss_index_baseline.index")
faiss.write_index(index_update1, "faiss_index_update1.index")
faiss.write_index(index_update2, "faiss_index_update2.index")
faiss.write_index(index_update3, "faiss_index_update3.index")
faiss.write_index(index_update4, "faiss_index_update4.index")

# Compress the index files into a single zip archive
!zip faiss_indexes.zip faiss_index_baseline.index faiss_index_update1.index faiss_index_update2.index faiss_index_update3.index faiss_index_update4.index

  adding: faiss_index_baseline.index (deflated 8%)
  adding: faiss_index_update1.index (deflated 7%)
  adding: faiss_index_update2.index (deflated 8%)
  adding: faiss_index_update3.index (deflated 7%)
  adding: faiss_index_update4.index (deflated 7%)


In [33]:
# Save the lower-cased headline lists returned by build_faiss_index, keyed by period.
# Position i in each list is the text of vector i in that period's FAISS index,
# so this file is needed to turn search hits back into headlines.
# `pickle` is imported in the notebook's top import cell.
headlines_data = {
    "baseline": headlines_baseline,
    "update1": headlines_update1,
    "update2": headlines_update2,
    "update3": headlines_update3,
    "update4": headlines_update4,
}

# NOTE: only load this pickle from a trusted location; unpickling can run arbitrary code.
with open("faiss_headlines.pkl", "wb") as f:
    pickle.dump(headlines_data, f)

print("Headlines saved to faiss_headlines.pkl")

Headlines saved to faiss_headlines.pkl
