In [None]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib faiss-cpu sentence-transformers
import os
import json
import faiss
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer

np.random.seed(42)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [None]:
# Load the combined LIAR2 claims; the first CSV row is the header.
df = pd.read_csv("Liar2_combined.csv", header = 0)

# Parse the date column strictly as YYYY-MM-DD. errors='coerce' turns any value
# that does not match into NaT instead of raising, and those rows are dropped
# on the next line so every remaining row can be assigned to a time period.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.dropna(subset=['date'])

print(df.head())


   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12


In [None]:
# Load the live evaluation set and align its column names with the LIAR2 frame
# ('headline' -> 'title', 'rating' -> 'label') so both can share one pipeline.
df_live = (
    pd.read_csv("balanced_live_dataset.csv", header=0)
    .rename(columns={'headline': 'title', 'rating': 'label'})
)

print(df_live.head())

                                               title  label
0  a photograph authentically depicts then-britis...      1
1  social media users are claiming that trump jus...      0
2  videos and still images shared to social media...      1
3  in march 2025, u.s. immigration and customs en...      1
4  social media users are circulating a video cli...      0


In [None]:
# NOTE(review): baseline_df is only created in the "Defining our Date Ranges"
# cell further down, so this cell raises NameError on Restart & Run All.
# Move it below that cell (or delete it) before sharing the notebook.
print(baseline_df.head())

    label                                              title       date
2       0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
4       0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12
6       0  Says Jeff Reardon cut elementary school music ... 2012-05-08
11      0  Says PolitiFact "listed Governor Scott Walker ... 2012-06-04
12      1  Guantanamo has "never been a key component of ... 2015-12-27


In [None]:
# Date ranges for the temporal splits. Both endpoints of every range are
# inclusive (the filters below use >= start and <= end).
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'

# Slice df into one frame per period: the baseline training block, four
# incremental update blocks, and the held-out test block. Rows dated outside
# 2007-01-01..2023-12-31 fall into none of the frames. .copy() detaches each
# slice from df so later edits do not touch (or warn about) the parent frame.
baseline_df = df[(df['date'] >= baseline_start) & (df['date'] <= baseline_end)].copy()
update1_df = df[(df['date'] >= update1_start) & (df['date'] <= update1_end)].copy()
update2_df = df[(df['date'] >= update2_start) & (df['date'] <= update2_end)].copy()
update3_df = df[(df['date'] >= update3_start) & (df['date'] <= update3_end)].copy()
update4_df = df[(df['date'] >= update4_start) & (df['date'] <= update4_end)].copy()
test_df = df[(df['date'] >= test_start) & (df['date'] <= test_end)].copy()

# Display sample sizes for each block
print("Baseline samples:", len(baseline_df))
print("Update 1 samples:", len(update1_df))
print("Update 2 samples:", len(update2_df))
print("Update 3 samples:", len(update3_df))
print("Update 4 samples:", len(update4_df))
print("Test samples:", len(test_df))

Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807


In [None]:
print("Baseline distribution:")
print(baseline_df['label'].value_counts())

Baseline distribution:
label
1    6147
0    4785
Name: count, dtype: int64


# FAISS Index Creation


In [None]:
real_articles_file = "News_Category_Dataset_v3.json"

# Read the JSON-lines article dump: one JSON object per line. Every article that
# parses is kept and gains a 'parsed_date' key (a datetime, or None when the
# 'date' field is absent or not in YYYY-MM-DD form). Lines that fail to parse
# are reported and skipped so a single corrupt record does not abort the load.
real_articles = []
# The encoding is stated explicitly so the file is decoded as UTF-8 on every
# platform instead of whatever the locale default happens to be.
with open(real_articles_file, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            art = json.loads(line)
            art_date = None
            if 'date' in art:
                try:
                    art_date = datetime.strptime(art['date'], '%Y-%m-%d')
                except Exception as e:
                    # Keep the article, but without a usable date.
                    print(f"Error parsing date for article: {art.get('date')}, {e}")
            art['parsed_date'] = art_date
            real_articles.append(art)
        except Exception as e:
            print("Error parsing line:", e)

def filter_articles_by_date(articles, end_date_str):
    """Return the articles dated on or before ``end_date_str``.

    Args:
        articles: iterable of dicts, each carrying a ``'parsed_date'`` entry
            that is either a ``datetime`` or ``None``.
        end_date_str: inclusive cut-off date in ``YYYY-MM-DD`` form.

    Returns:
        A new list holding, in their original order, the articles whose
        ``'parsed_date'`` is set and not later than the cut-off. Articles
        without a parsed date are excluded.
    """
    cutoff = datetime.strptime(end_date_str, '%Y-%m-%d')
    kept = []
    for article in articles:
        published = article['parsed_date']
        if published is None:
            continue
        if published <= cutoff:
            kept.append(article)
    return kept

Error parsing line: Unterminated string starting at: line 1 column 177 (char 176)


In [None]:
#Get Embeddings

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def build_faiss_index(articles):
    """Embed article headlines and index them for cosine-similarity search.

    Args:
        articles: sequence of dicts, each with a string ``'headline'`` entry.

    Returns:
        ``(index, headlines)`` where ``headlines`` are the stripped, lower-cased
        headline strings and ``index`` is a ``faiss.IndexFlatIP`` whose i-th
        vector is the embedding of ``headlines[i]``.

    Raises:
        ValueError: if ``articles`` is empty, since an index dimension cannot
            be derived from zero embeddings.
    """
    headlines = [art['headline'].strip().lower() for art in articles]
    if not headlines:
        raise ValueError("Cannot build a FAISS index from an empty article list")

    embeddings = embedding_model.encode(headlines, convert_to_numpy=True)
    # Normalised in place: with unit-length vectors the inner product used by
    # IndexFlatIP equals cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index, headlines

# Build cumulative FAISS indexes for each time period: every index holds all
# articles dated up to and including that period's end date, so each one is a
# superset of the previous one.
# NOTE(review): each call re-embeds every earlier headline from scratch, so the
# baseline headlines are encoded five times in this cell.
# Baseline (up to 2015-12-31)
articles_baseline = filter_articles_by_date(real_articles, baseline_end)
index_baseline, headlines_baseline = build_faiss_index(articles_baseline)
print("Baseline FAISS index built with", len(articles_baseline), "articles.")

# Update 1 (up to update1_end)
articles_update1 = filter_articles_by_date(real_articles, update1_end)
index_update1, headlines_update1 = build_faiss_index(articles_update1)
print("Update 1 FAISS index built with", len(articles_update1), "articles.")

# Update 2 (up to update2_end)
articles_update2 = filter_articles_by_date(real_articles, update2_end)
index_update2, headlines_update2 = build_faiss_index(articles_update2)
print("Update 2 FAISS index built with", len(articles_update2), "articles.")

# Update 3 (up to update3_end)
articles_update3 = filter_articles_by_date(real_articles, update3_end)
index_update3, headlines_update3 = build_faiss_index(articles_update3)
print("Update 3 FAISS index built with", len(articles_update3), "articles.")

# Update 4 (up to update4_end)
articles_update4 = filter_articles_by_date(real_articles, update4_end)
index_update4, headlines_update4 = build_faiss_index(articles_update4)
print("Update 4 FAISS index built with", len(articles_update4), "articles.")


Baseline FAISS index built with 130283 articles.
Update 1 FAISS index built with 192270 articles.
Update 2 FAISS index built with 204009 articles.
Update 3 FAISS index built with 208129 articles.
Update 4 FAISS index built with 209527 articles.


In [None]:
test_query = "Over 4 million Americans get Omicron boosters"
# results = get_top_k_similar_headlines(test_query, embedding_model, index_update4, headlines_update4, k=3)

# print(results)


In [None]:
# Save each FAISS index to disk
faiss.write_index(index_baseline, "faiss_index_baseline.index")
faiss.write_index(index_update1, "faiss_index_update1.index")
faiss.write_index(index_update2, "faiss_index_update2.index")
faiss.write_index(index_update3, "faiss_index_update3.index")
faiss.write_index(index_update4, "faiss_index_update4.index")

# Compress the index files into a single zip archive
!zip faiss_indexes.zip faiss_index_baseline.index faiss_index_update1.index faiss_index_update2.index faiss_index_update3.index faiss_index_update4.index

  adding: faiss_index_baseline.index (deflated 8%)
  adding: faiss_index_update1.index (deflated 7%)
  adding: faiss_index_update2.index (deflated 7%)
  adding: faiss_index_update3.index (deflated 7%)
  adding: faiss_index_update4.index (deflated 7%)


In [None]:
import pickle

# To save the headlines:
headlines_data = {
    "baseline": headlines_baseline,
    "update1": headlines_update1,
    "update2": headlines_update2,
    "update3": headlines_update3,
    "update4": headlines_update4,
}

with open("faiss_headlines.pkl", "wb") as f:
    pickle.dump(headlines_data, f)

print("Headlines saved to faiss_headlines.pkl")


Headlines saved to faiss_headlines.pkl


# Load FAISS Indexes


In [None]:
import zipfile
import os
from google.colab import files
import faiss

# Unpack the archive of pre-built indexes into a local folder.
# NOTE(review): this reads "faiss_indexes_new.zip", whereas the save cell above
# writes "faiss_indexes.zip" — the archive must be supplied separately (e.g.
# uploaded), so this cell does not reproduce from a fresh run of the notebook.
zip_filename = "faiss_indexes_new.zip"
extract_dir = "faiss_indexes"
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted {zip_filename} to {extract_dir}")

# Load the FAISS indexes from the extracted folder. These assignments replace
# any index_* variables built earlier in the session.
index_baseline = faiss.read_index(os.path.join(extract_dir, "faiss_index_baseline.index"))
index_update1 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update1.index"))
index_update2 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update2.index"))
index_update3 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update3.index"))
index_update4 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update4.index"))

print("FAISS indexes loaded successfully!")


Extracted faiss_indexes_new.zip to faiss_indexes
FAISS indexes loaded successfully!


In [None]:
import pickle

# Restore the headline lists that accompany the loaded FAISS indexes.
# NOTE(review): this reads "faiss_headlines_new.pkl", whereas the save cell
# above writes "faiss_headlines.pkl" — confirm the file matches the indexes
# loaded from the zip archive, otherwise retrieved ids map to wrong headlines.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that were produced by this notebook or another trusted source.
with open("faiss_headlines_new.pkl", "rb") as f:
    headlines_data = pickle.load(f)

# One headline list per index, keyed by period name.
headlines_baseline = headlines_data["baseline"]
headlines_update1  = headlines_data["update1"]
headlines_update2  = headlines_data["update2"]
headlines_update3  = headlines_data["update3"]
headlines_update4  = headlines_data["update4"]

print("Headlines loaded successfully!")


Headlines loaded successfully!


In [None]:
zip_filename = "faiss_indexes_live.zip"
extract_dir = "faiss_indexes_live"
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted {zip_filename} to {extract_dir}")

# Load the FAISS indexes from the extracted folder.
index_live = faiss.read_index(os.path.join(extract_dir, "faiss_index_baseline.index"))


Extracted faiss_indexes_live.zip to faiss_indexes_live


In [None]:
with open("faiss_headlines_live.pkl", "rb") as f:
    headlines_data = pickle.load(f)

headlines_live = headlines_data["baseline"]

# RAG Models


In [None]:
sim_model = SentenceTransformer("all-MiniLM-L6-v2")
def get_top_k_similar_headlines(query_headline, k=3, faiss_index = None, headlines = None):
    """Retrieve the ``k`` indexed headlines most similar to ``query_headline``.

    The query is stripped and lower-cased, embedded with ``sim_model``,
    L2-normalised and searched against ``faiss_index``.

    Args:
        query_headline: the claim/headline text to look up.
        k: number of neighbours to request from the index.
        faiss_index: FAISS index to search (required despite the ``None``
            default, which is kept for signature compatibility).
        headlines: sequence mapping index positions to headline text; must be
            the list the index was built from (required).

    Returns:
        A list of at most ``k`` headline strings, best match first. Only the
        text is returned; the similarity scores are discarded.

    Raises:
        ValueError: if ``faiss_index`` or ``headlines`` is not supplied.
    """
    if faiss_index is None or headlines is None:
        raise ValueError("faiss_index and headlines must both be provided")

    query = query_headline.strip().lower()
    query_embedding = sim_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    _, indices = faiss_index.search(query_embedding, k)
    # An id of -1 marks an unfilled result slot (fewer than k hits); skip those.
    return [str(headlines[idx]) for idx in indices[0] if idx != -1]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def compute_entailment_score_for_sample(headline, similar_headlines, entailment_model, entailment_tokenizer):
    """
    Compute the aggregated entailment score of a headline against retrieved headlines.

    Each retrieved headline is paired with ``headline`` as
    ``(headline, retrieved)`` and scored by the NLI model in a single batch.
    The aggregate is the maximum ENTAILMENT probability over all pairs.

    Args:
        headline: the claim text being checked.
        similar_headlines: retrieved headline strings to compare against.
        entailment_model: sequence-classification NLI model whose output has a
            ``.logits`` tensor of shape (num_pairs, 3).
        entailment_tokenizer: tokenizer matching ``entailment_model``.

    Returns:
        A Python float in [0, 1]. Returns 0.0 when ``similar_headlines`` is
        empty (nothing was retrieved, so there is no supporting evidence),
        instead of failing on an empty batch.
    """
    similar_headlines = list(similar_headlines)
    if not similar_headlines:
        return 0.0

    device = next(entailment_model.parameters()).device
    # text / text_pair lists are zipped element-wise into sentence pairs.
    encoded = entailment_tokenizer(
        text=[headline] * len(similar_headlines),
        text_pair=similar_headlines,
        return_tensors='pt',
        truncation=True,
        padding=True,
    )
    encoded = {key: value.to(device) for key, value in encoded.items()}
    with torch.no_grad():
        logits = entailment_model(**encoded).logits  # shape: (k, 3)
    # Index 2 is taken as the ENTAILMENT class, as for roberta-large-mnli;
    # check model.config.id2label before swapping in a different NLI model.
    entailment_probs = F.softmax(logits, dim=-1)[:, 2]
    # Max aggregation: one strongly entailing headline is enough.
    return torch.max(entailment_probs).item()

In [None]:
def preprocess_hybrid_data(df, k, entailment_model, entailment_tokenizer, faiss_index, headlines):
    """
    Precompute the retrieval-based entailment score for every row of ``df``.

    For each row, the ``k`` most similar indexed headlines are retrieved for the
    row's title and reduced to one entailment score. A running count is printed
    every 1000 rows as a progress indicator.

    Args:
        df: DataFrame with 'title' and 'label' columns.
        k: number of similar headlines to retrieve per title.
        entailment_model, entailment_tokenizer: NLI model and its tokenizer.
        faiss_index, headlines: retrieval index and its parallel headline list.

    Returns:
        A list with one dict per row, in row order, with keys
        'headlines' (the title), 'entailment_scores' (float) and 'label'.
    """
    records = []
    for processed, position in enumerate(range(len(df)), start=1):
        if processed % 1000 == 0:
            print(processed)
        row = df.iloc[position]
        claim = row['title']
        target = row['label']
        retrieved = get_top_k_similar_headlines(claim, k, faiss_index, headlines)
        support = compute_entailment_score_for_sample(claim, retrieved, entailment_model, entailment_tokenizer)
        records.append({"headlines": claim, "entailment_scores": support, "label": target})
    return records

In [None]:
class HybridFakeNewsPrecomputedDataset(Dataset):
    """Thin dataset wrapper around a list of precomputed sample dicts.

    Supports integer indexing (returns one sample dict) and list indexing
    (returns one dict of lists, i.e. a column-oriented batch).
    """

    def __init__(self, precomputed_samples):
        # Samples are stored as given; no copy is made.
        self._data = precomputed_samples

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        # Single position: hand back the stored sample unchanged.
        if not isinstance(idx, list):
            return self._data[idx]
        # List of positions: pivot the selected rows into a dict of lists,
        # using the first selected row to decide which keys exist.
        rows = [self._data[position] for position in idx]
        return {key: [row[key] for row in rows] for key in rows[0].keys()}

    @property
    def column_names(self):
        """Keys of the first sample, or an empty list when there are no samples."""
        if len(self._data) > 0:
            return list(self._data[0].keys())
        return []







In [None]:
import torch.nn as nn
class HybridFakeNewsClassifierPrecomputed(nn.Module):
    """BERT headline classifier with an entailment-based override.

    Every headline is classified by a two-class BERT model. If a sample's
    precomputed entailment score exceeds ``entailment_threshold``, its logits
    are replaced by a fixed vector that strongly favours "real news" (label 1).
    """

    def __init__(self,
                 bert_model_name='bert-base-uncased',
                 entailment_threshold=0.8):
        """
        Args:
            bert_model_name: checkpoint used for both the tokenizer and the
                two-label sequence classifier.
            entailment_threshold: scores strictly above this value trigger the
                override to label 1.
        """
        super().__init__()
        # BERT classifier for raw input.
        self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)
        self.entailment_threshold = entailment_threshold
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, headlines, entailment_scores, labels=None):
        """
        Args:
            headlines: list of strings (batch of raw headlines).
            entailment_scores: tensor of shape (batch_size,) with the
                precomputed scores.
            labels: optional tensor of true labels.

        Returns:
            ``{'logits': ...}`` of shape (batch_size, 2), plus ``'loss'``
            (cross-entropy on the final logits) when ``labels`` is given.
        """
        device = next(self.bert_model.parameters()).device
        # Tokenize and classify headlines with BERT.
        encoded = self.bert_tokenizer(headlines, return_tensors="pt", truncation=True, padding=True)
        encoded = {key: value.to(device) for key, value in encoded.items()}
        bert_logits = self.bert_model(**encoded).logits  # shape: (batch_size, 2)

        # Rows whose entailment score clears the threshold get the override.
        high_mask = entailment_scores.to(device) > self.entailment_threshold  # (batch_size,)
        # new_tensor inherits dtype and device from the BERT logits, so the
        # override also works when the model runs in half or double precision
        # (a float32 constant written by masked assignment would raise a dtype
        # mismatch there).
        override = bert_logits.new_tensor([-10.0, 10.0])
        # Out-of-place select: gradients still reach BERT through the rows that
        # are not overridden, and an all-False mask needs no special case.
        final_logits = torch.where(high_mask.unsqueeze(-1), override, bert_logits)

        if labels is not None:
            # NOTE(review): overridden rows contribute a constant loss term with
            # zero gradient, which inflates the reported training loss when
            # their true label is 0 — confirm this is intended.
            loss = self.loss_fn(final_logits, labels.to(device))
            return {'loss': loss, 'logits': final_logits}
        return {'logits': final_logits}


In [None]:
def collate_fn(batch):
    """
    Merge a list of sample dicts into one batch dict for the hybrid classifier.

    Each item must provide the keys 'headlines', 'entailment_scores' and 'label'.

    Returns:
        dict with
          'headlines': list of the raw headline values,
          'entailment_scores': float tensor of shape (len(batch),),
          'labels': tensor built from the per-item 'label' values.
    """
    def column(key):
        # Gather one field across the whole batch, preserving item order.
        return [item[key] for item in batch]

    texts = column("headlines")
    scores = torch.tensor(column("entailment_scores"), dtype=torch.float)
    targets = torch.tensor(column("label"))
    return {"headlines": texts, "entailment_scores": scores, "labels": targets}

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

def compute_metrics(eval_pred):
    """Compute and print classification metrics from ``(logits, labels)``.

    Predictions are the argmax over the last axis of the logits. The textual
    classification report and the confusion matrix are printed as a side effect.

    Returns:
        dict with 'accuracy' (float), 'f1_per_class' (list, one F1 per class),
        'confusion_matrix' (nested list) and 'report' (the classification
        report as a dict).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {
        "accuracy": accuracy_score(labels, predicted),
        # average=None keeps one F1 value per class instead of a single mean.
        "f1_per_class": f1_score(labels, predicted, average=None).tolist(),
    }
    matrix = confusion_matrix(labels, predicted)
    metrics["confusion_matrix"] = matrix.tolist()
    metrics["report"] = classification_report(labels, predicted, output_dict=True)

    # Human-readable versions of the same report and matrix.
    print("Classification Report:")
    print(classification_report(labels, predicted))
    print("Confusion Matrix:")
    print(matrix)

    return metrics

In [None]:
k = 3  # number of similar headlines to retrieve

# Load the entailment (NLI) model and its tokenizer. k, entailment_model,
# entailment_tokenizer and device are notebook-level globals that train_model
# reads directly rather than receiving as arguments.
entailment_model_name = 'roberta-large-mnli'
entailment_tokenizer = AutoTokenizer.from_pretrained(entailment_model_name)
entailment_model = AutoModelForSequenceClassification.from_pretrained(entailment_model_name)

# The NLI model is used for inference only: switch to eval mode and freeze
# every parameter so it is never updated.
entailment_model.eval()
for param in entailment_model.parameters():
    param.requires_grad = False
# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
entailment_model.to(device)

def train_model(model, train_df, test_df, faiss_index, headlines, faiss_test, headlines_test):
    """Fine-tune ``model`` for one epoch on ``train_df`` and return it.

    Entailment scores are precomputed for both frames first, each against its
    own retrieval index, using the notebook-level ``k``, ``entailment_model``
    and ``entailment_tokenizer``.

    Args:
        model: the hybrid classifier to train. It is trained IN PLACE and the
            same object is returned, so every name bound to a previous return
            value of this function refers to the updated weights as well.
        train_df: DataFrame with 'title' and 'label' used for training.
        test_df: DataFrame with 'title' and 'label' handed to the Trainer as
            the evaluation set.
        faiss_index, headlines: retrieval index / headline list for train_df.
        faiss_test, headlines_test: retrieval index / headline list for test_df.

    Returns:
        The trained ``model`` (same instance that was passed in).
    """
    train_samples = preprocess_hybrid_data(train_df, k, entailment_model, entailment_tokenizer, faiss_index, headlines)
    train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)
    # NOTE(review): no evaluation strategy is configured below, so the Trainer
    # presumably never evaluates during train(); this (expensive) test-set
    # preprocessing then only feeds an unused eval_dataset — confirm and drop.
    test_samples = preprocess_hybrid_data(test_df, k, entailment_model, entailment_tokenizer, faiss_test, headlines_test)
    test_dataset = HybridFakeNewsPrecomputedDataset(test_samples)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        learning_rate=1e-5,
        per_device_train_batch_size=10,
        logging_steps=500,
        # Keep the custom 'headlines' / 'entailment_scores' fields for collate_fn.
        remove_unused_columns=False,
        # Disable external experiment trackers explicitly; this supersedes the
        # deprecated WANDB_DISABLED environment variable.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return model




Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
headlines = headlines_baseline
faiss_index = index_baseline

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:

# k = 3  # number of similar headlines to retrieve

#     # Load the entailment and encoder models and tokenizers.
# entailment_model_name = 'roberta-large-mnli'
# entailment_tokenizer = AutoTokenizer.from_pretrained(entailment_model_name)
# entailment_model = AutoModelForSequenceClassification.from_pretrained(entailment_model_name)

# entailment_model.eval()
# for param in entailment_model.parameters():
#     param.requires_grad = False

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# entailment_model.to(device)
# train_samples = preprocess_hybrid_data(baseline_df, k, entailment_model, entailment_tokenizer, faiss_index, headlines)
# train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)
# faiss_index = index_update4
# quick_test = baseline_df.iloc[:2]
model = HybridFakeNewsClassifierPrecomputed(bert_model_name='bert-base-uncased', entailment_threshold=0.8)
baseline_model = train_model(model, baseline_df, test_df, faiss_index, headlines, index_update4, headlines_update4)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,1.2023
1000,1.0896


In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import torch

def evaluate_model(model, test_df, collate_fn, faiss_test, headlines_test):
    """
    Evaluate the model on test data and print a classification report.

    Entailment scores for ``test_df`` are recomputed on every call using the
    notebook-level ``k``, ``entailment_model`` and ``entailment_tokenizer``;
    the model is moved to the notebook-level ``device`` and left in eval mode.

    Args:
        model: trained HybridFakeNewsClassifierPrecomputed.
        test_df: DataFrame with 'title' and 'label' columns.
        collate_fn: collate function used to assemble batches.
        faiss_test: FAISS index used to retrieve evidence for the test claims.
        headlines_test: headline list that parallels ``faiss_test``.

    Returns:
        List of predicted class ids (0 = Fake, 1 = Real), in ``test_df`` order.
    """
    test_samples = preprocess_hybrid_data(test_df, k, entailment_model, entailment_tokenizer, faiss_test, headlines_test)
    test_dataset = HybridFakeNewsPrecomputedDataset(test_samples)
    # shuffle=False keeps predictions aligned with the rows of test_df.
    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        collate_fn=collate_fn,
        shuffle=False
    )

    # Move model to device and set to eval mode
    model = model.to(device)
    model.eval()

    # Storage for predictions and labels
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            # Move tensors to the device; the list of raw headlines is left as is.
            batch = {name: value.to(device) if isinstance(value, torch.Tensor) else value
                     for name, value in batch.items()}

            outputs = model(
                headlines=batch["headlines"],
                entailment_scores=batch["entailment_scores"],
                labels=batch.get("labels")
            )

            preds = torch.argmax(outputs["logits"], dim=1)
            all_preds.extend(preds.cpu().numpy())

            # Store labels if available
            if "labels" in batch:
                all_labels.extend(batch["labels"].cpu().numpy())

    # Print classification report if we have labels
    if all_labels:
        print("\nClassification Report:")
        # labels=[0, 1] pins both classes so the report (and target_names) stay
        # valid even when only one class occurs in the labels and predictions.
        print(classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=["Fake", "Real"],
            digits=4
        ))
    else:
        print("Predictions complete (no labels available for evaluation)")

    return all_preds

In [None]:
predictions = evaluate_model(
    model=baseline_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update4,
    headlines_test=headlines_update4
)



Classification Report:
              precision    recall  f1-score   support

        Fake     0.9584    0.8496    0.9008       705
        Real     0.4176    0.7451    0.5352       102

    accuracy                         0.8364       807
   macro avg     0.6880    0.7974    0.7180       807
weighted avg     0.8900    0.8364    0.8545       807



In [None]:
update_1_model = train_model(model, update1_df, test_df, index_update1, headlines_update1, index_update4, headlines_update4)

1000
2000
3000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


In [None]:
predictions = evaluate_model(
    model=update_1_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update4,
    headlines_test=headlines_update4
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9566    0.8752    0.9141       705
        Real     0.4568    0.7255    0.5606       102

    accuracy                         0.8563       807
   macro avg     0.7067    0.8003    0.7373       807
weighted avg     0.8934    0.8563    0.8694       807



In [None]:
update_2_model = train_model(update_1_model, update2_df, test_df, index_update2, headlines_update2, index_update4, headlines_update4)

1000
2000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


In [None]:
predictions = evaluate_model(
    model=update_2_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update4,
    headlines_test=headlines_update4
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9394    0.9234    0.9313       705
        Real     0.5263    0.5882    0.5556       102

    accuracy                         0.8810       807
   macro avg     0.7329    0.7558    0.7434       807
weighted avg     0.8872    0.8810    0.8838       807



In [None]:
update_3_model = train_model(update_2_model, update3_df, test_df, index_update3, headlines_update3, index_update4, headlines_update4)

1000
2000
3000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


In [None]:
predictions = evaluate_model(
    model=update_3_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update4,
    headlines_test=headlines_update4
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9317    0.9475    0.9395       705
        Real     0.5889    0.5196    0.5521       102

    accuracy                         0.8934       807
   macro avg     0.7603    0.7336    0.7458       807
weighted avg     0.8883    0.8934    0.8906       807



In [None]:
update_4_model = train_model(update_3_model, update4_df, test_df, index_update4, headlines_update4, index_update4, headlines_update4)

1000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


In [None]:
predictions = evaluate_model(
    model=update_4_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update4,
    headlines_test=headlines_update4
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9212    0.9617    0.9410       705
        Real     0.6197    0.4314    0.5087       102

    accuracy                         0.8947       807
   macro avg     0.7705    0.6965    0.7248       807
weighted avg     0.8831    0.8947    0.8864       807



In [None]:
# Evaluate on the live dataset, retrieving evidence from the live index.
# NOTE(review): train_model trains its argument in place and returns the same
# object, so at this point baseline_model, update_1_model ... update_4_model
# all refer to one model carrying the weights after update 4. This run
# therefore does not measure the baseline-only model; snapshot the model
# (e.g. copy.deepcopy) after each stage if per-stage results are needed.
predictions = evaluate_model(
    model=baseline_model,
    test_df=df_live,
    collate_fn=collate_fn,
    faiss_test=index_live,
    headlines_test=headlines_live
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.5000    0.9444    0.6538        18
        Real     0.5000    0.0556    0.1000        18

    accuracy                         0.5000        36
   macro avg     0.5000    0.5000    0.3769        36
weighted avg     0.5000    0.5000    0.3769        36



In [None]:
model = HybridFakeNewsClassifierPrecomputed(bert_model_name='bert-base-uncased', entailment_threshold=0.8)
baseline_model = train_model(model, baseline_df, test_df, faiss_index, headlines, index_baseline, headlines_baseline)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,1.2111
1000,1.096


In [None]:
predictions = evaluate_model(
    model=baseline_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update1,
    headlines_test=headlines_update1
)



Classification Report:
              precision    recall  f1-score   support

        Fake     0.9578    0.7730    0.8556       705
        Real     0.3277    0.7647    0.4588       102

    accuracy                         0.7720       807
   macro avg     0.6428    0.7689    0.6572       807
weighted avg     0.8782    0.7720    0.8054       807



In [None]:
headlines_baseline

predictions = evaluate_model(
    model=baseline_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_baseline,
    headlines_test=headlines_baseline
)



Classification Report:
              precision    recall  f1-score   support

        Fake     0.9600    0.7489    0.8414       705
        Real     0.3113    0.7843    0.4457       102

    accuracy                         0.7534       807
   macro avg     0.6356    0.7666    0.6436       807
weighted avg     0.8780    0.7534    0.7914       807



In [None]:
# from transformers import (
#     BertForSequenceClassification,
#     BertTokenizer,
#     TrainingArguments,
#     Trainer
# )
# from torch.utils.data import Dataset
# import torch
# import numpy as np
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# class FakeNewsBaseBertDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=128):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = str(self.texts[idx])
#         label = int(self.labels[idx])

#         encoding = self.tokenizer(
#             text,
#             max_length=self.max_length,
#             padding='max_length',
#             truncation=True,
#             return_tensors='pt'
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': torch.tensor(label, dtype=torch.long)
#         }

# def compute_metrics_base(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)

#     accuracy = accuracy_score(labels, predictions)
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         labels, predictions, average='binary'
#     )

#     return {
#         'accuracy': accuracy,
#         'precision': precision,
#         'recall': recall,
#         'f1': f1
#     }

# def train_bert_fake_news(train_df, test_df, model_name='bert-base-uncased',model = None):
#     # Initialize tokenizer and model
#     tokenizer = BertTokenizer.from_pretrained(model_name)

#     # Create datasets
#     train_dataset = FakeNewsBaseBertDataset(
#         texts=train_df['title'].values,
#         labels=train_df['label'].values,
#         tokenizer=tokenizer
#     )

#     test_dataset = FakeNewsBaseBertDataset(
#         texts=test_df['title'].values,
#         labels=test_df['label'].values,
#         tokenizer=tokenizer
#     )

#     # Training arguments (consistent with previous code)
#     training_args = TrainingArguments(
#         output_dir='./bert_results',
#         num_train_epochs=1,
#         learning_rate=1e-5,
#         per_device_train_batch_size=10,
#         per_device_eval_batch_size=10,
#         evaluation_strategy='steps',
#         logging_steps=500,
#         save_steps=500,
#         remove_unused_columns=True,
#         load_best_model_at_end=True,
#         metric_for_best_model='f1',
#     )

#     # Initialize Trainer
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=test_dataset,
#         compute_metrics=compute_metrics_base,
#     )

#     # Train and evaluate
#     print("Starting training...")
#     trainer.train()

#     print("\nEvaluation results:")
#     eval_results = trainer.evaluate()
#     print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
#     print(f"Precision: {eval_results['eval_precision']:.4f}")
#     print(f"Recall: {eval_results['eval_recall']:.4f}")
#     print(f"F1 Score: {eval_results['eval_f1']:.4f}")

#     return model, tokenizer

# # Example usage:
# # model, tokenizer = train_bert_fake_news(train_df, test_df)