In [None]:
#imports
!pip install transformers datasets torch scikit-learn pandas matplotlib faiss-cpu sentence-transformers
import os
import json
import faiss
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer

np.random.seed(42)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_6

In [None]:
df = pd.read_csv("Liar2_combined.csv", header = 0)

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.dropna(subset=['date'])

print(df.head())


   label                                              title       date
0      1  90 percent of Americans "support universal bac... 2017-10-02
1      0  Last year was one of the deadliest years ever ... 2017-05-19
2      0  Bernie Sanders's plan is "to raise your taxes ... 2015-10-28
3      1  Voter ID is supported by an overwhelming major... 2021-12-08
4      0  Says Barack Obama "robbed Medicare (of) $716 b... 2012-08-12


In [None]:
#Defining our Date Ranges
baseline_start, baseline_end = '2007-01-01', '2015-12-31'
update1_start, update1_end   = '2016-01-01', '2017-12-31'
update2_start, update2_end   = '2018-01-01', '2019-12-31'
update3_start, update3_end   = '2020-01-01', '2021-12-31'
update4_start, update4_end   = '2022-01-01', '2022-12-31'
test_start, test_end         = '2023-01-01', '2023-12-31'

# Baseline training set: entries w/ date <= split_date
baseline_df = df[(df['date'] >= baseline_start) & (df['date'] <= baseline_end)].copy()
update1_df = df[(df['date'] >= update1_start) & (df['date'] <= update1_end)].copy()
update2_df = df[(df['date'] >= update2_start) & (df['date'] <= update2_end)].copy()
update3_df = df[(df['date'] >= update3_start) & (df['date'] <= update3_end)].copy()
update4_df = df[(df['date'] >= update4_start) & (df['date'] <= update4_end)].copy()
test_df = df[(df['date'] >= test_start) & (df['date'] <= test_end)].copy()

# Display sample sizes for each block
print("Baseline samples:", len(baseline_df))
print("Update 1 samples:", len(update1_df))
print("Update 2 samples:", len(update2_df))
print("Update 3 samples:", len(update3_df))
print("Update 4 samples:", len(update4_df))
print("Test samples:", len(test_df))

Baseline samples: 10932
Update 1 samples: 3031
Update 2 samples: 2730
Update 3 samples: 3772
Update 4 samples: 1688
Test samples: 807


In [None]:
print("Baseline distribution:")
print(baseline_df['label'].value_counts())

Baseline distribution:
label
1    6147
0    4785
Name: count, dtype: int64


#Faiss Index Creations


In [None]:
real_articles_file = "News_Category_Dataset_v3.json"

real_articles = []
with open(real_articles_file, 'r') as f:
    for line in f:
        try:
            art = json.loads(line)
            art_date = None
            if 'date' in art:
                try:
                    art_date = datetime.strptime(art['date'], '%Y-%m-%d')
                except Exception as e:
                    print(f"Error parsing date for article: {art.get('date')}, {e}")
            art['parsed_date'] = art_date
            real_articles.append(art)
        except Exception as e:
            print("Error parsing line:", e)

# Helper function to filter articles up to a given end_date
def filter_articles_by_date(articles, end_date_str):
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    return [art for art in articles if art['parsed_date'] is not None and art['parsed_date'] <= end_date]

Error parsing line: Unterminated string starting at: line 1 column 177 (char 176)


In [None]:
#Get Embeddings

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to build a FAISS index given a list of articles
def build_faiss_index(articles):
    headlines = [art['headline'].strip().lower() for art in articles]
    embeddings = embedding_model.encode(headlines, convert_to_numpy=True)
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    return index, headlines

# Build cumulative FAISS indexes for each time period:
# Baseline (up to 2015-12-31)
articles_baseline = filter_articles_by_date(real_articles, baseline_end)
index_baseline, headlines_baseline = build_faiss_index(articles_baseline)
print("Baseline FAISS index built with", len(articles_baseline), "articles.")

# Update 1
articles_update1 = filter_articles_by_date(real_articles, update1_end)
index_update1, headlines_update1 = build_faiss_index(articles_update1)
print("Update 1 FAISS index built with", len(articles_update1), "articles.")

# Update 2
articles_update2 = filter_articles_by_date(real_articles, update2_end)
index_update2, headlines_update2 = build_faiss_index(articles_update2)
print("Update 2 FAISS index built with", len(articles_update2), "articles.")

# Update 3
articles_update3 = filter_articles_by_date(real_articles, update3_end)
index_update3, headlines_update3 = build_faiss_index(articles_update3)
print("Update 3 FAISS index built with", len(articles_update3), "articles.")

# Update 4
articles_update4 = filter_articles_by_date(real_articles, update4_end)
index_update4, headlines_update4 = build_faiss_index(articles_update4)
print("Update 4 FAISS index built with", len(articles_update4), "articles.")


Baseline FAISS index built with 130283 articles.
Update 1 FAISS index built with 192270 articles.
Update 2 FAISS index built with 204009 articles.
Update 3 FAISS index built with 208129 articles.
Update 4 FAISS index built with 209527 articles.


In [None]:
test_query = "Over 4 million Americans get Omicron boosters"
# results = get_top_k_similar_headlines(test_query, embedding_model, index_update4, headlines_update4, k=3)

# print(results)


In [None]:
# Save each FAISS index to disk
faiss.write_index(index_baseline, "faiss_index_baseline.index")
faiss.write_index(index_update1, "faiss_index_update1.index")
faiss.write_index(index_update2, "faiss_index_update2.index")
faiss.write_index(index_update3, "faiss_index_update3.index")
faiss.write_index(index_update4, "faiss_index_update4.index")

# Compress the index files into a single zip archive
!zip faiss_indexes.zip faiss_index_baseline.index faiss_index_update1.index faiss_index_update2.index faiss_index_update3.index faiss_index_update4.index

  adding: faiss_index_baseline.index (deflated 8%)
  adding: faiss_index_update1.index (deflated 7%)
  adding: faiss_index_update2.index (deflated 7%)
  adding: faiss_index_update3.index (deflated 7%)
  adding: faiss_index_update4.index (deflated 7%)


In [None]:
import pickle

# To save the headlines:
headlines_data = {
    "baseline": headlines_baseline,
    "update1": headlines_update1,
    "update2": headlines_update2,
    "update3": headlines_update3,
    "update4": headlines_update4,
}

with open("faiss_headlines.pkl", "wb") as f:
    pickle.dump(headlines_data, f)

print("Headlines saved to faiss_headlines.pkl")


Headlines saved to faiss_headlines.pkl


#Load Faiss Indexes


In [None]:
import zipfile
import os
from google.colab import files
import faiss

zip_filename = "faiss_indexes_new.zip"
extract_dir = "faiss_indexes"
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Extracted {zip_filename} to {extract_dir}")

# Load the FAISS indexes from the extracted folder.
index_baseline = faiss.read_index(os.path.join(extract_dir, "faiss_index_baseline.index"))
index_update1 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update1.index"))
index_update2 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update2.index"))
index_update3 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update3.index"))
index_update4 = faiss.read_index(os.path.join(extract_dir, "faiss_index_update4.index"))

print("FAISS indexes loaded successfully!")


Extracted faiss_indexes_new.zip to faiss_indexes
FAISS indexes loaded successfully!


In [None]:
import pickle

with open("faiss_headlines_new.pkl", "rb") as f:
    headlines_data = pickle.load(f)

headlines_baseline = headlines_data["baseline"]
headlines_update1  = headlines_data["update1"]
headlines_update2  = headlines_data["update2"]
headlines_update3  = headlines_data["update3"]
headlines_update4  = headlines_data["update4"]

print("Headlines loaded successfully!")


Headlines loaded successfully!


#RAG Models


In [None]:
sim_model = SentenceTransformer("all-MiniLM-L6-v2")
def get_top_k_similar_headlines(query_headline, k=3, faiss_index = None, headlines = None):
    query = query_headline.strip().lower()
    query_embedding = sim_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    distances, indices = faiss_index.search(query_embedding, k)
    results = []
    for rank, idx in enumerate(indices[0]):
        if idx == -1:
            continue
        # Combine the retrieved headline with its distance value
        hybrid_fact = f"{headlines[idx]}"
        results.append(hybrid_fact)
    return results


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def compute_entailment_score_for_sample(headline, similar_headlines, entailment_model, entailment_tokenizer):
    """
    Given a headline and its list of similar headlines, compute the aggregated entailment score.
    Uses the maximum probability for the ENTAILMENT class from a model like roberta-large-mnli.
    """
    pairs = [(headline, sim) for sim in similar_headlines]
    device = next(entailment_model.parameters()).device
    encoded = entailment_tokenizer.batch_encode_plus(
        pairs,
        return_tensors='pt',
        truncation=True,
        padding=True
    )
    encoded = {key: value.to(device) for key, value in encoded.items()}
    with torch.no_grad():
        outputs = entailment_model(**encoded)
    logits = outputs.logits  # shape: (k, 3)
    probs = F.softmax(logits, dim=-1)
    # For roberta-large-mnli, index 2 corresponds to ENTAILMENT.
    entailment_probs = probs[:, 2]
    aggregated_score = torch.max(entailment_probs).item()  # Using max, try averaging as well.
    return aggregated_score

In [None]:
def preprocess_hybrid_data(df, k, entailment_model, entailment_tokenizer, faiss_index, headlines):
    """
    Precompute the entailment scores for each sample.
    Expects df to have columns 'title' and 'label'.
    Returns a list of dictionaries with keys: 'headline', 'entailment_score', 'label'
    """
    precomputed_samples = []
    i = 0
    for idx in range(len(df)):
        i += 1
        if i % 1000 == 0:
            print(i)
        sample = df.iloc[idx]
        headline = sample['title']
        label = sample['label']
        similar_headlines = get_top_k_similar_headlines(headline, k, faiss_index, headlines)
        score = compute_entailment_score_for_sample(headline, similar_headlines, entailment_model, entailment_tokenizer)
        precomputed_samples.append({
            "headlines": headline,
            "entailment_scores": score,
            "label": label
        })
    return precomputed_samples

In [None]:
class HybridFakeNewsPrecomputedDataset(Dataset):
    def __init__(self, precomputed_samples):
        self._data = precomputed_samples

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        # If idx is a list, return a dictionary of lists.
        if isinstance(idx, list):
            batch = [self._data[i] for i in idx]
            # Convert list of dicts to dict of lists.
            aggregated = {key: [d[key] for d in batch] for key in batch[0].keys()}
            return aggregated
        else:
            return self._data[idx]

    @property
    def column_names(self):
        return list(self._data[0].keys()) if len(self._data) > 0 else []







In [None]:
import torch.nn as nn
class HybridFakeNewsClassifierPrecomputed(nn.Module):
    def __init__(self,
                 bert_model_name='bert-base-uncased',
                 entailment_threshold=0.8):
        """
        If the precomputed entailment score is above entailment_threshold,
        the sample is overridden to be classified as real news (label 1).
        """
        super(HybridFakeNewsClassifierPrecomputed, self).__init__()
        # BERT classifier for raw input.
        self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)
        self.entailment_threshold = entailment_threshold
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, headlines, entailment_scores, labels=None):
        """
        headlines: list of strings (batch of raw headlines)
        entailment_scores: tensor of shape (batch_size,) containing the precomputed scores.
        labels: tensor of true labels.
        """
        device = next(self.bert_model.parameters()).device
        # Tokenize and classify headlines with BERT.
        encoded = self.bert_tokenizer(headlines, return_tensors="pt", truncation=True, padding=True)
        encoded = {key: value.to(device) for key, value in encoded.items()}
        bert_outputs = self.bert_model(**encoded)
        bert_logits = bert_outputs.logits  # shape: (batch_size, 2)

        # Clone logits for override.
        final_logits = bert_logits.clone()
        # Create mask for samples with high entailment scores.
        entailment_scores = entailment_scores.to(device)
        high_mask = entailment_scores > self.entailment_threshold  # shape: (batch_size,)

        if high_mask.sum() > 0:
            # For these samples, override logits to strongly favor "real news" (label 1).
            override = torch.tensor([-10.0, 10.0], device=device)
            final_logits[high_mask] = override.unsqueeze(0).expand(high_mask.sum(), -1)

        if labels is not None:
            loss = self.loss_fn(final_logits, labels.to(device))
            return {'loss': loss, 'logits': final_logits}
        return {'logits': final_logits}


In [None]:
def collate_fn(batch):
    """
    Expects each item in batch to be a dict with keys: 'headline', 'entailment_score', 'label'
    """
    headlines = [item["headlines"] for item in batch]
    # Convert entailment scores to a tensor.
    entailment_scores = torch.tensor([item["entailment_scores"] for item in batch], dtype=torch.float)
    labels = torch.tensor([item["label"] for item in batch])
    return {"headlines": headlines, "entailment_scores": entailment_scores, "labels": labels}

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted labels.
    preds = np.argmax(logits, axis=-1)

    # Overall accuracy.
    accuracy = accuracy_score(labels, preds)

    # F1 scores per class.
    f1_per_class = f1_score(labels, preds, average=None)

    # Confusion matrix.
    conf_matrix = confusion_matrix(labels, preds)

    # Full classification report.
    report = classification_report(labels, preds, output_dict=True)

    # For example, you can print the detailed report.
    print("Classification Report:")
    print(classification_report(labels, preds))
    print("Confusion Matrix:")
    print(conf_matrix)

    return {
        "accuracy": accuracy,
        "f1_per_class": f1_per_class.tolist(),
        "confusion_matrix": conf_matrix.tolist(),
        "report": report
    }

In [None]:
k = 3  # number of similar headlines to retrieve

    # Load the entailment and encoder models and tokenizers.
entailment_model_name = 'roberta-large-mnli'
entailment_tokenizer = AutoTokenizer.from_pretrained(entailment_model_name)
entailment_model = AutoModelForSequenceClassification.from_pretrained(entailment_model_name)

entailment_model.eval()
for param in entailment_model.parameters():
    param.requires_grad = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
entailment_model.to(device)

def train_model(model, train_df, test_df, faiss_index, headlines, faiss_test, headlines_test):
  train_samples = preprocess_hybrid_data(train_df, k, entailment_model, entailment_tokenizer, faiss_index, headlines)
  train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)
  test_samples = preprocess_hybrid_data(test_df, k, entailment_model, entailment_tokenizer, faiss_test, headlines_test)
  test_dataset = HybridFakeNewsPrecomputedDataset(test_samples)
  training_args = TrainingArguments(
          output_dir="./results",
          num_train_epochs=1,
          learning_rate=1e-5,
          per_device_train_batch_size=10,
          evaluation_strategy="steps",
          logging_steps=500,
          save_steps=500,
          remove_unused_columns=False,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=train_dataset,
          eval_dataset=test_dataset,
          data_collator=collate_fn,
          compute_metrics=compute_metrics,
      )

  trainer.train()
  return model




Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
headlines = headlines_baseline
faiss_index = index_baseline

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:

# k = 3  # number of similar headlines to retrieve

#     # Load the entailment and encoder models and tokenizers.
# entailment_model_name = 'roberta-large-mnli'
# entailment_tokenizer = AutoTokenizer.from_pretrained(entailment_model_name)
# entailment_model = AutoModelForSequenceClassification.from_pretrained(entailment_model_name)

# entailment_model.eval()
# for param in entailment_model.parameters():
#     param.requires_grad = False

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# entailment_model.to(device)
# train_samples = preprocess_hybrid_data(baseline_df, k, entailment_model, entailment_tokenizer, faiss_index, headlines)
# train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)
model = HybridFakeNewsClassifierPrecomputed(bert_model_name='bert-base-uncased', entailment_threshold=0.8)
baseline_model = train_model(model, baseline_df, test_df, faiss_index, headlines, faiss_index, headlines)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
500,1.1964,1.837644,0.790582,"[0.8688906128782001, 0.48]","[[560, 145], [24, 78]]","{'0': {'precision': 0.958904109589041, 'recall': 0.7943262411347518, 'f1-score': 0.8688906128782001, 'support': 705.0}, '1': {'precision': 0.34977578475336324, 'recall': 0.7647058823529411, 'f1-score': 0.48, 'support': 102.0}, 'accuracy': 0.7905824039653035, 'macro avg': {'precision': 0.6543399471712021, 'recall': 0.7795160617438465, 'f1-score': 0.6744453064391001, 'support': 807.0}, 'weighted avg': {'precision': 0.8819139123979144, 'recall': 0.7905824039653035, 'f1-score': 0.8197371525144128, 'support': 807.0}}"
1000,1.095,1.81801,0.783147,"[0.8627450980392157, 0.4837758112094395]","[[550, 155], [20, 82]]","{'0': {'precision': 0.9649122807017544, 'recall': 0.7801418439716312, 'f1-score': 0.8627450980392157, 'support': 705.0}, '1': {'precision': 0.3459915611814346, 'recall': 0.803921568627451, 'f1-score': 0.4837758112094395, 'support': 102.0}, 'accuracy': 0.7831474597273854, 'macro avg': {'precision': 0.6554519209415945, 'recall': 0.7920317062995411, 'f1-score': 0.6732604546243276, 'support': 807.0}, 'weighted avg': {'precision': 0.8866843830672159, 'recall': 0.7831474597273854, 'f1-score': 0.8148456342763444, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.8688906128782001, 0.48]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[560, 145], [24, 78]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.958904109589041, 'recall': 0.7943262411347518, 'f1-score': 0.8688906128782001, 'support': 705.0}, '1': {'precision': 0.34977578475336324, 'recall': 0.7647058823529411, 'f1-score': 0.48, 'support': 102.0}, 'accuracy': 0.7905824039653035, 'macro avg': {'precision': 0.6543399471712021, 'recall': 0.7795160617438465, 'f1-score': 0.6744453064391001, 'support': 807.0}, 'weighted avg': {'precision': 0.8819139123979144, 'recall': 0.7905824039653035, 'f1-score': 0.8197371525144128, 's

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.79      0.87       705
           1       0.35      0.76      0.48       102

    accuracy                           0.79       807
   macro avg       0.65      0.78      0.67       807
weighted avg       0.88      0.79      0.82       807

Confusion Matrix:
[[560 145]
 [ 24  78]]


Trainer is attempting to log a value of "[0.8627450980392157, 0.4837758112094395]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[550, 155], [20, 82]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9649122807017544, 'recall': 0.7801418439716312, 'f1-score': 0.8627450980392157, 'support': 705.0}, '1': {'precision': 0.3459915611814346, 'recall': 0.803921568627451, 'f1-score': 0.4837758112094395, 'support': 102.0}, 'accuracy': 0.7831474597273854, 'macro avg': {'precision': 0.6554519209415945, 'recall': 0.7920317062995411, 'f1-score': 0.6732604546243276, 'support': 807.0}, 'weighted avg': {'precision': 0.8866843830672159, 'recall': 0.7831474597273854, 'f1-sco

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.78      0.86       705
           1       0.35      0.80      0.48       102

    accuracy                           0.78       807
   macro avg       0.66      0.79      0.67       807
weighted avg       0.89      0.78      0.81       807

Confusion Matrix:
[[550 155]
 [ 20  82]]


In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import torch

def evaluate_model(model, test_df, collate_fn, faiss_test, headlines_test):
    """
    Evaluate the model on test data and print classification report

    Args:
        model: Trained HybridFakeNewsClassifierPrecomputed
        test_dataset: Preprocessed test dataset
        collate_fn: Collate function used during training
        device: Device to run evaluation on ('cuda' or 'cpu')
    """
    test_samples = preprocess_hybrid_data(test_df, k, entailment_model, entailment_tokenizer, faiss_test, headlines_test)
    test_dataset = HybridFakeNewsPrecomputedDataset(test_samples)
    # Set up DataLoader
    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        collate_fn=collate_fn,
        shuffle=False
    )

    # Move model to device and set to eval mode
    model = model.to(device)
    model.eval()

    # Storage for predictions and labels
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            # Move batch to device
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()}

            # Forward pass
            outputs = model(
                headlines=batch["headlines"],
                entailment_scores=batch["entailment_scores"],
                labels=batch["labels"] if "labels" in batch else None
            )

            # Get predictions
            preds = torch.argmax(outputs["logits"], dim=1)
            all_preds.extend(preds.cpu().numpy())

            # Store labels if available
            if "labels" in batch:
                all_labels.extend(batch["labels"].cpu().numpy())

    # Print classification report if we have labels
    if all_labels:
        print("\nClassification Report:")
        print(classification_report(
            all_labels,
            all_preds,
            target_names=["Fake", "Real"],
            digits=4
        ))
    else:
        print("Predictions complete (no labels available for evaluation)")

    return all_preds

In [None]:
predictions = evaluate_model(
    model=baseline_model,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=faiss_index,
    headlines_test=headlines
)



Classification Report:
              precision    recall  f1-score   support

        Fake     0.9608    0.7986    0.8722       705
        Real     0.3575    0.7745    0.4892       102

    accuracy                         0.7955       807
   macro avg     0.6591    0.7865    0.6807       807
weighted avg     0.8845    0.7955    0.8238       807



In [None]:
print(predictions)

[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0)

In [None]:
# # Initialize a new model instance with the same parameters
# model = HybridFakeNewsClassifierPrecomputed(
#     bert_model_name='bert-base-uncased',
#     entailment_threshold=0.8
# )

# # Load the saved state dictionary into the model
# model.load_state_dict(torch.load(os.path.join(checkpoint_dir, "custom_model.bin")))
# print("Model loaded successfully.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


In [None]:
model = HybridFakeNewsClassifierPrecomputed(
    bert_model_name='bert-base-uncased',
    entailment_threshold=0.8
)

update12_df = pd.concat([update1_df, update2_df], ignore_index=True)



update12_scratch = train_model(model, update12_df, test_df, index_update2, headlines_update2, index_update2, headlines_update2)


# headlines = headlines_update2
# faiss_index = index_update2

# test_samples = preprocess_hybrid_data(test_df, k, entailment_model, entailment_tokenizer)
# test_dataset = HybridFakeNewsPrecomputedDataset(test_samples)


# train_samples = preprocess_hybrid_data(update1_df, k, entailment_model, entailment_tokenizer)
# train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1000
2000
3000
4000
5000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
500,1.2471,1.743537,0.807931,"[0.8824867323730099, 0.4745762711864407]","[[582, 123], [32, 70]]","{'0': {'precision': 0.9478827361563518, 'recall': 0.825531914893617, 'f1-score': 0.8824867323730099, 'support': 705.0}, '1': {'precision': 0.3626943005181347, 'recall': 0.6862745098039216, 'f1-score': 0.4745762711864407, 'support': 102.0}, 'accuracy': 0.8079306071871127, 'macro avg': {'precision': 0.6552885183372432, 'recall': 0.7559032123487692, 'f1-score': 0.6785315017797253, 'support': 807.0}, 'weighted avg': {'precision': 0.8739183985663912, 'recall': 0.8079306071871127, 'f1-score': 0.830929276312254, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.8824867323730099, 0.4745762711864407]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[582, 123], [32, 70]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9478827361563518, 'recall': 0.825531914893617, 'f1-score': 0.8824867323730099, 'support': 705.0}, '1': {'precision': 0.3626943005181347, 'recall': 0.6862745098039216, 'f1-score': 0.4745762711864407, 'support': 102.0}, 'accuracy': 0.8079306071871127, 'macro avg': {'precision': 0.6552885183372432, 'recall': 0.7559032123487692, 'f1-score': 0.6785315017797253, 'support': 807.0}, 'weighted avg': {'precision': 0.8739183985663912, 'recall': 0.8079306071871127, 'f1-sco

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.83      0.88       705
           1       0.36      0.69      0.47       102

    accuracy                           0.81       807
   macro avg       0.66      0.76      0.68       807
weighted avg       0.87      0.81      0.83       807

Confusion Matrix:
[[582 123]
 [ 32  70]]


In [None]:
update12_contiual = train_model(baseline_model, update12_df, test_df, index_update2, headlines_update2, index_update2, headlines_update2)

1000
2000
3000
4000
5000


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
500,1.2379,1.718499,0.821561,"[0.8912386706948641, 0.503448275862069]","[[590, 115], [29, 73]]","{'0': {'precision': 0.9531502423263328, 'recall': 0.8368794326241135, 'f1-score': 0.8912386706948641, 'support': 705.0}, '1': {'precision': 0.3882978723404255, 'recall': 0.7156862745098039, 'f1-score': 0.503448275862069, 'support': 102.0}, 'accuracy': 0.8215613382899628, 'macro avg': {'precision': 0.6707240573333791, 'recall': 0.7762828535669587, 'f1-score': 0.6973434732784665, 'support': 807.0}, 'weighted avg': {'precision': 0.881756262476813, 'recall': 0.8215613382899628, 'f1-score': 0.842224271347968, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.8912386706948641, 0.503448275862069]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[590, 115], [29, 73]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9531502423263328, 'recall': 0.8368794326241135, 'f1-score': 0.8912386706948641, 'support': 705.0}, '1': {'precision': 0.3882978723404255, 'recall': 0.7156862745098039, 'f1-score': 0.503448275862069, 'support': 102.0}, 'accuracy': 0.8215613382899628, 'macro avg': {'precision': 0.6707240573333791, 'recall': 0.7762828535669587, 'f1-score': 0.6973434732784665, 'support': 807.0}, 'weighted avg': {'precision': 0.881756262476813, 'recall': 0.8215613382899628, 'f1-score

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       705
           1       0.39      0.72      0.50       102

    accuracy                           0.82       807
   macro avg       0.67      0.78      0.70       807
weighted avg       0.88      0.82      0.84       807

Confusion Matrix:
[[590 115]
 [ 29  73]]


In [None]:
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class FakeNewsBaseBertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def compute_metrics_base(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def train_bert_fake_news(train_df, test_df, model_name='bert-base-uncased',model = None):
    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Create datasets
    train_dataset = FakeNewsBaseBertDataset(
        texts=train_df['title'].values,
        labels=train_df['label'].values,
        tokenizer=tokenizer
    )

    test_dataset = FakeNewsBaseBertDataset(
        texts=test_df['title'].values,
        labels=test_df['label'].values,
        tokenizer=tokenizer
    )

    # Training arguments (consistent with previous code)
    training_args = TrainingArguments(
        output_dir='./bert_results',
        num_train_epochs=1,
        learning_rate=1e-5,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        evaluation_strategy='steps',
        logging_steps=500,
        save_steps=500,
        remove_unused_columns=True,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_base,
    )

    # Train and evaluate
    print("Starting training...")
    trainer.train()

    print("\nEvaluation results:")
    eval_results = trainer.evaluate()
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1 Score: {eval_results['eval_f1']:.4f}")

    return model, tokenizer

# Example usage:
# model, tokenizer = train_bert_fake_news(train_df, test_df)

In [None]:
untrained_base_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
base_bert_model, base_bert_tokenizer = train_bert_fake_news(baseline_df, test_df, model = untrained_base_bert)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.6695,0.612389,0.778191,0.342857,0.823529,0.48415
1000,0.6481,0.583498,0.784387,0.352459,0.843137,0.49711



Evaluation results:


Accuracy: 0.7844
Precision: 0.3525
Recall: 0.8431
F1 Score: 0.4971


In [None]:
base_bert_model_update1, base_bert_tokenizer = train_bert_fake_news(update12_df, test_df, model = base_bert_model)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting training...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.6014,0.326637,0.881041,0.521739,0.705882,0.6



Evaluation results:


Accuracy: 0.8810
Precision: 0.5217
Recall: 0.7059
F1 Score: 0.6000


In [None]:
def get_base_bert_model_predictions(model, tokenizer):
    model.eval()
    model.to(device)
    predictions = []

    test_dataset = FakeNewsBaseBertDataset(
        texts=test_df['title'].values,
        labels=test_df['label'].values,
        tokenizer=tokenizer
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        shuffle=False
    )

    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()}
            outputs = model(**batch)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)

    return np.array(predictions)


In [None]:
pred_base = get_base_bert_model_predictions(base_bert_model_update1, base_bert_tokenizer)

predictions_continual = evaluate_model(
    model=update12_contiual,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update2,
    headlines_test=headlines_update2
)


predictions_scratch = evaluate_model(
    model=update12_scratch,
    test_df=test_df,
    collate_fn=collate_fn,
    faiss_test=index_update2,
    headlines_test=headlines_update2
)


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9516    0.8369    0.8906       705
        Real     0.3850    0.7059    0.4983       102

    accuracy                         0.8203       807
   macro avg     0.6683    0.7714    0.6944       807
weighted avg     0.8800    0.8203    0.8410       807


Classification Report:
              precision    recall  f1-score   support

        Fake     0.9467    0.8312    0.8852       705
        Real     0.3670    0.6765    0.4759       102

    accuracy                         0.8116       807
   macro avg     0.6569    0.7538    0.6805       807
weighted avg     0.8734    0.8116    0.8335       807



In [None]:
print(pred_base)
print(predictions_continual)
print(predictions_scratch)

[0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 1 0 0 0 0 0 

In [None]:
from sklearn.metrics import classification_report
import numpy as np

def ensemble_predictions(model_preds, labels):
    """
    Perform majority voting ensemble on predictions from 3 models and print classification report

    Args:
        model_preds: List of 3 prediction arrays (each shape [n_samples,])
        test_dataset: Dataset containing true labels
    """
    # Extract true labels from test dataset
    true_labels = labels

    # Convert predictions to numpy arrays if they're not already
    preds = [np.array(p) for p in model_preds]

    # Perform majority voting
    ensemble_preds = []
    for i in range(len(true_labels)):
        votes = [preds[0][i], preds[1][i], preds[2][i]]
        # Count votes for each class (0 or 1)
        vote_counts = np.bincount(votes)
        # Get class with most votes (tie goes to higher class)
        majority_vote = np.argmax(vote_counts)
        ensemble_preds.append(majority_vote)

    # Convert to numpy array
    ensemble_preds = np.array(ensemble_preds)

    # Print classification report
    print("\nEnsemble Classification Report (Majority Voting):")
    print(classification_report(
        true_labels,
        ensemble_preds,
        target_names=["Fake (0)", "Real (1)"],
        digits=4
    ))

    return ensemble_preds

# Example usage:
# Assuming you have:
# - preds_model1, preds_model2, preds_model3 (prediction arrays from 3 models)
# - test_dataset (contains true labels)

# ensemble_preds = ensemble_predictions(
#     [preds_model1, preds_model2, preds_model3],
#     test_dataset
# )

In [None]:
print(test_df['label'].to_numpy())

[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 

In [None]:
ensemble_predictions([pred_base, predictions_continual, predictions_scratch], test_df['label'].to_numpy())


Ensemble Classification Report (Majority Voting):
              precision    recall  f1-score   support

    Fake (0)     0.9515    0.8355    0.8897       705
    Real (1)     0.3830    0.7059    0.4966       102

    accuracy                         0.8191       807
   macro avg     0.6673    0.7707    0.6931       807
weighted avg     0.8797    0.8191    0.8400       807



array([0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [None]:
ensemble_preds = ensemble_evaluate(
    hybrid_model1=update12_scratch,
    hybrid_model2=update12_contiual,
    bert_model=base_bert_model_update1,
    hybrid_test_dataset1=hybrid_test_dataset1,
    hybrid_test_dataset2=hybrid_test_dataset2,
    bert_test_dataset=bert_test_dataset,
    hybrid_collate_fn=hybrid_collate_fn,
    bert_collate_fn=bert_collate_fn
)


In [None]:
training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        learning_rate=5e-6,
        per_device_train_batch_size=10,
        evaluation_strategy="steps",
        logging_steps=100,
        save_steps=100,
        remove_unused_columns=False,
    )

trainer = Trainer(
    model=model,  # model loaded from checkpoint
    args=training_args,
    train_dataset=train_dataset,  # new or combined dataset
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
100,1.5415,1.826881,0.832714,"[0.9033643521832498, 0.3778801843317972]","[[631, 74], [61, 41]]","{'0': {'precision': 0.911849710982659, 'recall': 0.8950354609929078, 'f1-score': 0.9033643521832498, 'support': 705.0}, '1': {'precision': 0.3565217391304348, 'recall': 0.4019607843137255, 'f1-score': 0.3778801843317972, 'support': 102.0}, 'accuracy': 0.8327137546468402, 'macro avg': {'precision': 0.6341857250565469, 'recall': 0.6484981226533166, 'f1-score': 0.6406222682575236, 'support': 807.0}, 'weighted avg': {'precision': 0.8416595584065415, 'recall': 0.8327137546468402, 'f1-score': 0.836946278923215, 'support': 807.0}}"
200,1.2218,1.874081,0.755886,"[0.845247446975648, 0.4222873900293255]","[[538, 167], [30, 72]]","{'0': {'precision': 0.9471830985915493, 'recall': 0.7631205673758865, 'f1-score': 0.845247446975648, 'support': 705.0}, '1': {'precision': 0.301255230125523, 'recall': 0.7058823529411765, 'f1-score': 0.4222873900293255, 'support': 102.0}, 'accuracy': 0.7558859975216853, 'macro avg': {'precision': 0.6242191643585362, 'recall': 0.7345014601585316, 'f1-score': 0.6337674185024867, 'support': 807.0}, 'weighted avg': {'precision': 0.8655416579675904, 'recall': 0.7558859975216853, 'f1-score': 0.7917878115251835, 'support': 807.0}}"
300,1.2553,1.774644,0.801735,"[0.8789712556732224, 0.4520547945205479]","[[581, 124], [36, 66]]","{'0': {'precision': 0.9416531604538088, 'recall': 0.8241134751773049, 'f1-score': 0.8789712556732224, 'support': 705.0}, '1': {'precision': 0.3473684210526316, 'recall': 0.6470588235294118, 'f1-score': 0.4520547945205479, 'support': 102.0}, 'accuracy': 0.8017348203221809, 'macro avg': {'precision': 0.6445107907532202, 'recall': 0.7355861493533584, 'f1-score': 0.6655130250968851, 'support': 807.0}, 'weighted avg': {'precision': 0.8665391041726189, 'recall': 0.8017348203221809, 'f1-score': 0.8250115542635906, 'support': 807.0}}"
400,1.0856,1.825145,0.755886,"[0.8440221694378464, 0.43874643874643876]","[[533, 172], [25, 77]]","{'0': {'precision': 0.9551971326164874, 'recall': 0.7560283687943262, 'f1-score': 0.8440221694378464, 'support': 705.0}, '1': {'precision': 0.3092369477911647, 'recall': 0.7549019607843137, 'f1-score': 0.43874643874643876, 'support': 102.0}, 'accuracy': 0.7558859975216853, 'macro avg': {'precision': 0.6322170402038261, 'recall': 0.75546516478932, 'f1-score': 0.6413843040921425, 'support': 807.0}, 'weighted avg': {'precision': 0.8735516073969299, 'recall': 0.7558859975216853, 'f1-score': 0.7927977276404194, 'support': 807.0}}"
500,1.2041,1.754822,0.801735,"[0.879154078549849, 0.4482758620689655]","[[582, 123], [37, 65]]","{'0': {'precision': 0.9402261712439418, 'recall': 0.825531914893617, 'f1-score': 0.879154078549849, 'support': 705.0}, '1': {'precision': 0.34574468085106386, 'recall': 0.6372549019607843, 'f1-score': 0.4482758620689655, 'support': 102.0}, 'accuracy': 0.8017348203221809, 'macro avg': {'precision': 0.6429854260475029, 'recall': 0.7313934084272007, 'f1-score': 0.6637149703094072, 'support': 807.0}, 'weighted avg': {'precision': 0.8650872468076672, 'recall': 0.8017348203221809, 'f1-score': 0.8246936348310756, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.9033643521832498, 0.3778801843317972]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[631, 74], [61, 41]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.911849710982659, 'recall': 0.8950354609929078, 'f1-score': 0.9033643521832498, 'support': 705.0}, '1': {'precision': 0.3565217391304348, 'recall': 0.4019607843137255, 'f1-score': 0.3778801843317972, 'support': 102.0}, 'accuracy': 0.8327137546468402, 'macro avg': {'precision': 0.6341857250565469, 'recall': 0.6484981226533166, 'f1-score': 0.6406222682575236, 'support': 807.0}, 'weighted avg': {'precision': 0.8416595584065415, 'recall': 0.8327137546468402, 'f1-scor

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       705
           1       0.36      0.40      0.38       102

    accuracy                           0.83       807
   macro avg       0.63      0.65      0.64       807
weighted avg       0.84      0.83      0.84       807

Confusion Matrix:
[[631  74]
 [ 61  41]]


Trainer is attempting to log a value of "[0.845247446975648, 0.4222873900293255]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[538, 167], [30, 72]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9471830985915493, 'recall': 0.7631205673758865, 'f1-score': 0.845247446975648, 'support': 705.0}, '1': {'precision': 0.301255230125523, 'recall': 0.7058823529411765, 'f1-score': 0.4222873900293255, 'support': 102.0}, 'accuracy': 0.7558859975216853, 'macro avg': {'precision': 0.6242191643585362, 'recall': 0.7345014601585316, 'f1-score': 0.6337674185024867, 'support': 807.0}, 'weighted avg': {'precision': 0.8655416579675904, 'recall': 0.7558859975216853, 'f1-score

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.85       705
           1       0.30      0.71      0.42       102

    accuracy                           0.76       807
   macro avg       0.62      0.73      0.63       807
weighted avg       0.87      0.76      0.79       807

Confusion Matrix:
[[538 167]
 [ 30  72]]


Trainer is attempting to log a value of "[0.8789712556732224, 0.4520547945205479]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[581, 124], [36, 66]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9416531604538088, 'recall': 0.8241134751773049, 'f1-score': 0.8789712556732224, 'support': 705.0}, '1': {'precision': 0.3473684210526316, 'recall': 0.6470588235294118, 'f1-score': 0.4520547945205479, 'support': 102.0}, 'accuracy': 0.8017348203221809, 'macro avg': {'precision': 0.6445107907532202, 'recall': 0.7355861493533584, 'f1-score': 0.6655130250968851, 'support': 807.0}, 'weighted avg': {'precision': 0.8665391041726189, 'recall': 0.8017348203221809, 'f1-sc

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.82      0.88       705
           1       0.35      0.65      0.45       102

    accuracy                           0.80       807
   macro avg       0.64      0.74      0.67       807
weighted avg       0.87      0.80      0.83       807

Confusion Matrix:
[[581 124]
 [ 36  66]]


Trainer is attempting to log a value of "[0.8440221694378464, 0.43874643874643876]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[533, 172], [25, 77]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9551971326164874, 'recall': 0.7560283687943262, 'f1-score': 0.8440221694378464, 'support': 705.0}, '1': {'precision': 0.3092369477911647, 'recall': 0.7549019607843137, 'f1-score': 0.43874643874643876, 'support': 102.0}, 'accuracy': 0.7558859975216853, 'macro avg': {'precision': 0.6322170402038261, 'recall': 0.75546516478932, 'f1-score': 0.6413843040921425, 'support': 807.0}, 'weighted avg': {'precision': 0.8735516073969299, 'recall': 0.7558859975216853, 'f1-sc

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.76      0.84       705
           1       0.31      0.75      0.44       102

    accuracy                           0.76       807
   macro avg       0.63      0.76      0.64       807
weighted avg       0.87      0.76      0.79       807

Confusion Matrix:
[[533 172]
 [ 25  77]]


Trainer is attempting to log a value of "[0.879154078549849, 0.4482758620689655]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[582, 123], [37, 65]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9402261712439418, 'recall': 0.825531914893617, 'f1-score': 0.879154078549849, 'support': 705.0}, '1': {'precision': 0.34574468085106386, 'recall': 0.6372549019607843, 'f1-score': 0.4482758620689655, 'support': 102.0}, 'accuracy': 0.8017348203221809, 'macro avg': {'precision': 0.6429854260475029, 'recall': 0.7313934084272007, 'f1-score': 0.6637149703094072, 'support': 807.0}, 'weighted avg': {'precision': 0.8650872468076672, 'recall': 0.8017348203221809, 'f1-scor

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88       705
           1       0.35      0.64      0.45       102

    accuracy                           0.80       807
   macro avg       0.64      0.73      0.66       807
weighted avg       0.87      0.80      0.82       807

Confusion Matrix:
[[582 123]
 [ 37  65]]


TrainOutput(global_step=577, training_loss=1.2463309091339905, metrics={'train_runtime': 86.955, 'train_samples_per_second': 66.253, 'train_steps_per_second': 6.636, 'total_flos': 0.0, 'train_loss': 1.2463309091339905, 'epoch': 1.0})

In [None]:
model = HybridFakeNewsClassifierPrecomputed(
    bert_model_name='bert-base-uncased',
    entailment_threshold=0.8
)

update2_df = pd.concat([update3_df, update4_df], ignore_index=True)


headlines = headlines_update4
faiss_index = index_update4
train_samples = preprocess_hybrid_data(update2_df, k, entailment_model, entailment_tokenizer)
train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)


1000
2000


In [None]:
training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        learning_rate=5e-6,
        per_device_train_batch_size=10,
        evaluation_strategy="steps",
        logging_steps=100,
        save_steps=100,
        remove_unused_columns=False,
    )

trainer = Trainer(
    model=model,  # model loaded from checkpoint
    args=training_args,
    train_dataset=train_dataset,  # new or combined dataset
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
100,1.5266,0.726325,0.868649,"[0.922173274596182, 0.5793650793650794]","[[628, 77], [29, 73]]","{'0': {'precision': 0.9558599695585996, 'recall': 0.8907801418439716, 'f1-score': 0.922173274596182, 'support': 705.0}, '1': {'precision': 0.4866666666666667, 'recall': 0.7156862745098039, 'f1-score': 0.5793650793650794, 'support': 102.0}, 'accuracy': 0.8686493184634448, 'macro avg': {'precision': 0.7212633181126331, 'recall': 0.8032332081768878, 'f1-score': 0.7507691769806307, 'support': 807.0}, 'weighted avg': {'precision': 0.8965567268138943, 'recall': 0.8686493184634448, 'f1-score': 0.8788443577268233, 'support': 807.0}}"
200,1.2989,0.666901,0.883519,"[0.9329529243937232, 0.5566037735849056]","[[654, 51], [43, 59]]","{'0': {'precision': 0.9383070301291249, 'recall': 0.9276595744680851, 'f1-score': 0.9329529243937232, 'support': 705.0}, '1': {'precision': 0.5363636363636364, 'recall': 0.5784313725490197, 'f1-score': 0.5566037735849056, 'support': 102.0}, 'accuracy': 0.8835192069392813, 'macro avg': {'precision': 0.7373353332463806, 'recall': 0.7530454735085523, 'f1-score': 0.7447783489893145, 'support': 807.0}, 'weighted avg': {'precision': 0.8875037758985427, 'recall': 0.8835192069392813, 'f1-score': 0.885384630239449, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.922173274596182, 0.5793650793650794]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[628, 77], [29, 73]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9558599695585996, 'recall': 0.8907801418439716, 'f1-score': 0.922173274596182, 'support': 705.0}, '1': {'precision': 0.4866666666666667, 'recall': 0.7156862745098039, 'f1-score': 0.5793650793650794, 'support': 102.0}, 'accuracy': 0.8686493184634448, 'macro avg': {'precision': 0.7212633181126331, 'recall': 0.8032332081768878, 'f1-score': 0.7507691769806307, 'support': 807.0}, 'weighted avg': {'precision': 0.8965567268138943, 'recall': 0.8686493184634448, 'f1-score

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       705
           1       0.49      0.72      0.58       102

    accuracy                           0.87       807
   macro avg       0.72      0.80      0.75       807
weighted avg       0.90      0.87      0.88       807

Confusion Matrix:
[[628  77]
 [ 29  73]]


Trainer is attempting to log a value of "[0.9329529243937232, 0.5566037735849056]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[654, 51], [43, 59]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9383070301291249, 'recall': 0.9276595744680851, 'f1-score': 0.9329529243937232, 'support': 705.0}, '1': {'precision': 0.5363636363636364, 'recall': 0.5784313725490197, 'f1-score': 0.5566037735849056, 'support': 102.0}, 'accuracy': 0.8835192069392813, 'macro avg': {'precision': 0.7373353332463806, 'recall': 0.7530454735085523, 'f1-score': 0.7447783489893145, 'support': 807.0}, 'weighted avg': {'precision': 0.8875037758985427, 'recall': 0.8835192069392813, 'f1-sco

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       705
           1       0.54      0.58      0.56       102

    accuracy                           0.88       807
   macro avg       0.74      0.75      0.74       807
weighted avg       0.89      0.88      0.89       807

Confusion Matrix:
[[654  51]
 [ 43  59]]


TrainOutput(global_step=273, training_loss=1.363858121655363, metrics={'train_runtime': 44.9685, 'train_samples_per_second': 60.709, 'train_steps_per_second': 6.071, 'total_flos': 0.0, 'train_loss': 1.363858121655363, 'epoch': 1.0})

In [None]:
headlines = headlines_update3
faiss_index = index_update3
train_samples = preprocess_hybrid_data(update3_df, k, entailment_model, entailment_tokenizer)
train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)

1000
2000
3000


In [None]:
training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        learning_rate=5e-6,
        per_device_train_batch_size=10,
        evaluation_strategy="steps",
        logging_steps=100,
        save_steps=100,
        remove_unused_columns=False,
    )

trainer = Trainer(
    model=model,  # model loaded from checkpoint
    args=training_args,
    train_dataset=train_dataset,  # new or combined dataset
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
100,1.2262,0.633048,0.892193,"[0.9394572025052192, 0.5084745762711864]","[[675, 30], [57, 45]]","{'0': {'precision': 0.9221311475409836, 'recall': 0.9574468085106383, 'f1-score': 0.9394572025052192, 'support': 705.0}, '1': {'precision': 0.6, 'recall': 0.4411764705882353, 'f1-score': 0.5084745762711864, 'support': 102.0}, 'accuracy': 0.8921933085501859, 'macro avg': {'precision': 0.7610655737704918, 'recall': 0.6993116395494368, 'f1-score': 0.7239658893882028, 'support': 807.0}, 'weighted avg': {'precision': 0.8814156865134987, 'recall': 0.8921933085501859, 'f1-score': 0.8849835620146724, 'support': 807.0}}"
200,1.1871,0.622577,0.889715,"[0.9376313945339874, 0.5240641711229946]","[[669, 36], [53, 49]]","{'0': {'precision': 0.9265927977839336, 'recall': 0.948936170212766, 'f1-score': 0.9376313945339874, 'support': 705.0}, '1': {'precision': 0.5764705882352941, 'recall': 0.4803921568627451, 'f1-score': 0.5240641711229946, 'support': 102.0}, 'accuracy': 0.8897149938042132, 'macro avg': {'precision': 0.7515316930096139, 'recall': 0.7146641635377555, 'f1-score': 0.7308477828284909, 'support': 807.0}, 'weighted avg': {'precision': 0.8823394330082691, 'recall': 0.8897149938042132, 'f1-score': 0.8853589573742338, 'support': 807.0}}"
300,0.9565,0.615806,0.889715,"[0.9368346344925479, 0.5658536585365853]","[[660, 45], [44, 58]]","{'0': {'precision': 0.9375, 'recall': 0.9361702127659575, 'f1-score': 0.9368346344925479, 'support': 705.0}, '1': {'precision': 0.5631067961165048, 'recall': 0.5686274509803921, 'f1-score': 0.5658536585365853, 'support': 102.0}, 'accuracy': 0.8897149938042132, 'macro avg': {'precision': 0.7503033980582524, 'recall': 0.7523988318731748, 'f1-score': 0.7513441465145666, 'support': 807.0}, 'weighted avg': {'precision': 0.8901789259032014, 'recall': 0.8897149938042132, 'f1-score': 0.8899448457100099, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.9394572025052192, 0.5084745762711864]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[675, 30], [57, 45]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9221311475409836, 'recall': 0.9574468085106383, 'f1-score': 0.9394572025052192, 'support': 705.0}, '1': {'precision': 0.6, 'recall': 0.4411764705882353, 'f1-score': 0.5084745762711864, 'support': 102.0}, 'accuracy': 0.8921933085501859, 'macro avg': {'precision': 0.7610655737704918, 'recall': 0.6993116395494368, 'f1-score': 0.7239658893882028, 'support': 807.0}, 'weighted avg': {'precision': 0.8814156865134987, 'recall': 0.8921933085501859, 'f1-score': 0.88498356

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       705
           1       0.60      0.44      0.51       102

    accuracy                           0.89       807
   macro avg       0.76      0.70      0.72       807
weighted avg       0.88      0.89      0.88       807

Confusion Matrix:
[[675  30]
 [ 57  45]]


Trainer is attempting to log a value of "[0.9376313945339874, 0.5240641711229946]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[669, 36], [53, 49]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9265927977839336, 'recall': 0.948936170212766, 'f1-score': 0.9376313945339874, 'support': 705.0}, '1': {'precision': 0.5764705882352941, 'recall': 0.4803921568627451, 'f1-score': 0.5240641711229946, 'support': 102.0}, 'accuracy': 0.8897149938042132, 'macro avg': {'precision': 0.7515316930096139, 'recall': 0.7146641635377555, 'f1-score': 0.7308477828284909, 'support': 807.0}, 'weighted avg': {'precision': 0.8823394330082691, 'recall': 0.8897149938042132, 'f1-scor

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       705
           1       0.58      0.48      0.52       102

    accuracy                           0.89       807
   macro avg       0.75      0.71      0.73       807
weighted avg       0.88      0.89      0.89       807

Confusion Matrix:
[[669  36]
 [ 53  49]]


Trainer is attempting to log a value of "[0.9368346344925479, 0.5658536585365853]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[660, 45], [44, 58]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9375, 'recall': 0.9361702127659575, 'f1-score': 0.9368346344925479, 'support': 705.0}, '1': {'precision': 0.5631067961165048, 'recall': 0.5686274509803921, 'f1-score': 0.5658536585365853, 'support': 102.0}, 'accuracy': 0.8897149938042132, 'macro avg': {'precision': 0.7503033980582524, 'recall': 0.7523988318731748, 'f1-score': 0.7513441465145666, 'support': 807.0}, 'weighted avg': {'precision': 0.8901789259032014, 'recall': 0.8897149938042132, 'f1-score': 0.88994

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       705
           1       0.56      0.57      0.57       102

    accuracy                           0.89       807
   macro avg       0.75      0.75      0.75       807
weighted avg       0.89      0.89      0.89       807

Confusion Matrix:
[[660  45]
 [ 44  58]]


TrainOutput(global_step=378, training_loss=1.1234657247230488, metrics={'train_runtime': 55.8181, 'train_samples_per_second': 67.577, 'train_steps_per_second': 6.772, 'total_flos': 0.0, 'train_loss': 1.1234657247230488, 'epoch': 1.0})

In [None]:
headlines = headlines_update4
faiss_index = index_update4
train_samples = preprocess_hybrid_data(update4_df, k, entailment_model, entailment_tokenizer)
train_dataset = HybridFakeNewsPrecomputedDataset(train_samples)

1000


In [None]:
training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        learning_rate=5e-6,
        per_device_train_batch_size=10,
        evaluation_strategy="steps",
        logging_steps=100,
        save_steps=100,
        remove_unused_columns=False,
    )

trainer = Trainer(
    model=model,  # model loaded from checkpoint
    args=training_args,
    train_dataset=train_dataset,  # new or combined dataset
    eval_dataset=test_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1 Per Class,Confusion Matrix,Report
100,0.9532,0.620951,0.888476,"[0.9375, 0.4827586206896552]","[[675, 30], [60, 42]]","{'0': {'precision': 0.9183673469387755, 'recall': 0.9574468085106383, 'f1-score': 0.9375, 'support': 705.0}, '1': {'precision': 0.5833333333333334, 'recall': 0.4117647058823529, 'f1-score': 0.4827586206896552, 'support': 102.0}, 'accuracy': 0.8884758364312267, 'macro avg': {'precision': 0.7508503401360545, 'recall': 0.6846057571964956, 'f1-score': 0.7101293103448276, 'support': 807.0}, 'weighted avg': {'precision': 0.8760210403864148, 'recall': 0.8884758364312267, 'f1-score': 0.8800233944366107, 'support': 807.0}}"


Trainer is attempting to log a value of "[0.9375, 0.4827586206896552]" of type <class 'list'> for key "eval/f1_per_class" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[675, 30], [60, 42]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'0': {'precision': 0.9183673469387755, 'recall': 0.9574468085106383, 'f1-score': 0.9375, 'support': 705.0}, '1': {'precision': 0.5833333333333334, 'recall': 0.4117647058823529, 'f1-score': 0.4827586206896552, 'support': 102.0}, 'accuracy': 0.8884758364312267, 'macro avg': {'precision': 0.7508503401360545, 'recall': 0.6846057571964956, 'f1-score': 0.7101293103448276, 'support': 807.0}, 'weighted avg': {'precision': 0.8760210403864148, 'recall': 0.8884758364312267, 'f1-score': 0.8800233944366107,

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       705
           1       0.58      0.41      0.48       102

    accuracy                           0.89       807
   macro avg       0.75      0.68      0.71       807
weighted avg       0.88      0.89      0.88       807

Confusion Matrix:
[[675  30]
 [ 60  42]]


TrainOutput(global_step=169, training_loss=0.9968705939117973, metrics={'train_runtime': 23.5137, 'train_samples_per_second': 71.788, 'train_steps_per_second': 7.187, 'total_flos': 0.0, 'train_loss': 0.9968705939117973, 'epoch': 1.0})