In [None]:
import os
import re
from getpass import getpass

import faiss
import pandas as pd
import torch
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

In [None]:
# Run configuration: tune these before executing the cells below.
INPUT_CSV = "politifact-sample.csv"  # headlines to classify; needs a 'title' column
OUTPUT_CSV = "predictions.csv"       # destination file for the scored rows
K_RETRIEVE = 3                       # how many similar trusted headlines to retrieve



In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without ever writing the token into the
# notebook: prefer the HF_TOKEN environment variable and fall back to an
# interactive, non-echoing prompt. The token is passed straight to login() so it
# is not kept in a notebook variable.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))


In [None]:

# Retrieval resources: the sentence encoder (kept on CPU), the prebuilt FAISS
# index, and the headline texts that search results are mapped back to.
print("Loading similarity encoder & FAISS index …")
sim_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
faiss_index = faiss.read_index("faiss_live_headlines.index")

# NOTE(review): rows with a missing 'Headline' are dropped here, so list positions
# only line up with FAISS ids if the index was built from this same filtered
# list — confirm against the index-building code.
headlines_df = pd.read_csv("live_data_retrieval_headlines.csv")
trusted_headlines = list(headlines_df["Headline"].dropna())

def get_top_k_similar_headlines(query_headline: str, k: int = 3):
    """Return up to ``k`` trusted headlines most similar to ``query_headline``.

    The query is stripped, lower-cased, embedded with ``sim_model``,
    L2-normalised and searched against ``faiss_index``; the returned ids are
    mapped to entries of ``trusted_headlines``.

    Args:
        query_headline: Headline text to look up.
        k: Maximum number of neighbours to return.

    Returns:
        list[str]: Matching trusted headlines in the order FAISS returned them.
        Empty when ``k`` is not positive, or the query is not a string or is
        blank, so callers get "no evidence" instead of an exception.
    """
    # Nothing meaningful can be retrieved for these inputs; skip the encoder.
    if k <= 0 or not isinstance(query_headline, str):
        return []
    query = query_headline.strip().lower()
    if not query:
        return []

    query_emb = sim_model.encode([query], convert_to_numpy=True)
    # normalize_L2 works in place. NOTE(review): presumably the index stores
    # normalised vectors too (cosine similarity) — confirm.
    faiss.normalize_L2(query_emb)
    _, idxs = faiss_index.search(query_emb, k)

    # FAISS pads with -1 when it has fewer than k results; also ignore ids that
    # fall outside the headline list rather than raising IndexError.
    n_known = len(trusted_headlines)
    return [trusted_headlines[i] for i in idxs[0] if 0 <= i < n_known]


Loading similarity encoder & FAISS index …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install --upgrade "transformers>=4.40" bitsandbytes accelerate


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.6.0-py3-none-any.whl (354 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.5.2
    Uninstalling accelerate-1.5.2:
      Successfully uninstalled accelerate-1.5.2
Successfully installed accelerate-1.6.0 bitsandbytes-0.45.5


In [None]:



# Base instruction-tuned model and the fine-tuned adapter applied on top of it.
BASE_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
ADAPTER = "Phoenyx83/Meta-Llama-3-8B-Politifact-fake-news"

tokenizer = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)

# Passing `load_in_4bit=True` directly to from_pretrained is deprecated; the
# supported way to request 4-bit loading is a BitsAndBytesConfig.
quant_cfg = BitsAndBytesConfig(load_in_4bit=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID, torch_dtype=torch.float16, device_map="auto",
    quantization_config=quant_cfg)

# merge_and_unload() returns the model with the adapter weights merged in;
# keep that return value instead of discarding it.
model = PeftModel.from_pretrained(base, ADAPTER)
model = model.merge_and_unload()

# Greedy decoding of at most two new tokens. `temperature` is omitted because it
# has no effect when do_sample=False, and pad_token_id is set explicitly so
# generate() does not log a fallback warning on every call.
gen_cfg = GenerationConfig(
    max_new_tokens=2,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)





The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [None]:
# Normalised label strings that count as "real" (1) or "fake" (0). Besides the
# PolitiFact ratings, the sets contain fused forms such as 'labeltrue' or
# 'assistanttrue' that appear once punctuation is stripped from model output.
POSITIVE = {"true", "mostly true", "half true", "labeltrue", "labeltrusted", "assistanttrue"}
NEGATIVE = {"barely true", "false", "pants on fire", "labelfalse", "assistantfalse"}


def politifact_to_binary(label: str) -> int:
    """Map a textual label to a binary class.

    The label is lower-cased, every character that is neither a word character
    nor a space is removed, and runs of spaces are collapsed before lookup.

    Args:
        label: Raw label text (e.g. a PolitiFact rating or model output).

    Returns:
        1 if the normalised label is in ``POSITIVE``; 0 if it is in
        ``NEGATIVE``. Unknown or non-string labels also map to 0, and a
        message is printed so parse failures stay visible.
    """
    if not isinstance(label, str):
        # e.g. None or NaN from a missing value: cannot be normalised.
        print(f"Unrecognised label (treated as 0): {label!r}")
        return 0

    cleaned = re.sub(r"[^\w ]", "", label.lower())
    # Collapse repeated spaces so multi-word ratings such as "mostly  true" match.
    cleaned = " ".join(cleaned.split())

    if cleaned in POSITIVE:
        return 1
    if cleaned in NEGATIVE:
        return 0

    print(f"Unrecognised label (treated as 0): {cleaned!r}")
    return 0                 # unknown / parse failure


In [None]:

# System instruction placed at the start of every prompt.
SYSTEM = ("You are a fact‑checking assistant. "
          "Return exactly one label: "
          "True, False.")


def build_prompt(headline, evidence, use_retrieval=False):
    """Assemble the classification prompt for one headline.

    Args:
        headline: Headline text to classify.
        evidence: Iterable of similar trusted headlines. Only read when
            ``use_retrieval`` is true.
        use_retrieval: When true, add a numbered "Trusted similar headlines"
            section (or a placeholder sentence if ``evidence`` is empty)
            between the headline and the assistant tag. Defaults to False,
            which yields the headline-only prompt.

    Returns:
        str: The prompt, ending with the ``<|assistant|>`` tag.
    """
    parts = [
        f"<|system|>{SYSTEM}\n",
        "<|user|>\n",
        f"Headline:\n{headline}\n",
    ]
    if use_retrieval:
        numbered = "\n".join(
            f"{pos}. {text}" for pos, text in enumerate(evidence or [], start=1)
        )
        ev_block = numbered or "No similar trusted headlines found."
        parts.append(f"Trusted similar headlines:\n{ev_block}\n")
    parts.append("<|assistant|>")
    return "".join(parts)

def classify(headline, k=K_RETRIEVE):
    """Classify one headline as real (1) or fake (0) with the language model.

    Args:
        headline: Headline text to classify.
        k: Number of similar trusted headlines to retrieve and hand to
            ``build_prompt``.

    Returns:
        int: The result of ``politifact_to_binary`` applied to the last
        whitespace-separated token the model generated (empty string if it
        generated no text).
    """
    evidence = get_top_k_similar_headlines(headline, k)
    inputs = tokenizer(build_prompt(headline, evidence), return_tensors="pt").to(model.device)
    out = model.generate(**inputs, generation_config=gen_cfg)

    # For a causal LM the returned sequence starts with the prompt tokens; slice
    # them off so prompt text can never be mistaken for the predicted label.
    prompt_len = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    words = generated.split()
    label_text = words[-1] if words else ""
    return politifact_to_binary(label_text)




In [None]:
    # Score the first 30 rows of the input file and, when ground truth is
    # available, report precision/recall per class before saving the results.
    df = pd.read_csv(INPUT_CSV).head(30)
    if "title" not in df.columns:
        raise ValueError(f"Input CSV '{INPUT_CSV}' must have a 'title' column.")
    print(f"Classifying {len(df)} headlines …")
    df["pred_label"] = df["title"].apply(classify)

    if "label" in df.columns:
        print("\nEvaluation against ground truth:")
        # labels=[0, 1] pins the row order to target_names even if one class is
        # missing from this sample; zero_division=0 reports 0 for undefined
        # metrics without emitting a warning.
        # NOTE(review): assumes the 'label' column holds integer 0/1 — confirm.
        print(classification_report(df["label"], df["pred_label"],
                                    labels=[0, 1],
                                    target_names=["fake (0)", "real (1)"],
                                    digits=3, zero_division=0))
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved predictions to {OUTPUT_CSV}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Classifying 30 headlines …


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for


Evaluation against ground truth:
              precision    recall  f1-score   support

    fake (0)      0.885     0.852     0.868        27
    real (1)      0.000     0.000     0.000         3

    accuracy                          0.767        30
   macro avg      0.442     0.426     0.434        30
weighted avg      0.796     0.767     0.781        30

Saved predictions to predictions.csv


In [None]:
# Distribution of predicted classes (0 = fake, 1 = real) over the classified rows.
print(df["pred_label"].value_counts())

pred_label
0    261
1     18
Name: count, dtype: int64
