# Setup and Installation

In [2]:
!pip install -q langchain langchain-community chromadb sentence-transformers spacy transformers accelerate bitsandbytes
!python -m spacy download xx_sent_ud_sm


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00

In [1]:
import os
import json
from typing import List, Dict

import pandas as pd
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


# Loading TXT File

In [2]:
# Step 1: Load and structure your facts from a TXT file
#
# Input format: each fact may span several lines; facts are separated by one
# or more blank lines. The result is `facts`, a list of dicts with the keys
# "id", "source" and "statement".

facts = []

with open("facts.txt", "r", encoding="utf-8") as facts_file:
    content = facts_file.read()

# A separator line that contains only spaces/tabs should still count as blank,
# so drop trailing whitespace from every line before splitting on "\n\n".
content = "\n".join(line.rstrip() for line in content.split("\n"))

# Split on blank lines (two or more newlines) → each block = one fact
raw_blocks = [blk.strip() for blk in content.split("\n\n") if blk.strip()]

for index, block in enumerate(raw_blocks, start=1):
    # Join the lines inside each block into one single string
    fact_text = " ".join(
        line.strip() for line in block.splitlines() if line.strip()
    )

    facts.append({
        "id": f"fact_{index:03d}",  # auto ID: fact_001, fact_002, ...
        "source": "txt_file",       # you can change this later if needed
        "statement": fact_text
    })

print(f"Total facts loaded: {len(facts)}")
# Show the first 5 as a sanity check. The loop variable is deliberately not
# called `f`, which the original reused for both the file handle and the facts.
for fact in facts[:5]:
    print(fact["id"], "→", fact["statement"])


Total facts loaded: 35
fact_001 → Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the “Golden Jubilee Celebration of Indo-German Science & Technology Cooperation” followed by the Inauguration of Exhibition by the Hon’ble Minister.
fact_002 → Launch of Weather Forecast at Gram Panchayat level
fact_003 → Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the "Signing Ceremony of a Memorandum of Understanding (MoU) between the Department of Biotechnology (DBT) and Indian Space Research Organisation (ISRO)” for Cooperation in Space Biotechnology and Biomanufacturing followed by interaction with the Press/Media.
fact_004 → केंद्रीय मंत्री श्री धर्मेंद्र प्रधान कृषि, स्वास्थ्य और स्थाई शहरों से जुड़े विषयों पर 3 कृत्रिम बुद्धिमत्ता (एआई)- उत्कृष्टता क

In [3]:
# Dump the complete `facts` list (every loaded record) for manual inspection.
print(facts)

[{'id': 'fact_001', 'source': 'txt_file', 'statement': 'Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the “Golden Jubilee Celebration of Indo-German Science & Technology Cooperation” followed by the Inauguration of Exhibition by the Hon’ble Minister.'}, {'id': 'fact_002', 'source': 'txt_file', 'statement': 'Launch of Weather Forecast at Gram Panchayat level'}, {'id': 'fact_003', 'source': 'txt_file', 'statement': 'Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the "Signing Ceremony of a Memorandum of Understanding (MoU) between the Department of Biotechnology (DBT) and Indian Space Research Organisation (ISRO)” for Cooperation in Space Biotechnology and Biomanufacturing followed by interaction with the Press/Media.'}, {'id': 'fact_004', 's

# Chroma Vector Store creation

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import shutil # Added for removing directory

# Folder in which Chroma keeps its on-disk index
PERSIST_DIR = "fact_chroma_db"

# Split the fact records into the three parallel lists handed to Chroma:
# ids, raw statement texts, and per-document metadata.
ids = [fact["id"] for fact in facts]
texts = [fact["statement"] for fact in facts]
metadatas = [{key: fact[key] for key in ("id", "source")} for fact in facts]

# Sentence-embedding model used by the vector store
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Wipe any previous index first so the collection is rebuilt from scratch
# (keeps the stored embedding dimension consistent with the current model).
shutil.rmtree(PERSIST_DIR, ignore_errors=True)
vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_model,
    collection_name="gov_facts",
)

# Embed and store every fact, then flush the index to disk
vectorstore.add_texts(ids=ids, metadatas=metadatas, texts=texts)
vectorstore.persist()

# Retriever used by later cells; returns the 5 nearest facts per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

print("Vector store ready. Total facts embedded:", len(facts))


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vectorstore = Chroma(


Vector store ready. Total facts embedded: 35


  vectorstore.persist()


# Claim Extraction using spaCy

In [5]:
# Load the spaCy pipeline downloaded in the setup cell. Note: "xx" is spaCy's
# multi-language code, so this is not an English-specific model; it is used
# below only for sentence splitting (`doc.sents`).
nlp = spacy.load("xx_sent_ud_sm")

def extract_claims(text: str, min_chars: int = 5) -> List[Dict]:
    """Split ``text`` into sentence-level claims using the module-level ``nlp``.

    Args:
        text: Free text to segment into sentences.
        min_chars: Sentences whose stripped length is below this many
            characters are dropped as noise. Defaults to 5, the threshold
            that was previously hard-coded.

    Returns:
        One dict per kept sentence, in document order, with the keys
        ``"claim_text"`` (the stripped sentence) and ``"entities"``
        (always an empty list for now).
    """
    claims = []

    for sent in nlp(text).sents:
        sent_text = sent.text.strip()
        if len(sent_text) < min_chars:
            continue  # skip very short bits

        claims.append({
            "claim_text": sent_text,
            "entities": []  # we skip entities for now, Hindi NER is not covered here
        })

    return claims

In [6]:
# 🔎 Quick test: run the claim extractor on a single Hindi sentence and print
# each extracted claim together with its (currently always empty) entity list.
sample_text = "केंद्रीय मंत्री श्री धर्मेंद्र प्रधान कृषि, स्वास्थ्य और स्थाई शहरों से जुड़े विषयों पर 3 कृत्रिम बुद्धिमत्ता (एआई)- उत्कृष्टता केंद्रों का शुभारंभ करेंगे।"
claims = extract_claims(sample_text)
print("Extracted claims:")
for c in claims:
    print(" -", c["claim_text"], "| entities:", c["entities"])

Extracted claims:
 - केंद्रीय मंत्री श्री धर्मेंद्र प्रधान कृषि, स्वास्थ्य और स्थाई शहरों से जुड़े विषयों पर 3 कृत्रिम बुद्धिमत्ता (एआई)- उत्कृष्टता केंद्रों का शुभारंभ करेंगे। | entities: []


# Mistral 7B Instruct

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline


model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantisation settings. Passing `load_in_4bit=...` straight to
# `from_pretrained` is deprecated (see the warning in this cell's output);
# the supported route is a `BitsAndBytesConfig` given as `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

# Greedy (non-sampling) text-generation pipeline used by `call_llm`.
gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id  # reuse EOS as the padding token id
)

def call_llm(prompt: str) -> str:
    """Return the model's completion for ``prompt`` via the module-level ``gen_pipe``.

    The pipeline is asked for the newly generated text only
    (``return_full_text=False``). The previous approach sliced
    ``len(prompt)`` characters off the full output, which silently corrupts
    the completion whenever the echoed prompt does not match the input
    character-for-character.

    Args:
        prompt: The full prompt text to send to the model.

    Returns:
        The generated continuation with surrounding whitespace stripped.
    """
    outputs = gen_pipe(
        prompt,
        max_new_tokens=512,
        do_sample=False,
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [8]:
# STEP 6 — Retrieval function

def retrieve_facts_for_claim(claim_text: str, k: int = 5):
    """Fetch the ``k`` stored facts most similar to ``claim_text``.

    Queries the module-level ``vectorstore`` directly so that ``k`` is
    actually honoured; previously the argument was ignored and the
    retriever's fixed setting was always used.

    Args:
        claim_text: The claim to look up.
        k: Maximum number of facts to return (default 5).

    Returns:
        A list of dicts with the keys ``"id"``, ``"source"`` and
        ``"statement"``, in the order returned by the vector store.
        ``"id"`` / ``"source"`` are ``None`` when absent from a document's
        metadata.
    """
    docs = vectorstore.similarity_search(claim_text, k=k)
    return [
        {
            "id": doc.metadata.get("id"),
            "source": doc.metadata.get("source"),
            "statement": doc.page_content,
        }
        for doc in docs
    ]


In [9]:
import json
import re

def _extract_json_object(raw: str):
    """Return the first JSON object (dict) embedded in ``raw``, or ``None``."""
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This "{" did not open a valid object; try the next one.
        start = raw.find("{", start + 1)
    return None


def _normalise_verdict(data: dict) -> dict:
    """Guarantee the "verdict", "evidence" (list) and "reasoning" keys exist."""
    result = dict(data)
    result.setdefault("verdict", "Unverifiable")
    evidence = result.get("evidence")
    if isinstance(evidence, str):
        evidence = [evidence]  # a single id given as a bare string
    elif not isinstance(evidence, list):
        evidence = []
    result["evidence"] = evidence
    result.setdefault("reasoning", "")
    return result


def classify_claim_with_evidence(claim_text: str, retrieved_facts):
    """Ask the LLM whether ``retrieved_facts`` support or contradict ``claim_text``.

    Args:
        claim_text: The claim to check.
        retrieved_facts: Iterable of dicts with at least the keys
            ``"statement"`` and ``"id"``.

    Returns:
        A dict that always contains ``"verdict"``, ``"evidence"`` (a list)
        and ``"reasoning"``. When no facts are supplied, or no JSON object
        can be recovered from the model output, the verdict is
        ``"Unverifiable"`` with an explanatory ``"reasoning"``.
    """
    if not retrieved_facts:
        return {
            "verdict": "Unverifiable",
            "evidence": [],
            "reasoning": "No relevant facts were retrieved from the trusted fact base."
        }

    facts_str = "".join(
        f"- {f['statement']} (id: {f['id']})\n" for f in retrieved_facts
    )

    prompt = f"""
You are a STRICT fact-checking assistant.

TASK:
Compare the CLAIM only with the VERIFIED FACTS provided.
Decide if the claim is supported, contradicted, or not decidable.

Rules:
- If facts clearly SUPPORT the claim → verdict = "Likely True"
- If facts clearly CONTRADICT the claim → verdict = "Likely False"
- If facts are related but do NOT clearly support or contradict → verdict = "Unverifiable"

Return ONLY a JSON object in this exact format:

{{
  "verdict": "Likely True | Likely False | Unverifiable",
  "evidence": ["id1", "id2"],
  "reasoning": "short explanation"
}}

CLAIM:
{claim_text}

VERIFIED FACTS:
{facts_str}
"""

    raw = call_llm(prompt)
    # For debugging, you can temporarily print:
    # print("RAW MODEL OUTPUT:\n", raw)

    # Scan for the first well-formed JSON object. Unlike a greedy "{.*}"
    # regex this tolerates chatter (even with braces) before or after it.
    data = _extract_json_object(raw)
    if data is not None:
        return _normalise_verdict(data)

    print("JSON parse error: no valid JSON object found in model output")
    print("RAW OUTPUT (first 300 chars):", raw[:300])

    # Final fallback
    return {
        "verdict": "Unverifiable",
        "evidence": [],
        "reasoning": f"Could not parse JSON from model output. Raw: {raw[:200]}"
    }

In [10]:


def check_text(text: str):
    """Fact-check every claim found in ``text``.

    For each sentence returned by ``extract_claims`` the function retrieves
    candidate facts, asks ``classify_claim_with_evidence`` for a verdict and
    resolves the cited evidence ids back to their fact statements.

    Missing or malformed fields in the classifier's answer are tolerated:
    the verdict defaults to ``"Unverifiable"``, the evidence to an empty
    list and the reasoning to an empty string, instead of raising
    ``KeyError``.

    Args:
        text: Free text containing one or more claims.

    Returns:
        ``{"input_text": text, "results": [...]}`` where each result has the
        keys ``"claim"``, ``"entities"``, ``"verdict"``, ``"emoji_verdict"``,
        ``"evidence_ids"``, ``"evidence"`` and ``"reasoning"``.
    """
    emoji_map = {
        "Likely True": "✅ True",
        "Likely False": "❌ False",
        "Unverifiable": "🤷‍♂️ Unverifiable"
    }

    results = []

    for c in extract_claims(text):
        claim_text = c["claim_text"]

        retrieved = retrieve_facts_for_claim(claim_text)
        verdict_data = classify_claim_with_evidence(claim_text, retrieved)
        if not isinstance(verdict_data, dict):
            verdict_data = {}

        verdict = verdict_data.get("verdict", "Unverifiable")
        if not isinstance(verdict, str):
            verdict = "Unverifiable"

        evidence_ids = verdict_data.get("evidence")
        if isinstance(evidence_ids, str):
            evidence_ids = [evidence_ids]  # a single id given as a bare string
        elif not isinstance(evidence_ids, list):
            evidence_ids = []

        reasoning = verdict_data.get("reasoning", "")

        # id -> statement lookup; the first occurrence wins, matching the
        # behaviour of a first-match linear search.
        statement_by_id = {}
        for fact in retrieved:
            statement_by_id.setdefault(fact["id"], fact["statement"])

        # Keep only ids that refer to a retrieved fact (ignore invented ids).
        evidence_texts = [
            statement_by_id[ev]
            for ev in evidence_ids
            if isinstance(ev, str) and ev in statement_by_id
        ]

        results.append({
            "claim": claim_text,
            "entities": c["entities"],
            "verdict": verdict,
            "emoji_verdict": emoji_map.get(verdict, "🤷‍♂️ Unverifiable"),
            "evidence_ids": evidence_ids,
            "evidence": evidence_texts,
            "reasoning": reasoning
        })

    return {"input_text": text, "results": results}


In [16]:
# End-to-end demo: the sample text below is used as the claim to verify.
sample = "Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the “Golden Jubilee Celebration of Indo-German Science & Technology Cooperation” followed by the Inauguration of Exhibition by the Hon’ble Minister."

# Run the full pipeline (claim extraction → retrieval → LLM verdict).
output = check_text(sample)

# Pretty-print the nested result dict with a wide line limit for readability.
import pprint
pprint.pprint(output, width=120)


{'input_text': 'Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public '
               'Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the “Golden Jubilee '
               'Celebration of Indo-German Science & Technology Cooperation” followed by the Inauguration of '
               'Exhibition by the Hon’ble Minister.',
 'results': [{'claim': 'Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personnel, Public '
                       'Grievances, Pensions, Atomic Energy and Space, Dr. Jitendra Singh will attend the “Golden '
                       'Jubilee Celebration of Indo-German Science & Technology Cooperation” followed by the '
                       'Inauguration of Exhibition by the Hon’ble Minister.',
              'emoji_verdict': '✅ True',
              'entities': [],
              'evidence': ['Union Minister of State (Independent Charge) Science & Technology; MoS PMO, Personn