## Preprocessing

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


def remove_html_tags(raw_html):
    """Return the text content of ``raw_html`` with all markup removed.

    Text nodes are joined with a single space. Without a separator, text from
    adjacent elements is glued together (``<h1>a test</h1><p>Hello</p>``
    becomes ``a testHello``), which later produces fused tokens such as
    ``testhello``. The separator can leave surplus whitespace, so callers that
    need tidy spacing should collapse whitespace afterwards.

    Args:
        raw_html: HTML (or plain text) as a string.

    Returns:
        The extracted text as a string.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text(separator=" ")

def remove_special_chars(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    disallowed = r'[^a-zA-Z0-9\s]'
    cleaned = re.sub(disallowed, '', text)
    return cleaned

def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split with NLTK's ``word_tokenize``; a token is discarded when
    its lowercase form is in NLTK's English stopword list. The surviving
    tokens are returned joined by single spaces.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

def normalize(text):
    """Lowercase ``text``, collapse each whitespace run to one space, and trim the ends."""
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

def preprocess_text(raw_html):
    """Run the full cleaning pipeline on ``raw_html`` and return the cleaned text.

    Stages are applied in order: strip HTML tags, normalize case/whitespace,
    remove special characters, then remove English stopwords.
    """
    stages = (
        remove_html_tags,
        normalize,
        remove_special_chars,
        remove_stopwords,
    )
    text = raw_html
    for stage in stages:
        text = stage(text)
    return text

# Smoke test: push a tiny HTML document through the pipeline and show the result.
if __name__ == "__main__":
    sample_html = "<html><body><h1>This is a test</h1><p>Hello world!</p></body></html>"
    cleaned_sample = preprocess_text(sample_html)
    print(cleaned_sample)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


testhello world


### Converting audio to text

In [None]:
# Install (or upgrade) the openai-whisper package, which provides the
# `whisper` module imported by the transcription cell below.
!pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/803.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [None]:
import os

# Create the working folders for uploaded media; folders that already exist are left as they are.
for media_dir in ("images", "audios"):
    os.makedirs(media_dir, exist_ok=True)

In [None]:
from google.colab import files

# Call Colab's upload helper and keep whatever it returns in `uploaded`.
# NOTE(review): presumably a mapping of saved filename -> file contents — confirm against the Colab docs.
uploaded = files.upload()

Saving news.mp3 to news (1).mp3


In [None]:
import shutil

# Move the uploaded audio file into the "audios" working folder.
# NOTE(review): the source name is hard-coded. The upload log for this run reads
# "Saving news.mp3 to news (1).mp3", so the file moved here may be an older copy
# rather than the fresh upload — consider deriving the name from `uploaded`.
shutil.move("news.mp3", "audios/news.mp3")

'audios/news.mp3'

In [None]:
import whisper
import os

# Loaded Whisper models keyed by model name, so a folder of files does not
# reload the model weights for every single file.
_WHISPER_MODEL_CACHE = {}


def transcribe_audio(file_path, model=None, model_name="base"):
    """Transcribe one audio file with Whisper and return the recognized text.

    Args:
        file_path: Path of the audio file to transcribe.
        model: Optional already-loaded model object exposing ``transcribe(path)``.
            When given, it is used as-is and nothing is loaded.
        model_name: Whisper model to load when ``model`` is not supplied
            (e.g. "base", "small", "medium"). Loaded models are cached per
            name and reused on later calls.

    Returns:
        The ``"text"`` field of the transcription result.
    """
    if model is None:
        model = _WHISPER_MODEL_CACHE.get(model_name)
        if model is None:
            model = whisper.load_model(model_name)
            _WHISPER_MODEL_CACHE[model_name] = model

    print(f"Transcribing: {file_path} ...")
    result = model.transcribe(file_path)
    return result["text"]

def process_audio_folder(folder_path, output_folder="transcriptions"):
    """Transcribe every audio file in ``folder_path`` into ``output_folder``.

    Files whose name ends (case-insensitively) with .mp3, .wav or .m4a are
    passed to ``transcribe_audio``; each transcript is written as UTF-8 to
    ``<output_folder>/<original name without extension>.txt``. The output
    folder is created if it does not exist. Other files are ignored.
    """
    audio_extensions = (".mp3", ".wav", ".m4a")
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(audio_extensions):
            continue

        text = transcribe_audio(os.path.join(folder_path, entry))

        stem, _extension = os.path.splitext(entry)
        target_path = os.path.join(output_folder, stem + ".txt")
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(text)

    print("All audio files transcribed.")

# Example: transcribe everything currently in the "audios" folder.
if __name__ == "__main__":
    audio_dir = "audios"
    process_audio_folder(audio_dir)

100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 46.6MiB/s]


Transcribing: audios/news.mp3 ...




All audio files transcribed.


### Applying OCR to images

In [None]:
# Upload the image to run OCR on. This relies on `files` having been imported
# by the earlier upload cell and rebinds the `uploaded` variable.
uploaded = files.upload()

Saving fake_news.jpg to fake_news.jpg


In [None]:
# Move the uploaded image into the "images" working folder.
# Relies on `shutil` imported by an earlier cell; the file name is hard-coded.
shutil.move("fake_news.jpg", "images/fake_news.jpg")

'images/fake_news.jpg'

In [None]:
# Install the tesseract-ocr system package and the pytesseract Python package
# (pytesseract is imported by the OCR cell below).
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
from PIL import Image
import pytesseract
import os

def ocr_image(image_path):
    """Extract text from a single image with Tesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with surrounding whitespace stripped, or ``None``
        if the image could not be opened or processed (the error is printed).
    """
    try:
        # Open inside a context manager so the file handle is released even
        # when OCR fails; the original left the image open.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        # Deliberate best-effort: one bad image must not abort a whole folder run.
        print(f"Error processing {image_path}: {e}")
        return None

def process_folder(folder_path, output_folder="ocr_results"):
    """Run OCR on every image in ``folder_path`` and save the text found.

    Files whose name ends (case-insensitively) with .png, .jpg, .jpeg or .tiff
    are passed to ``ocr_image``. When that yields non-empty text it is written
    as UTF-8 to ``<output_folder>/<original name without extension>.txt``;
    images with no text (or that failed) produce no file. The output folder is
    created if it does not exist.
    """
    image_extensions = (".png", ".jpg", ".jpeg", ".tiff")
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(image_extensions):
            continue

        source_path = os.path.join(folder_path, entry)
        print(f"Processing: {entry}")
        recognized = ocr_image(source_path)
        if not recognized:
            continue

        stem = os.path.splitext(entry)[0]
        target_path = os.path.join(output_folder, stem + ".txt")
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(recognized)

    print("OCR completed for all images.")

# Example: OCR everything currently in the "images" folder.
if __name__ == "__main__":
    image_dir = "images"
    process_folder(image_dir)

Processing: fake_news.jpg
OCR completed for all images.


## LLM

In [None]:
# Install the packages imported by the cells below:
# groq (`from groq import Groq`), wikipedia-api (`import wikipediaapi`), ddgs (`from ddgs import DDGS`).
!pip install groq wikipedia-api ddgs

Collecting ddgs
  Downloading ddgs-9.2.3-py3-none-any.whl.metadata (16 kB)
Downloading ddgs-9.2.3-py3-none-any.whl (30 kB)
Installing collected packages: ddgs
Successfully installed ddgs-9.2.3


In [None]:
import os

# Read the Groq key from the environment so the real credential is never
# pasted into (and saved with) the notebook. Falls back to the original empty
# string when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

In [None]:
from groq import Groq

class QwenChatbot:
  """Small wrapper around the Groq chat-completions client for fact-checking.

  It offers two calls: ``summarize_prompt`` turns a claim into a search-engine
  query, and ``check_truthiness`` grades a claim against search results.
  Both return the assistant message with any ``<think>...</think>`` reasoning
  removed, so reasoning text never leaks into search queries or results.
  """

  # Model id sent with every request; can be overridden per instance.
  model = "qwen/qwen3-32b"

  def __init__(self, api_key, model=None):
    """Create the Groq client.

    Args:
      api_key: Groq API key.
      model: Optional model id overriding the class default.
    """
    self.client = Groq(
        api_key=api_key,
    )
    if model is not None:
      self.model = model

  @staticmethod
  def _strip_reasoning(content):
    """Return ``content`` without ``<think>`` reasoning blocks, trimmed."""
    if not content:
      return ""
    cleaned = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL | re.IGNORECASE)
    # An unclosed <think> means the reply was cut off mid-reasoning; whatever
    # follows the tag is reasoning, not an answer, so drop it.
    cleaned = re.split(r"<think>", cleaned, maxsplit=1, flags=re.IGNORECASE)[0]
    return cleaned.strip()

  def summarize_prompt(self, claims):
    """Turn the user's claim/news text into a search-engine query.

    Args:
      claims: The claim or news text supplied by the user.

    Returns:
      The search query as plain text: reasoning blocks removed and wrapping
      double quotes stripped (a fully quoted query would be treated as an
      exact-phrase search by the search engine).
    """
    completion = self.client.chat.completions.create(
        model=self.model,
        messages=[
            {
                "role": "system",
                "content": (
                  "You are a fact-checking assistant. When the user makes a claim or shares news, "
                  # Trailing space added: the sentences used to be fused ("engine.You").
                  "summarize it for the user and rephrase it to a search prompt to be used in the search engine. "
                  "You only need to return the search prompt, do not reply anything unrelevant"
                ),
            },
            {
                "role": "user",
                "content": claims,
            }
        ],
        temperature=0.2,
        max_completion_tokens=4096,
        top_p=0.95,
        reasoning_format="hidden"
    )
    answer = self._strip_reasoning(completion.choices[0].message.content)
    return answer.strip('"').strip()

  def check_truthiness(self, search_results, claim):
    """Grade ``claim`` on a 0-5 truthfulness scale using ``search_results``.

    Args:
      search_results: Evidence text (search hits) the model may cite.
      claim: The original claim from the user.

    Returns:
      The model's evaluation text with reasoning blocks removed.
    """
    # system prompt
    grading_prompt = """
    You are tasked with evaluating the truthfulness of a given input statement.
    Assign a numerical score from 0 to 5 based on the following grading scale:

    5 – Completely True: All claims are verifiable, accurate, and supported by reliable evidence.
    4 – Mostly True: Minor inaccuracies may exist, but the core facts are accurate and not misleading.
    3 – Half True: Roughly an equal mix of accurate and inaccurate or misleading information.
    2 – Mostly False: A small element of truth exists, but the claim is mostly inaccurate or misrepresented.
    1 – Completely False: The statement is entirely inaccurate, fabricated, or contradicted by reliable sources.
    0 – Not Evaluated: There is insufficient information to determine the truthfulness of the statement.

    Your task:
    1. Assign a score (0–5).
    2. Provide a concise explanation.
    3. Clearly state which parts of the claim are TRUE and which are FALSE.
    4. For each true/false part, cite supporting links from the following search results.
    """

    completion = self.client.chat.completions.create(
        model=self.model,
        messages=[
            {
                "role": "system",
                "content": grading_prompt
            },
            {
                "role": "user",
                "content": f"Here is the claim from the user: {claim} and here is the search results: {search_results}. Cite supporting links from the following search results only"
            }
        ],
        temperature=0.5,
        reasoning_format="hidden"
    )
    return self._strip_reasoning(completion.choices[0].message.content)


## Web Search

In [None]:

import requests
from bs4 import BeautifulSoup
import wikipediaapi

import os

# API keys are read from the environment so real credentials are never saved
# with the notebook. The original placeholder strings remain the fallback, so
# behavior is unchanged when the variables are not set.
GOOGLE_FACT_CHECK_API_KEY = os.environ.get("GOOGLE_FACT_CHECK_API_KEY", "YOUR_GOOGLE_API_KEY")
GNEWS_API_KEY = os.environ.get("GNEWS_API_KEY", "YOUR_GNEWS_API_KEY")

def verify_with_google_fact_check(claim):
    """Look up ``claim`` in the Google Fact Check Tools claim-search endpoint.

    The claim is sent via ``params`` so it is URL-encoded; the original
    interpolated it raw into the URL, which breaks on characters such as
    ``&`` or ``#``.

    Args:
        claim: Claim text to search for.

    Returns:
        A list of ``"<claim text> - <textual rating>"`` strings,
        ``["No result."]`` when nothing matched, or
        ``["Google Fact Check API error."]`` on a network failure or a
        non-200 response.
    """
    try:
        response = requests.get(
            "https://factchecktools.googleapis.com/v1alpha1/claims:search",
            params={"query": claim, "key": GOOGLE_FACT_CHECK_API_KEY},
            timeout=10,  # never hang the notebook on an unresponsive server
        )
    except requests.RequestException:
        return ["Google Fact Check API error."]
    if response.status_code != 200:
        return ["Google Fact Check API error."]

    claims = response.json().get("claims", [])
    if not claims:
        return ["No result."]

    results = []
    for entry in claims:
        # Entries may lack a review or rating; do not crash on partial data.
        reviews = entry.get("claimReview") or [{}]
        rating = reviews[0].get("textualRating", "Unrated")
        results.append(entry.get("text", "") + " - " + rating)
    return results

def verify_with_gnews(claim):
    """Search GNews for English articles related to ``claim``.

    The query is sent via ``params`` so it is URL-encoded rather than
    interpolated raw into the URL.

    Args:
        claim: Claim text to search for.

    Returns:
        Up to three ``"<title> - <source name>"`` strings,
        ``["No GNews result."]`` when no article matched (mirroring the other
        checkers, which also return a message instead of an empty list), or
        ``["GNews API error."]`` on a network failure or a non-200 response.
    """
    try:
        response = requests.get(
            "https://gnews.io/api/v4/search",
            params={"q": claim, "token": GNEWS_API_KEY, "lang": "en"},
            timeout=10,  # never hang the notebook on an unresponsive server
        )
    except requests.RequestException:
        return ["GNews API error."]
    if response.status_code != 200:
        return ["GNews API error."]

    articles = response.json().get("articles", [])
    if not articles:
        return ["No GNews result."]

    results = []
    for article in articles[:3]:
        # Tolerate articles with a missing title or source block.
        source_name = (article.get("source") or {}).get("name", "")
        results.append(f"{article.get('title', '')} - {source_name}")
    return results

def verify_with_wikipedia(claim):
    """Fetch the English Wikipedia page titled ``claim`` and return its summary.

    Returns a one-element list: the first 500 characters of the page summary,
    or ``"No Wikipedia match."`` when no page with that title exists.
    """
    client = wikipediaapi.Wikipedia(
        language="en",
        user_agent="FactCheckBot/1.0 (contact: youremail@example.com)",
    )
    article = client.page(claim)
    if not article.exists():
        return ["No Wikipedia match."]
    return [article.summary[:500]]


def verify_with_snopes(claim):
    """Scrape the Snopes site search for entries matching ``claim``.

    The query is sent via ``params`` so every special character is encoded;
    the original only replaced spaces.

    Args:
        claim: Claim text to search for.

    Returns:
        Up to three result titles, ``["No Snopes result."]`` when the page has
        no matching entries, or ``["Snopes request error."]`` on a network
        failure or a non-200 response.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(
            "https://www.snopes.com/",
            params={"s": claim},
            headers=headers,
            timeout=10,  # never hang the notebook on an unresponsive server
        )
    except requests.RequestException:
        return ["Snopes request error."]
    if response.status_code != 200:
        return ["Snopes request error."]

    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): this selector depends on Snopes' current page markup — verify it still matches.
    results = soup.select("article h2.entry-title a")
    if not results:
        return ["No Snopes result."]
    return [link.text.strip() for link in results[:3]]

def verify_claim(claim):
    """Check ``claim`` against every configured source and print the findings.

    Sources are queried in this order: Google Fact Check, Wikipedia, GNews,
    Snopes. Returns a dict mapping each source label to its list of results.
    """
    print(f"\n🔎 Verifying Claim: \"{claim}\"\n")

    checkers = (
        ("Google Fact Check", verify_with_google_fact_check),
        ("Wikipedia Summary", verify_with_wikipedia),
        ("GNews Articles", verify_with_gnews),
        ("Snopes Results", verify_with_snopes),
    )
    sources = {label: checker(claim) for label, checker in checkers}

    for label, findings in sources.items():
        print(f"{label}:")
        for finding in findings:
            print("  •", finding)
        print()
    return sources



In [None]:
from ddgs import DDGS

def verify_with_duckduckgo(query, max_results=5):
    """Run a DuckDuckGo text search and format each hit as one line.

    Each hit becomes ``"<title>: <body> (Source: <href>)"``; a missing field
    is rendered as an empty string. Returns the list of formatted lines.
    """
    formatted = []
    with DDGS() as client:
        hits = client.text(query, max_results=max_results)
        for hit in hits:
            title, snippet, url = (hit.get(key, "") for key in ("title", "body", "href"))
            formatted.append(f"{title}: {snippet} (Source: {url})")
    return formatted

## Fact Check Pipeline

In [None]:
def fact_check_pipeline(user_input):
    """Fact-check ``user_input`` end to end and return the model's evaluation.

    Steps: ask the chatbot for a search prompt, collect DuckDuckGo results for
    it, flatten them into one evidence string, then have the chatbot grade the
    original input against that evidence. Progress is printed along the way.
    """
    chatbot = QwenChatbot(api_key=GROQ_API_KEY)

    print("\n Summarizing input and generating search prompt...\n")
    search_prompt = chatbot.summarize_prompt(user_input)
    print("Search Prompt:\n", search_prompt, "\n")

    sources = {
        "DuckDuckGo": verify_with_duckduckgo(search_prompt)
    }

    # One header per source followed by one bullet line per result.
    evidence_parts = []
    for source, entries in sources.items():
        evidence_parts.append(f"\n{source}:\n")
        evidence_parts.extend(f"• {item}\n" for item in entries)
    combined_evidence = "".join(evidence_parts)

    print("Combined Evidence:\n", combined_evidence, "\n")
    print("Evidence collected. Evaluating truthfulness...\n")
    evaluation = chatbot.check_truthiness(combined_evidence, user_input)
    print("Evaluation Result:\n", evaluation)

    return evaluation

# Smoke test: run the full pipeline on one hard-coded claim.
# The call is the cell's last expression, so the returned evaluation is also
# echoed as the cell output (in addition to what the pipeline prints).
test_claim = "COVID-19 vaccines cause infertility"
fact_check_pipeline(test_claim)


 Summarizing input and generating search prompt...

Search Prompt:
 "COVID-19 vaccines and infertility: scientific studies or health organization statements on potential link" 

Combined Evidence:
 
DuckDuckGo:
• The impact of COVID-19 vaccines on fertility-A systematic ...: by D Zaçe · 2022 · Cited by 87 — Based on the studies published so far, there is no scientific proof of any association between COVID-19 vaccines and fertility impairment in men or women. (Source: https://pmc.ncbi.nlm.nih.gov/articles/PMC9464596/)
• COVID-19 Vaccination for People Who Would Like to Have ...: 10 Sept 2024 — Despite these temporary changes in menstruation, there is no evidence that COVID-19 vaccines cause fertility problems. Research Studies of ... (Source: https://www.cdc.gov/covid/vaccines/planning-for-pregnancy.html)
• Associations between inactivated COVID-19 vaccination ...: by D Liu · 2025 — This research fills critical knowledge gaps regarding the impact of inactivated COVID-19 vaccines on IV

'<think>\nOkay, let\'s tackle this. The user is claiming that COVID-19 vaccines cause infertility. My job is to evaluate this based on the provided sources.\n\nFirst, I\'ll go through each of the search results they provided. The first one from PMC says there\'s no scientific proof of any association between the vaccines and fertility impairment in men or women. That\'s a solid starting point. The CDC article from 2024 also mentions no evidence of fertility problems, except for temporary menstrual changes. Another study from 2025 on inactivated vaccines and IVF outcomes didn\'t find negative impacts. The AMA article confirms vaccines don\'t alter fertility, though it mentions a study on temporary menstrual changes. Finally, Mayo Clinic debunking the myth directly.\n\nPutting it all together: All the sources consistently state there\'s no evidence linking the vaccines to infertility. The user\'s claim is entirely false. The only minor note is some temporary menstrual changes, but that\'

In [None]:
def fact_check_image(file_path):
  """Fact-check the OCR text stored in ``file_path``.

  Args:
    file_path: Path of the UTF-8 text file holding the OCR output of an image.

  Returns:
    The evaluation returned by ``fact_check_pipeline``.
  """
  # Read the OCR output from the path the caller passed in; the original
  # ignored the argument and always opened "ocr_results/fake_news.txt".
  with open(file_path, "r", encoding="utf-8") as f:
      ocr_text = f.read()

  # Run fact-checking on the extracted text
  return fact_check_pipeline(ocr_text)

# Test with the OCR text file written by the OCR step for the uploaded image
file_path = "ocr_results/fake_news.txt"
fact_check_image(file_path)


 Summarizing input and generating search prompt...

Search Prompt:
 <think>
Okay, let's try to figure out what the user is asking here. The input looks like a bunch of random letters and maybe some typos. "celel ele eT ee Be a a" and "KEE Arocatrese meow!" don't make much sense at first glance. The user might have made a mistake while typing, or maybe they're trying to test the system with gibberish.

First, I'll check if there's any coherent message hidden in the jumble. The first part "celel ele eT ee Be a a" could be a typo. Maybe they meant "Check this out, here's a message"? Not sure. The second line "KEE Arocatrese meow!" has "KEE" which might be a typo for "Keep" but not sure. "Arocatrese" doesn't seem like a real word. "Meow!" is clearly the sound a cat makes, so maybe they're referring to a cat-related joke or meme.

The user might be sending some kind of encrypted message or a test to see if the AI can handle nonsense. Alternatively, they could be using a cipher or random le

  with DDGS() as ddgs:


Combined Evidence:
 
DuckDuckGo:
• THINK Definition & Meaning - Merriam-Webster: The meaning of THINK is to form or have in the mind. How to use think in a sentence. Synonym Discussion of Think. (Source: https://www.merriam-webster.com/dictionary/think)
• THINK | English meaning - Cambridge Dictionary: THINK definition: 1. to believe something or have an opinion or idea: 2. to have a low opinion of someone or…. Learn more. (Source: https://dictionary.cambridge.org/dictionary/english/think)
• Think - definition of think by The Free Dictionary: 1. To have or formulate in the mind: Think the happiest thought you can think. 2. a. To reason about or reflect on; ponder: Think how complex language is. Think the matter through. b. To … (Source: https://www.thefreedictionary.com/think)
• THINK definition and meaning | Collins English Dictionary: If you say that you think that something is true or will happen, you mean that you have the impression that it is true or will happen, although you are

In [None]:
def fact_check_audio(file_path):
  """Fact-check the transcription stored in the UTF-8 text file ``file_path``."""
  # Load the whole transcript into memory
  with open(file_path, mode="r", encoding="utf-8") as transcript_file:
    transcript_text = transcript_file.read()

  # Hand the transcript to the fact-checking pipeline
  fact_check_pipeline(transcript_text)

# Test with audio: fact-check the transcription file written by the transcription step
file_path = "transcriptions/news.txt"
fact_check_audio(file_path)


 Summarizing input and generating search prompt...

Search Prompt:
 <think>
Okay, let's break down the user's query. They provided a detailed text about coffee's history, impact, and effects. My task is to summarize this and rephrase into a search prompt for fact-checking.

First, I need to identify the main claims. The user mentions several points: coffee's history starting in Ethiopia, the legend about a goat herder in the 9th century, its role in the Enlightenment via European coffee houses, the spread of coffee houses to Europe in the 15th century, the connection between coffee and the slave trade, the economic and social impact leading to capitalism, caffeine's effects on the body, and health benefits like reduced risk for diseases.

Now, I need to check which of these claims are accurate and which might need verification. The Ethiopian origin of coffee is well-established, but the goat herder story is a common legend. The association with the Enlightenment might be an overstatem

  with DDGS() as ddgs:


Combined Evidence:
 
DuckDuckGo:
• THINK Definition & Meaning - Merriam-Webster: The meaning of THINK is to form or have in the mind. How to use think in a sentence. Synonym Discussion of Think. (Source: https://www.merriam-webster.com/dictionary/think)
• THINK | English meaning - Cambridge Dictionary: THINK definition: 1. to believe something or have an opinion or idea: 2. to have a low opinion of someone or…. Learn more. (Source: https://dictionary.cambridge.org/dictionary/english/think)
• Think - definition of think by The Free Dictionary: 1. To have or formulate in the mind: Think the happiest thought you can think. 2. a. To reason about or reflect on; ponder: Think how complex language is. Think the matter through. b. To … (Source: https://www.thefreedictionary.com/think)
• THINK definition and meaning | Collins English Dictionary: If you say that you think that something is true or will happen, you mean that you have the impression that it is true or will happen, although you are