In [7]:
from googleapiclient.discovery import build
from newspaper import Article
import nltk
import json
import os
import time
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Fancrafter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# Config
API_KEY = 'AIzaSyAbvEPFQpEcsiSc6baCEHBrc53rNVX9h6Q'
CSE_ID = '05360ec743a4b4090'

In [48]:
FACT = "Is it true the rumor that all Social Security beneficiaries will have to go to a location to verify their identity to receive their monthly check"

def google_search(query, api_key, cse_id, num=12):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=query, cx=cse_id, num=num).execute()
    return res.get('items', [])

def summarize_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        return article.summary
    except Exception as e:
        return None  # Mark invalid

def filter_top_articles(articles, top_k=8):
    # Score and select the best
    scored = []
    for a in articles:
        summary = a.get("summary", "")
        if summary and len(summary.strip()) >= 200:
            scored.append((len(summary.strip()), a))
    # Sort by length of summary (basic quality metric)
    print(scored)
    # scored.sort(reverse=True)
    return [a for _, a in scored[:top_k]]

def fact_checker_and_save(fact, json_filename="fact_results.json"):
    print(f"🔍 Searching: {fact}")
    raw_articles = google_search(fact, API_KEY, CSE_ID, num=10)

    enriched = []
    for result in raw_articles:
        url = result.get("link")
        title = result.get("title")
        snippet = result.get("snippet")
        summary = summarize_article(url)

        if summary:
            enriched.append({
                "title": title,
                "url": url,
                "snippet": snippet,
                "summary": summary
            })

        time.sleep(1)  # politeness delay

    best_articles = filter_top_articles(enriched, top_k=8)

    result = {
        "fact": fact,
        "articles": best_articles
    }

    # Save to JSON
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved top {len(best_articles)} articles to {json_filename}")

# Run it
fact_checker_and_save(FACT)

🔍 Searching: Is it true the rumor that all Social Security beneficiaries will have to go to a location to verify their identity to receive their monthly check
[(877, {'title': 'What Are the ID Verification Changes for Social Security ...', 'url': 'https://www.factcheck.org/2025/04/what-are-the-id-verification-changes-for-social-security/', 'snippet': '4 days ago ... ... true the rumor that all Social Security beneficiaries will have to go to a location to verify their identity to receive their monthly check?', 'summary': 'Q: Is it true the rumor that all Social Security beneficiaries will have to go to a location to verify their identity to receive their monthly check?\nFull AnswerUpdate, April 11: Multiple news organizations have reported that the Social Security Administration has canceled its plan to end identity verification over the phone for those applying for retirement and survivor benefits.\n“Eliminating most applications by phone will close off an important mode of service fo

In [49]:
from dotenv import load_dotenv
import os
import json
from google import genai
from google.genai import types
import re

# Load environment variables from .env file
load_dotenv()

# Load your JSON file (Fact + Articles)
def load_fact_json(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Prepare structured prompt input based on fact + summaries
def build_input_from_json(fact_data):
    fact = fact_data["fact"]
    articles = fact_data["articles"]

    input_str = f"**Factual Claim:**\n{fact}\n\n"
    input_str += "**Relevant Articles:**\n"

    for i, article in enumerate(articles, 1):
        input_str += f"\nArticle {i}:\n"
        input_str += f"- **Title:** {article['title']}\n"
        input_str += f"- **URL:** {article['url']}\n"
        input_str += f"- **Snippet:** {article['snippet']}\n"
        input_str += f"- **Summary:** {article['summary']}\n"

    return input_str

# Run Gemini API with the filled-in prompt
def run_fact_check(prompt_text, fact_text, full_output_file="full_output.json", parsed_output_file="parsed_output.json"):
    client = genai.Client(
        api_key=API_KEY,
    )

    model = "gemini-2.0-flash"

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        response_mime_type="text/plain",
        system_instruction=[
            types.Part.from_text(text="""You are an automated fact-verification assistant. You will be given a structured input that contains:

- A factual claim at the top
- A list of summarized news articles that are relevant to that claim. Each article includes:
  - Title
  - URL
  - A short snippet
  - A full summary of the article's content

Your task is to analyze the provided article summaries and assess how well they support or contradict the factual claim.

You must provide the following in your response:

1. **Verdict** – Choose only **"True"** or **"False"** based on your evaluation of the evidence.

2. **Collective Summary** – A short synthesis of what the articles collectively say. Focus on:
   - Overall agreement or disagreement with the fact
   - Whether the evidence is strong, partial, mixed, or weak
   - Any outliers or conflicting perspectives

3. **Reasoning** – Provide justification for your verdict in 2–4 sentences. Refer directly to article patterns (e.g., "5 out of 7 articles support the claim that CO2 emissions are a leading cause of accelerated climate change").

4. **Sources Summary** – Bullet-point list of all article titles with a one-line comment on how each relates to the fact (e.g., supports, contradicts, or provides background).

Formatting Rules:
- Start your output with: **Verdict: True** or **Verdict: False**
- Be objective and analytical — do not speculate
- Use markdown for readability
- Keep your total response under 200 words unless otherwise instructed
- Do not perform live search or external lookups — rely only on the provided content

Your role is to emulate a professional fact-checking analyst using summarized content from multiple sources to reach a binary decision.
""")
        ],
    )

    print("🔍 Submitting to Gemini...\n")

    full_response = ""
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        print(chunk.text, end="")
        full_response += chunk.text

    # Save the full markdown-style output
    with open(full_output_file, "w", encoding="utf-8") as f:
        json.dump({
            "fact": fact_text,
            "gemini_output": full_response.strip()
        }, f, ensure_ascii=False, indent=2)

    # Parse response into parts using regular expressions
    parsed = {
        "fact": fact_text,
        "verdict": extract_section("Verdict", full_response),
        "collective_summary": extract_section("Collective Summary", full_response),
        "reasoning": extract_section("Reasoning", full_response),
        "sources_summary": extract_bullet_list("Sources Summary", full_response)
    }

    # Save parsed version
    with open(parsed_output_file, "w", encoding="utf-8") as f:
        json.dump(parsed, f, ensure_ascii=False, indent=2)

    print(f"\n\n✅ Output saved to {full_output_file} and {parsed_output_file}")

def extract_section(header, text):
    pattern = rf"\*\*{header}:\*\*\s*(.*?)(?=\n\*\*|$)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

def extract_bullet_list(header, text):
    section = extract_section(header, text)
    if section:
        bullets = re.findall(r"[*\-]\s+(.*)", section)
        return [b.strip() for b in bullets]
    return []


In [50]:
# Driver function
def verify_fact_from_file(json_path):
    data = load_fact_json(json_path)
    fact = data["fact"]
    filled_prompt = build_input_from_json(data)
    run_fact_check(filled_prompt, fact)

# Example usage
if __name__ == "__main__":
    # Replace with your actual path to the JSON file
    verify_fact_from_file("fact_results.json")

🔍 Submitting to Gemini...

**Verdict: False**

**Collective Summary:**
The articles indicate that the Social Security Administration (SSA) considered changes to identity verification processes, including potentially limiting phone verification and requiring online or in-person verification for certain actions like changing bank account information. However, the SSA has since canceled the plan to end phone verification for those applying for retirement and survivor benefits. The changes were aimed at reducing fraud.

**Reasoning:**
Article 1 directly addresses the claim and states that the rumor about all beneficiaries needing to verify in person is false, as the SSA canceled the plan. Articles 4 and 8 mention changes to verification processes, but these are related to specific actions like changing bank information or applying for benefits, not a blanket requirement for all beneficiaries to verify in person to receive their monthly checks.

**Sources Summary:**
*   **What Are the ID Ve

In [None]:
from google.genai import types

In [3]:
import google
print(google.__file__)

None


In [51]:
import json

file_path = "data/politifact_factcheck_data.json"

def print_sample_lines(filepath, max_lines=10):
    print(f"\n🔍 Reading up to {max_lines} lines from: {filepath}\n")
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                try:
                    parsed = json.loads(line.strip())
                    print(f"[{i}] statement: {parsed.get('statement')}")
                except json.JSONDecodeError as e:
                    print(f"[{i}] ❌ JSON error: {e}")
                    print(f"Raw line: {line}")
    except FileNotFoundError:
        print(f"❌ File not found: {filepath}")

if __name__ == "__main__":
    print_sample_lines(file_path)


🔍 Reading up to 10 lines from: data/politifact_factcheck_data.json

[0] statement: John McCain opposed bankruptcy protections for families "who were only in bankruptcy because of medical expenses they couldn't pay."
[1] statement: "Bennie Thompson actively cheer-led riots in the ’90s."
[2] statement: Says Maggie Hassan was "out of state on 30 days over the last three months."
[3] statement: "BUSTED: CDC Inflated COVID Numbers, Accused of Violating Federal Law"
[4] statement: "I'm the only (Republican) candidate that has actually reduced the size of government."
[5] statement: "There are actually only 30 countries that practice birthright citizenship."
[6] statement: "My husband and I have never gotten a penny of money from the farm."
[7] statement: "If you go strictly by the numbers, crime is down across the board. Last year we had a 10 percent decrease in the most serious crimes."
[8] statement: "The American people say, don't touch Social Security, don't touch Medicare, don't cut de