In [None]:
import requests
import pandas as pd
import json
from datetime import datetime, timedelta

In [None]:
def fetch_fact_checks(query=None, max_results=100, api_key=None, timeout=30):
    """
    Fetch fact checks from the Google Fact Check Tools ``claims:search`` API.

    Args:
        query: Free-text search term. The endpoint rejects requests that have
            neither a query nor a filter (HTTP 400 "Invalid request, must have
            either query or filter."), so an empty query returns ``None``
            without touching the network.
        max_results: Value sent as ``pageSize`` (only one page is fetched).
        api_key: API key to use. Defaults to the ``GOOGLE_FACT_CHECK_API_KEY``
            environment variable, then to the ``"<API KEY>"`` placeholder.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response as a dict, or ``None`` on any failure
        (missing query, network error, non-200 status, undecodable body).
    """
    import os

    if not query:
        # Avoid a round trip that the API is guaranteed to reject.
        print("Error fetching data: a query is required "
              "(the API rejects requests without a query or filter)")
        return None

    base_url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

    if api_key is None:
        # Replace the placeholder or export GOOGLE_FACT_CHECK_API_KEY.
        api_key = os.environ.get("GOOGLE_FACT_CHECK_API_KEY", "<API KEY>")

    params = {
        "key": api_key,
        "maxAgeDays": 30,  # Get fact checks from the last 30 days
        "pageSize": max_results,
        "languageCode": "en",
        "query": query,
    }

    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching data: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        print(response.text)
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 with a non-JSON body (e.g. a proxy error page).
        print(f"Error decoding response: {exc}")
        return None

def simplify_rating(rating_text):
    """
    Collapse a fact-checker's free-text rating into "True", "False" or "Mixed".

    Matching is by case-insensitive substring, checked in this order:

    1. Hedged / partial ratings ("half true", "partly false", "unverified",
       "mixture", ...) -> "Mixed". These are tested first because they
       contain the plain true/false keywords as substrings.
    2. Negated truth ("not true", "not accurate", ...) -> "False".
    3. Any false keyword -> "False". False is tested before true because
       "untrue", "incorrect" and "inaccurate" contain true keywords.
    4. Any true keyword -> "True".
    5. Anything else (including empty or ``None`` input) -> "Mixed".

    Args:
        rating_text: The ``textualRating`` string from a claim review.

    Returns:
        One of the strings "True", "False" or "Mixed".
    """
    if not rating_text:
        return "Mixed"

    rating_lower = str(rating_text).lower()

    # Qualifiers that mean the verdict is neither cleanly true nor false.
    mixed_markers = (
        "half", "partly", "partially", "mixed", "mixture",
        "unverified", "unproven",
    )
    if any(marker in rating_lower for marker in mixed_markers):
        return "Mixed"

    # Negated forms of the true keywords would otherwise match as "True".
    negated_true = ("not true", "not correct", "not accurate")
    if any(phrase in rating_lower for phrase in negated_true):
        return "False"

    # Common false ratings
    false_ratings = (
        "false", "mostly false", "incorrect", "inaccurate", "misleading",
        "pants on fire", "fake", "fiction", "hoax", "conspiracy", "untrue",
    )
    if any(term in rating_lower for term in false_ratings):
        return "False"

    # Common true ratings
    true_ratings = (
        "true", "mostly true", "correct", "accurate", "fact", "verified",
    )
    if any(term in rating_lower for term in true_ratings):
        return "True"

    # Unrecognised or unclear ratings
    return "Mixed"

def create_fact_check_dataset(queries=None):
    """
    Create a dataset of fact-checked headlines with True/False labels.

    For every query, one page of claims is fetched with
    ``fetch_fact_checks``. Claims are de-duplicated on their text (first
    occurrence wins), labelled from the first review's ``textualRating`` via
    ``simplify_rating``, and claims rated "Mixed" are dropped. The result is
    written to ``fact_check_dataset_<YYYYmmdd_HHMMSS>.csv`` in the current
    working directory and summary counts are printed.

    Args:
        queries: Iterable of search terms. Defaults to
            ``["politics", "health", "economy", "climate", "technology"]``.

    Returns:
        pandas.DataFrame with columns ``headline``, ``original_rating``,
        ``rating``, ``fact_checker``, ``review_url`` and ``review_date``.
        The columns are present even when no claim survives filtering.
    """
    if queries is None:
        queries = ["politics", "health", "economy", "climate", "technology"]

    all_claims = []

    for query in queries:
        print(f"Fetching fact checks for query: {query}")
        results = fetch_fact_checks(query)

        if results and "claims" in results:
            all_claims.extend(results["claims"])

    # NOTE: there is deliberately no query-less "recent fact checks" request.
    # The API answers it with HTTP 400 ("must have either query or filter"),
    # so it never contributed any claims.

    columns = [
        "headline", "original_rating", "rating",
        "fact_checker", "review_url", "review_date",
    ]

    data = []
    seen_claims = set()

    for claim in all_claims:
        headline = claim.get("text")

        # Skip claims without text and duplicates (first occurrence wins,
        # even if that occurrence is later dropped for lacking a review).
        if not headline or headline in seen_claims:
            continue
        seen_claims.add(headline)

        # Skip if no review or rating
        reviews = claim.get("claimReview")
        if not reviews:
            continue

        review = reviews[0]  # Take the first review
        rating_text = review.get("textualRating")
        if not rating_text:
            continue

        simplified_rating = simplify_rating(rating_text)

        # Skip mixed ratings for a clean True/False dataset
        if simplified_rating == "Mixed":
            continue

        data.append({
            "headline": headline,
            "original_rating": rating_text,
            "rating": simplified_rating,
            "fact_checker": review.get("publisher", {}).get("name", "Unknown"),
            "review_url": review.get("url", ""),
            "review_date": review.get("reviewDate", ""),
        })

    # Passing the columns explicitly keeps df["rating"] valid for empty data.
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"fact_check_dataset_{timestamp}.csv"
    df.to_csv(filename, index=False)

    print(f"Dataset created with {len(df)} entries and saved as {filename}")
    print(f"True claims: {int((df['rating'] == 'True').sum())}")
    print(f"False claims: {int((df['rating'] == 'False').sum())}")

    return df

if __name__ == "__main__":
    # Search terms; each one becomes a separate API query.
    topics = [
        "politics",
        "election",
        "economy",
        "climate",
        "health",
        "immigration",
        "war",
        "military",
        "technology",
        "social media",
    ]

    # Fetch, label and save the fact checks for all topics.
    dataset = create_fact_check_dataset(topics)

    # Preview the first ten labelled headlines.
    print("\nSample of the dataset:")
    preview_columns = ["headline", "rating"]
    print(dataset[preview_columns].head(10))


Fetching fact checks for query: politics
Fetching fact checks for query: election
Fetching fact checks for query: economy
Fetching fact checks for query: climate
Fetching fact checks for query: health
Fetching fact checks for query: immigration
Fetching fact checks for query: war
Fetching fact checks for query: military
Fetching fact checks for query: technology
Fetching fact checks for query: social media
Error fetching data: 400
{
  "error": {
    "code": 400,
    "message": "Invalid request, must have either query or filter.",
    "status": "INVALID_ARGUMENT"
  }
}

Dataset created with 92 entries and saved as fact_check_dataset_20250403_165317.csv
True claims: 12
False claims: 80

Sample of the dataset:
                                            headline rating
0  BSP Chief Mayawati saying that she will retire...  False
1  Viral video shows IPS officer Shailjakant Mish...  False
2  "Pro-US/Trump political party won Greenland's ...  False
3  Banksy, the epic UK-based artist and pol