In [2]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4


In [3]:
import asyncio
import json
import os
import urllib
from datetime import datetime
from typing import Dict, List, Any, Optional
import requests
import re
from bs4 import BeautifulSoup
from gtts import gTTS
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# ============================ URL VALIDITY EVALUATION ============================

# Bias scores keyed by lower-cased classifier label.
# The cardiffnlp/twitter-roberta-base-sentiment checkpoint reports generic ids
# (LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive per its model card),
# not "POSITIVE"/"NEUTRAL". The readable names are accepted as well so that a
# checkpoint exposing them (e.g. a "-latest" variant) keeps working.
_SENTIMENT_BIAS_SCORES = {
    "label_2": 100,
    "positive": 100,
    "label_1": 50,
    "neutral": 50,
    "label_0": 30,
    "negative": 30,
}


def _bias_score_from_sentiment(label: str) -> int:
    """Translate a sentiment label into the 0-100 bias score (unknown labels -> 30)."""
    return _SENTIMENT_BIAS_SCORES.get(str(label).lower(), 30)


def evaluate_url_authenticity(search_query: str, webpage_url: str) -> dict:
    """
    Assesses the credibility of a given URL by combining domain reputation, content
    alignment, factual accuracy, potential bias, and citation frequency.

    Args:
        search_query (str): The user's original search input.
        webpage_url (str): The URL being assessed.

    Returns:
        dict: On success, six entries ("Domain Reliability Score", "Content Alignment
        Score", "Factual Accuracy Score", "Bias Tendency Score", "Citation Reliability
        Score", "Overall Credibility Score"), each on a 0-100 scale.
        On failure (page unreachable, HTTP error, or no paragraph text), a dict with a
        single "error" key describing the problem.
    """

    # === Step 1: Retrieve Webpage Content ===
    try:
        response = requests.get(webpage_url, timeout=10)
        response.raise_for_status()
        parsed_html = BeautifulSoup(response.text, "html.parser")
        extracted_text = " ".join(p.text for p in parsed_html.find_all("p"))
    except Exception as error:
        return {"error": f"Unable to retrieve content: {str(error)}"}

    # A page without any <p> text gives the models nothing to score; report it
    # instead of returning numbers computed from an empty string.
    if not extracted_text.strip():
        return {"error": "Unable to retrieve content: no paragraph text found on the page"}

    # === Step 2: Assess Domain Reputation (Placeholder for Moz API) ===
    domain_reliability_score = 60  # Placeholder value on a scale of 0-100

    # === Step 3: Measure Content Alignment (Semantic Similarity) ===
    # NOTE: the model is loaded on every call; hoist it if this is called in a loop.
    transformer_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    similarity = util.pytorch_cos_sim(
        transformer_model.encode(search_query),
        transformer_model.encode(extracted_text),
    ).item()
    # Cosine similarity lies in [-1, 1]; clamp so the score stays on the 0-100 scale.
    relevance_score = max(0.0, min(similarity, 1.0)) * 100

    # === Step 4: Validate Facts (Google Fact Check API) ===
    factual_accuracy_score = verify_claims(extracted_text)

    # === Step 5: Detect Potential Bias (Sentiment Analysis) ===
    sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
    sentiment_result = sentiment_analyzer(extracted_text[:512])[0]  # Process the first 512 characters only
    bias_tendency_score = _bias_score_from_sentiment(sentiment_result["label"])

    # === Step 6: Evaluate Citation Frequency (Google Scholar via SerpAPI) ===
    reference_count = fetch_citation_count(webpage_url)
    citation_reliability_score = min(reference_count * 10, 100)  # Normalize within 0-100 range

    # === Step 7: Compute Overall Credibility Score ===
    # Weights sum to 1.0, so the aggregate is also on a 0-100 scale.
    aggregate_credibility_score = (
        (0.3 * domain_reliability_score) +
        (0.3 * relevance_score) +
        (0.2 * factual_accuracy_score) +
        (0.1 * bias_tendency_score) +
        (0.1 * citation_reliability_score)
    )

    return {
        "Domain Reliability Score": domain_reliability_score,
        "Content Alignment Score": relevance_score,
        "Factual Accuracy Score": factual_accuracy_score,
        "Bias Tendency Score": bias_tendency_score,
        "Citation Reliability Score": citation_reliability_score,
        "Overall Credibility Score": aggregate_credibility_score
    }

# === Helper Function: Validate Claims via Google Fact Check API ===

def verify_claims(content_text: str) -> int:
    """
    Cross-references the extracted text with the Google Fact Check API.

    Only the first 200 characters of ``content_text`` are sent as the query.

    Args:
        content_text (str): Text extracted from the page under review.

    Returns:
        int: 80 if the response lists at least one claim, 40 if it lists none,
        50 if the lookup itself failed (network error, HTTP error status, or a
        body that is not JSON), i.e. the result is unknown.
    """
    # NOTE(review): the documented Fact Check Tools endpoint appears to be
    # https://factchecktools.googleapis.com/v1alpha1/claims:search and to require
    # an API key — confirm whether this toolbox URL still answers with JSON.
    fact_check_api_url = "https://toolbox.google.com/factcheck/api/v1/claimsearch"
    try:
        response = requests.get(
            fact_check_api_url,
            # Passing the text via params lets requests URL-encode it; page text
            # routinely contains '&', '#', '?' and newlines that would otherwise
            # corrupt a hand-built query string.
            params={"query": content_text[:200]},
            timeout=10,  # never let a stalled request hang the whole evaluation
        )
        response.raise_for_status()  # an HTTP error is "unknown", not "unverified"
        data = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error does not derive from RequestException.
        return 50  # Default score indicating uncertainty

    if isinstance(data, dict) and data.get("claims"):
        return 80  # Indicates content is found in fact-check databases
    return 40  # No verification found

# === Helper Function: Retrieve Citation Count via Google Scholar API ===

def fetch_citation_count(webpage_url: str) -> int:
    """
    Queries Google Scholar via SerpAPI using the page URL as the search term.

    The API key is read from the ``SERPAPI_API_KEY`` environment variable so that
    no credential is stored in the notebook.

    Args:
        webpage_url (str): The URL to look up.

    Returns:
        int: The number of entries in the response's "organic_results" list.
        This is the count of results returned for the query, used here as a
        proxy for citations. Returns 0 when no key is configured, the request
        fails, or the response has no usable result list.
    """
    serpapi_key = os.environ.get("SERPAPI_API_KEY", "").strip()
    if not serpapi_key:
        # Without a key the service can only answer with an error payload,
        # which always scored 0 — skip the network round-trip.
        return 0

    search_params = {"q": webpage_url, "engine": "google_scholar", "api_key": serpapi_key}
    try:
        response = requests.get("https://serpapi.com/search", params=search_params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        return 0  # Default assumption that no citations exist

    if not isinstance(data, dict):
        return 0
    organic_results = data.get("organic_results")
    # Error payloads omit the key (or may set it to null); treat both as "none found".
    return len(organic_results) if isinstance(organic_results, list) else 0

In [6]:
# Example inputs: the question a user asked, and a page to be judged as a source for it.
user_prompt = "I have recently recovered from the flu, is it safe for me to visit my newborn niece?"
url_to_check = "https://www.bhtp.com/blog/when-safe-to-travel-with-newborn/"

# Score the page against the prompt. The call returns a dict of per-metric scores plus
# an "Overall Credibility Score", or a dict with a single "error" key if the page
# could not be retrieved.
result = evaluate_url_authenticity(user_prompt, url_to_check)
print(result)

Device set to use cpu


{'Domain Reliability Score': 60, 'Content Alignment Score': 45.83556056022644, 'Factual Accuracy Score': 50, 'Bias Tendency Score': 30, 'Citation Reliability Score': 0, 'Overall Credibility Score': 44.75066816806793}
