### **Install Dependencies**

In [None]:
!pip install -q requests beautifulsoup4 textblob nltk transformers torch gradio pandas


### **Import Libraries**

In [None]:
import os, re, json, requests
from datetime import datetime
from bs4 import BeautifulSoup
from textblob import TextBlob
from transformers import pipeline
import nltk

try:
    from google.colab import userdata
except Exception:
    userdata = None

# Fetch the NLTK resources this notebook relies on. nltk.download() reports a
# failed download by returning False instead of raising, so the return value is
# checked in addition to catching unexpected exceptions.
for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger", "wordnet", "stopwords"]:
    try:
        if not nltk.download(pkg, quiet=True):
            print(f"Note: could not download {pkg}: download reported failure")
    except Exception as e:
        print(f"Note: could not download {pkg}: {e}")

print("Imports ready and NLTK data prepared")


Imports ready and NLTK data prepared


### **Secret Key Management**

In [None]:
def get_api_key(key_name):
    """Look up a secret by name.

    The Colab ``userdata`` store is tried first (when it was importable); if it
    is unavailable, fails, or holds no value, an environment variable with the
    same name is used instead so the notebook also works outside Colab.

    Args:
        key_name: Name of the secret / environment variable.

    Returns:
        The secret value, or None when no non-empty value is configured.
    """
    import os  # local import so this cell does not depend on the imports cell

    try:
        if userdata is not None:
            value = userdata.get(key_name)
            if value:
                return value
    except Exception:
        # Best effort: any failure reading the Colab store must not stop the
        # notebook; fall through to the environment lookup below.
        pass
    return os.environ.get(key_name) or None

SERPER_API_KEY = get_api_key('SERPER_API_KEY')


### **All Classes (ExpertSystem, AIAnalyzer, WebVerifier, HoaxDetector)**

In [None]:
import re, requests
from collections import Counter

class ExpertSystem:
    """Rule-based credibility scorer for a headline or short passage.

    Scoring starts at 90 and subtracts fixed penalties for known myths,
    clickbait / extreme / emotional vocabulary, percentage claims, rumor
    wording, missing sourcing cues, and heavy use of "!" or capital letters.
    """

    def __init__(self):
        """Build the word lists and myth patterns used by ``analyze``.

        The clickbait vocabulary is the 300 most frequent alphabetic words
        (longer than three characters) of an online headline list. When the
        download fails, returns an HTTP error status, or yields no usable
        words, a small built-in list is used instead.
        """
        # 1. Try to fetch clickbait word list from an online gist
        try:
            url = "https://gist.githubusercontent.com/amitness/0a2ddbcb61c34eab04bad5a17fd8c86b/raw/clickbait.csv"
            response = requests.get(url, timeout=10)
            # Without this check an error page (e.g. a 404 body) would be
            # parsed as if it were the word list and the fallback never used.
            response.raise_for_status()
            words = [
                w for line in response.text.splitlines()
                for w in line.lower().split()
                if w.isalpha() and len(w) > 3
            ]
            top_words = [w for w, _ in Counter(words).most_common(300)]
            if not top_words:
                raise ValueError("downloaded clickbait list contained no usable words")
            self.clickbait_words = top_words
            print(f"Loaded {len(self.clickbait_words)} online clickbait words")
        except Exception as e:
            # Fallback set if online fetch fails
            self.clickbait_words = [
                "shocking","unbelievable","you won't believe","secret","exposed",
                "revealed","doctors hate","this one trick","breaking","urgent","alert",
                "must see","incredible","miracle","cure","breakthrough","proven",
                "scientists claim","researchers say","study finds","new study",
                "amazing discovery","medical wonder"
            ]
            print(f"Using fallback clickbait list ({len(self.clickbait_words)} words). Error: {e}")

        # 2. Define secondary heuristic lists
        self.extreme_words = [
            "always","never","all","none","everyone","nobody","everywhere","nowhere",
            "completely","totally","absolutely","entirely","guaranteed","100%","undeniable"
        ]

        self.emotional_words = [
            "outrage","shocking","devastating","horrific","amazing","incredible",
            "unbelievable","stunning","mind-blowing","breakthrough","miracle",
            "revolutionary","groundbreaking"
        ]

        # 3. Common myth claims
        self.consensus_myths = [
            r"\bsun\s+orbits\s+the\s+earth\b",
            r"\bearth\s+is\s+flat\b",
            r"\b5g\s+causes\s+covid\b",
            r"\bmoon\s+landing\s+(is|was)\s+(fake|a\s+hoax)\b",
        ]

    @staticmethod
    def _count_terms(terms, lowered_text):
        """Count how many of ``terms`` occur in ``lowered_text`` as whole words/phrases.

        Lookarounds are used instead of ``\\b`` so that terms containing
        non-word characters ("100%", "mind-blowing") are still delimited
        correctly, and so that "all" does not match inside "called".
        """
        return sum(
            1 for term in terms
            if re.search(r"(?<!\w)" + re.escape(term) + r"(?!\w)", lowered_text)
        )

    def analyze(self, text):
        """Score ``text`` with the heuristic rules.

        Args:
            text: Headline or passage; None is treated as an empty string.

        Returns:
            A ``(score, flags)`` tuple: ``score`` is an int clamped to 0..100
            and ``flags`` is a list of human-readable reasons for each penalty.
            The "(N instances)" figures count distinct listed terms that are
            present, not total occurrences.
        """
        raw = text or ""          # None-safe original text (case preserved)
        t = raw.strip()
        tl = t.lower()
        score = 90
        flags = []

        # Consensus myth detection
        for pat in self.consensus_myths:
            if re.search(pat, tl):
                score -= 40
                flags.append("Contradicts established scientific consensus")
                break

        # Clickbait / sensational language
        c_count = self._count_terms(self.clickbait_words, tl)
        if c_count > 2:
            score -= 20; flags.append(f"High clickbait language ({c_count} instances)")
        elif c_count > 0:
            score -= 10; flags.append(f"Some clickbait or sensational terms ({c_count} instances)")

        # Extreme / absolute language
        x_count = self._count_terms(self.extreme_words, tl)
        if x_count > 5:
            score -= 15; flags.append(f"Excessive extreme/absolute terms ({x_count} instances)")
        elif x_count > 0:
            score -= 5; flags.append(f"Some extreme or absolute terms ({x_count} instances)")

        # Emotional tone
        e_count = self._count_terms(self.emotional_words, tl)
        if e_count > 3:
            score -= 15; flags.append(f"Highly emotional tone ({e_count} instances)")
        elif e_count > 0:
            score -= 5; flags.append(f"Some emotional tone ({e_count} instances)")

        # Suspicious numeric percentages. No trailing \b: "%" followed by a
        # space or end of text is not a word boundary, so "45% of" would
        # otherwise never match.
        if re.search(r"\b\d{2,}%", tl):
            score -= 10; flags.append("Suspicious numeric claim (percentage)")

        # Hedging / rumor wording
        if re.search(r"\b(reportedly|rumor|anonymous source|insider|leaked)\b", tl):
            score -= 5; flags.append("Hedging/rumor wording without clear attribution")

        # Lack of sourcing
        if not re.search(r"(according to|source|study|research|report|journal|paper|data|doi:|https?://)", tl):
            score -= 10; flags.append("No explicit sources or citations found")

        # Exclamation / capitalization penalties (measured on the unstripped,
        # case-preserved input)
        if raw.count("!") > 3:
            score -= 10; flags.append("Excessive exclamation marks")
        caps_ratio = sum(1 for c in raw if c.isupper()) / max(1, len(raw))
        if caps_ratio > 0.15:
            score -= 10; flags.append("Excessive capitalization")

        return max(0, min(100, score)), flags


class AIAnalyzer:
    """Sentiment and writing-quality analysis.

    Sentiment comes from a Hugging Face pipeline when it could be loaded and
    from TextBlob polarity otherwise; writing quality always uses TextBlob.
    """

    def __init__(self):
        """Try to load the sentiment pipeline; keep None if loading fails."""
        print("Loading AI models...")
        try:
            loaded = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english"
            )
        except Exception as e:
            print(f"Could not load sentiment analyzer: {e}")
            loaded = None
        else:
            print("Sentiment analyzer loaded")
        self.sentiment_analyzer = loaded

    def analyze_sentiment(self, text):
        """Return ``{"label", "score"}`` for the first 512 characters of ``text``.

        Empty or None input yields a NEUTRAL result with score 0.5. With a
        loaded pipeline its first prediction is returned as-is; otherwise the
        TextBlob polarity is mapped to POSITIVE / NEGATIVE / NEUTRAL.
        """
        snippet = (text or "")[:512]
        if not snippet:
            return {"label": "NEUTRAL", "score": 0.5}

        if self.sentiment_analyzer is not None:
            predictions = self.sentiment_analyzer(snippet)
            return predictions[0]

        # Fallback path: derive a label from TextBlob polarity.
        polarity = TextBlob(snippet).sentiment.polarity
        if polarity > 0.1:
            return {"label": "POSITIVE", "score": (polarity + 1) / 2}
        if polarity < -0.1:
            return {"label": "NEGATIVE", "score": (1 - polarity) / 2}
        return {"label": "NEUTRAL", "score": 0.5}

    def analyze_writing_quality(self, text):
        """Compute average sentence length, a spelling score and subjectivity.

        Returns:
            dict with keys ``avg_sentence_length`` (words per sentence, 0 when
            there are no sentences), ``spelling_score`` (0..100, share of the
            first 250 words that TextBlob's ``correct()`` leaves unchanged) and
            ``subjectivity`` (TextBlob sentiment subjectivity).
        """
        blob = TextBlob(text or "")
        sentence_list = blob.sentences
        word_list = blob.words

        if sentence_list:
            avg_sentence_length = len(word_list) / max(1, len(sentence_list))
        else:
            avg_sentence_length = 0

        # Cap to 250 words to keep batch runs fast
        sample = word_list[:250]
        typo_count = 0
        for word in sample:
            if word.correct() != word:
                typo_count += 1

        if not sample:
            spelling_score = 100
        else:
            spelling_score = max(0, 100 - (typo_count / len(sample) * 100))

        report = {}
        report["avg_sentence_length"] = avg_sentence_length
        report["spelling_score"] = spelling_score
        report["subjectivity"] = blob.sentiment.subjectivity
        return report


class WebVerifier:
    """Looks a claim up through the Serper search API and scores the sources found."""

    # Hosts treated as reputable. A result counts only when its hostname is one
    # of these domains or a subdomain of one (e.g. "www.bbc.com").
    REPUTABLE_DOMAINS = (
        "reuters.com","apnews.com","bbc.com","cnn.com","nytimes.com",
        "washingtonpost.com","theguardian.com","npr.org","bloomberg.com",
        "forbes.com","wsj.com","aljazeera.com","scientificamerican.com",
        "nature.com","science.org"
    )

    def __init__(self, api_key):
        """Store the API key and the search endpoint URL."""
        self.api_key = api_key
        self.search_url = "https://google.serper.dev/search"

    def search_news(self, query):
        """POST ``query`` to the search endpoint.

        Returns:
            The decoded JSON response on HTTP 200; None when no API key is
            set, on any other status code, or on any request/decoding error
            (errors are printed, not raised).
        """
        if not self.api_key:
            return None
        headers = {'X-API-KEY': self.api_key, 'Content-Type': 'application/json'}
        payload = json.dumps({"q": query, "num": 5})
        try:
            r = requests.post(self.search_url, headers=headers, data=payload, timeout=10)
            if r.status_code == 200:
                return r.json()
            else:
                print(f"Search API error: {r.status_code}")
                return None
        except Exception as e:
            print(f"Search error: {e}")
            return None

    @classmethod
    def _is_reputable(cls, url):
        """Return True when the hostname of ``url`` belongs to a reputable domain.

        Matching is done on the parsed hostname rather than on the raw URL so
        that look-alike hosts ("fakecnn.com") or a domain that merely appears
        in the path/query ("?ref=bbc.com") are not counted.
        """
        from urllib.parse import urlparse

        if not url:
            return False
        # urlparse only fills in the host when a scheme or "//" is present.
        candidate = url if "://" in url else "//" + url
        try:
            host = (urlparse(candidate).hostname or "").lower()
        except ValueError:
            return False
        return any(host == d or host.endswith("." + d) for d in cls.REPUTABLE_DOMAINS)

    def verify_claim(self, news_text):
        """Search for the first sentence of ``news_text`` and summarise the hits.

        Returns:
            dict with ``sources_found``, ``reputable_sources``,
            ``credibility_boost`` (0..20) and ``sources`` (up to five entries
            with ``title``, ``url``, ``snippet``, ``is_reputable``). All counts
            are zero when the search returned nothing.
        """
        blob = TextBlob(news_text or "")
        query = str(blob.sentences[0]) if blob.sentences else (news_text or "")[:100]
        search_results = self.search_news(query)
        if not search_results:
            return {"sources_found": 0, "reputable_sources": 0, "credibility_boost": 0, "sources": []}

        organic = search_results.get("organic") or []

        sources, rep = [], 0
        for res in organic[:5]:
            # "or" also covers fields that are present but null in the response.
            link = res.get("link") or ""
            title = res.get("title") or ""
            snippet = res.get("snippet") or ""
            is_rep = self._is_reputable(link)
            rep += int(is_rep)
            sources.append({"title": title, "url": link, "snippet": snippet, "is_reputable": is_rep})

        # Boost tiers: reputable hits dominate; three or more hits of any kind
        # still earn a small boost.
        if rep >= 3:
            boost = 20
        elif rep >= 2:
            boost = 15
        elif rep >= 1:
            boost = 10
        elif len(sources) >= 3:
            boost = 5
        else:
            boost = 0
        return {"sources_found": len(sources), "reputable_sources": rep, "credibility_boost": boost, "sources": sources}


class HoaxDetector:
    """Combines the rule-based score, AI writing analysis and web verification into one verdict."""

    def __init__(self):
        """Create the three analysers; web verification is enabled only when an API key is set."""
        self.expert_system = ExpertSystem()
        self.ai_analyzer = AIAnalyzer()
        if SERPER_API_KEY:
            self.web_verifier = WebVerifier(SERPER_API_KEY)
        else:
            self.web_verifier = None
        self.strict_mode = True

    def analyze(self, news_text, quiet: bool = False):
        """Analyse ``news_text`` and return the combined assessment.

        Args:
            news_text: Text to assess; None is treated as an empty string.
            quiet: Accepted for call compatibility; not used in this method.

        Returns:
            dict with ``final_score`` (0..100), ``verdict``, ``expert_score``,
            ``expert_flags``, ``ai_analysis`` (sentiment and writing quality)
            and ``web_verification``.
        """
        content = news_text or ""
        lowered = content.lower()

        # Stage 1: heuristic rules
        expert_score, expert_flags = self.expert_system.analyze(content)

        # Stage 2: model / TextBlob analysis; each failed check costs 10 points
        sentiment = self.ai_analyzer.analyze_sentiment(content)
        writing_quality = self.ai_analyzer.analyze_writing_quality(content)

        failed_checks = (
            writing_quality.get("spelling_score", 100) < 70,
            writing_quality.get("subjectivity", 0) > 0.7,
        )
        ai_adj = -10 * sum(failed_checks)

        # Stage 3: web verification (or a placeholder explaining why it was skipped)
        if self.web_verifier:
            verification_result = self.web_verifier.verify_claim(content)
            web_adj = verification_result.get("credibility_boost", 0)
        else:
            web_adj = 0
            verification_result = {
                "sources_found": 0, "reputable_sources": 0, "credibility_boost": 0, "sources": [],
                "note": "Web verification unavailable (SERPER_API_KEY not configured)."
            }

        # Stage 4: combine and clamp to 0..100
        combined = expert_score + ai_adj + web_adj
        final_score = max(0, min(100, combined))

        # Strict mode: text that mentions a study/journal but could not be
        # backed by any search hit is capped at 60.
        cites_research = ("study" in lowered) or ("journal" in lowered)
        web_missing = self.web_verifier is None
        nothing_found = verification_result.get("sources_found", 0) == 0
        if self.strict_mode and cites_research and (web_missing or nothing_found):
            final_score = min(final_score, 60)

        # Map the score onto a verdict, highest threshold first.
        verdict = "LIKELY FAKE NEWS"
        for threshold, label in (
            (80, "LIKELY CREDIBLE"),
            (60, "QUESTIONABLE — VERIFY FURTHER"),
            (40, "LIKELY MISLEADING"),
        ):
            if final_score >= threshold:
                verdict = label
                break

        ai_analysis = {"sentiment": sentiment, "writing_quality": writing_quality}
        return {
            "final_score": final_score,
            "verdict": verdict,
            "expert_score": expert_score,
            "expert_flags": expert_flags,
            "ai_analysis": ai_analysis,
            "web_verification": verification_result
        }

# (Re)create the detector so the class definitions above take effect.
# The former `except NameError` branch repeated the exact same call, so it
# could only fail the same way again; construct the detector directly.
detector = HoaxDetector()
print("Detector ready")

Loaded 300 online clickbait words
Loading AI models...


Device set to use cuda:0


Sentiment analyzer loaded
Detector ready


### **Markdown For GUI**

In [None]:
def result_to_markdown(text, res):
    """Render one analysis result as a Markdown report for the GUI.

    Args:
        text: The analysed input; None is treated as empty. Only the first 800
            characters (after stripping) are echoed, followed by "…" if cut.
        res: Result dict with ``final_score``, ``verdict``, ``expert_score``,
            ``expert_flags``, ``ai_analysis`` (``sentiment`` and
            ``writing_quality``) and ``web_verification`` (may be None).

    Returns:
        The report as a Markdown string; at most the first three web sources
        are listed.
    """
    def bullet_list(items):
        if not items: return "None"
        return "\n".join([f"- {it}" for it in items])

    sent = res["ai_analysis"]["sentiment"]
    wq   = res["ai_analysis"]["writing_quality"]
    web  = res["web_verification"] or {}

    # Input preview (None-safe)
    stripped = (text or "").strip()
    preview = stripped[:800] + ('…' if len(stripped) > 800 else '')

    # Top sources list
    sources_md = "None"
    srcs = (web.get("sources") or [])[:3]
    if srcs:
        lines = []
        for i, s in enumerate(srcs, 1):
            badge = " (reputable)" if s.get("is_reputable") else ""
            # "or" instead of a .get() default: the fields may be present but
            # empty, and an empty title would render as an unclickable link.
            url = s.get("url") or ""
            title = s.get("title") or "(title unavailable)"
            snip = s.get("snippet") or ""
            lines.append(f"**{i}. [{title}]({url})**{badge}\n   {snip}")
        sources_md = "\n\n".join(lines)

    web_note = ""
    if web.get("note"):
        web_note = f"\n> ℹ{web['note']}"

    md = f"""### Input
{preview}

---

### Final Assessment
- **Final Credibility Score:** **{res['final_score']}/100**
- **Verdict:** {res['verdict']}

### Expert System
- **Expert Score:** {res['expert_score']}/100
- **Flags:**
{bullet_list(res['expert_flags'])}

### AI Analysis
- **Sentiment:** {sent.get('label','')} ({sent.get('score',0):.2f})
- **Avg Sentence Length:** {wq['avg_sentence_length']:.1f} words
- **Spelling Quality:** {wq['spelling_score']:.1f}/100
- **Subjectivity:** {wq['subjectivity']:.2f}

### Web Verification
- **Sources Found:** {web.get('sources_found',0)}
- **Reputable Sources:** {web.get('reputable_sources',0)}
- **Credibility Boost:** +{web.get('credibility_boost',0)}
{web_note}

**Top Sources:**
{sources_md}
"""
    return md


### **Gradio GUI**

In [None]:
import os
import gradio as gr
import pandas as pd
import tempfile

def _write_df_to_tempfile(df: pd.DataFrame) -> str:
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    df.to_csv(tmp.name, index=False, encoding="utf-8")
    return tmp.name

def _analyze_list(claims):
    rows = []
    for t in claims:
        try:
            res = detector.analyze(t, quiet=True)
            wq  = res["ai_analysis"]["writing_quality"]
            web = res["web_verification"] or {}
            rows.append({
                "text": t[:120] + ("…" if len(t) > 120 else ""),
                "final_score": res["final_score"],
                "verdict": res["verdict"],
                "expert_score": res["expert_score"],
                "subjectivity": round(wq.get("subjectivity", 0.0), 3),
                "spelling": round(wq.get("spelling_score", 0.0), 1),
                "reputable_sources": web.get("reputable_sources", 0),
            })
        except Exception as e:
            rows.append({
                "text": t[:120] + ("…" if len(t) > 120 else ""),
                "final_score": None,
                "verdict": f"Error: {e}",
                "expert_score": None,
                "subjectivity": None,
                "spelling": None,
                "reputable_sources": None,
            })
    df = pd.DataFrame(rows, columns=[
        "text","final_score","verdict","expert_score","subjectivity","spelling","reputable_sources"
    ])
    csv_path = _write_df_to_tempfile(df)
    return df, csv_path

def analyze_one(text):
    """Analyse a single piece of text and return the Markdown report.

    Empty, whitespace-only or missing input returns a short prompt instead of
    running the detector.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        outcome = detector.analyze(text, quiet=True)
        return result_to_markdown(text, outcome)
    return "Please paste a headline or paragraph."

def analyze_batch_multiline(big_text):
    """Analyse a text box holding one claim per line.

    Lines are stripped and blank lines skipped. With no claims left, an empty
    results table (and its CSV path) is returned.
    """
    claims = []
    for raw_line in (big_text or "").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            claims.append(cleaned)

    if claims:
        return _analyze_list(claims)

    empty = pd.DataFrame(columns=["text","final_score","verdict","expert_score","subjectivity","spelling","reputable_sources"])
    return empty, _write_df_to_tempfile(empty)

def analyze_batch_file(file_path: str):
    """Analyse the claims stored in an uploaded file.

    Supported inputs: ``.txt`` (one claim per line) and ``.csv`` / ``.xls`` /
    ``.xlsx`` / ``.ods`` (claims in the first column). Claims are stripped and
    blank entries skipped for every format.

    Args:
        file_path: Path of the uploaded file; empty/None yields an empty table.

    Returns:
        ``(df, csv_path)``. If the file cannot be read, ``df`` holds a single
        row whose ``text`` column is ``"Error reading file: ..."``.
    """
    result_columns = ["text","final_score","verdict","expert_score","subjectivity","spelling","reputable_sources"]

    def _empty_result():
        # Empty table with the standard columns plus its CSV copy.
        empty = pd.DataFrame(columns=result_columns)
        return empty, _write_df_to_tempfile(empty)

    def _first_column_claims(frame):
        # Non-null first-column cells as stripped, non-empty strings.
        cells = frame.iloc[:, 0].dropna().astype(str)
        return [cell.strip() for cell in cells if cell.strip()]

    if not file_path:
        return _empty_result()

    name = file_path.lower()
    try:
        if name.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
            claims = [ln.strip() for ln in content.splitlines() if ln.strip()]

        elif name.endswith(".csv"):
            claims = _first_column_claims(pd.read_csv(file_path))

        elif name.endswith((".xls", ".xlsx")):
            claims = _first_column_claims(pd.read_excel(file_path))

        elif name.endswith(".ods"):
            try:
                df_in = pd.read_excel(file_path, engine="odf")
            except ImportError as e:
                # Only a missing engine is reported as an install hint; other
                # failures (e.g. a corrupt file) keep their own message.
                raise RuntimeError("Reading .ods requires 'odfpy'. Install with: !pip install odfpy") from e
            claims = _first_column_claims(df_in)

        else:
            raise ValueError("Unsupported file type. Please upload .txt, .csv, .xlsx, .xls, or .ods.")
    except Exception as e:
        df = pd.DataFrame([{"text": f"Error reading file: {e}"}])
        return df, _write_df_to_tempfile(df)

    if not claims:
        return _empty_result()

    return _analyze_list(claims)

# Build the three-tab Gradio interface. Each tab wires one button to one of the
# analysis entry points defined above.
with gr.Blocks(title="Hoax Detector") as demo:

    # Tab 1: a single text in, a Markdown report out.
    with gr.Tab("Single Check"):
        ta = gr.TextArea(label="Paste a headline or paragraph", lines=6, placeholder="Paste text here…")
        btn = gr.Button("Analyze")
        out = gr.Markdown()
        btn.click(fn=analyze_one, inputs=ta, outputs=out)

    # Tab 2: one claim per line in, a results table plus a CSV file out.
    with gr.Tab("Batch Check (Multi-line)"):
        ta_batch = gr.TextArea(label="One claim per line", lines=10, placeholder="Line 1: …\nLine 2: …\nLine 3: …")
        btn_b = gr.Button("Analyze Batch")
        df_out = gr.Dataframe(label="Results", interactive=False)
        dl = gr.File(label="Download CSV", interactive=False)
        btn_b.click(fn=analyze_batch_multiline, inputs=ta_batch, outputs=[df_out, dl])

    # Tab 3: an uploaded file in (passed to the handler as a path), table plus CSV out.
    with gr.Tab("Batch Check (Upload File)"):
        fi = gr.File(
            label="Upload .txt, .csv, .xlsx, .xls, or .ods file (first column = claims)",
            file_types=[".txt", ".csv", ".xlsx", ".xls", ".ods"],
            type="filepath"
        )
        btn_f = gr.Button("Analyze File")
        df_out_f = gr.Dataframe(label="Results", interactive=False)
        dl_f = gr.File(label="Download CSV", interactive=False)
        btn_f.click(fn=analyze_batch_file, inputs=fi, outputs=[df_out_f, dl_f])

# Enable request queuing, then start the app.
demo.queue().launch(debug=False)



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a6bf7b55968500ace5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Smoke test: this claim matches one of the ExpertSystem consensus-myth patterns.
detector.analyze("Sun orbits the Earth")


{'final_score': 35,
 'verdict': 'LIKELY FAKE NEWS',
 'expert_score': 40,
 'expert_flags': ['Contradicts established scientific consensus',
  'No explicit sources or citations found'],
 'ai_analysis': {'sentiment': {'label': 'POSITIVE',
   'score': 0.9980733394622803},
  'writing_quality': {'avg_sentence_length': 4.0,
   'spelling_score': 50.0,
   'subjectivity': 0.0}},
 'web_verification': {'sources_found': 5,
  'reputable_sources': 0,
  'credibility_boost': 5,
  'sources': [{'title': "Earth's orbit",
    'url': 'https://en.wikipedia.org/wiki/Earth%27s_orbit',
    'snippet': 'Earth orbits the Sun at an average distance of 149.60 million km (92.96 million mi), or 8.317 light-minutes, [1] in a counterclockwise direction.',
    'is_reputable': False},
   {'title': 'Our Sun: Facts',
    'url': 'https://science.nasa.gov/sun/facts/',
    'snippet': 'At its equator, the Sun completes one rotation in 25 Earth days. At its poles, the Sun rotates once on its axis every 36 Earth days. Measuring a