In [2]:
# Install the notebook's dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel. Versions are pinned to the ones recorded in
# this cell's last run where they are known; flask, tensorflow and nltk are left
# unpinned because their versions are not recorded.
%pip install flask flask-cors==5.0.1 pyngrok==7.2.4 newspaper3k==0.2.8 lime==0.2.0.1 tensorflow nltk

Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting pyngrok
  Downloading pyngrok-7.2.4-py3-none-any.whl.metadata (8.7 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.2.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collectin

In [6]:
# Replace the preinstalled lxml with 4.9.3, then (re)install newspaper3k and
# lxml_html_clean on top of it. %pip targets the running kernel's environment.
# NOTE(review): presumably needed because newer lxml releases no longer bundle
# lxml.html.clean, which newspaper3k imports — confirm before changing the pin.
%pip uninstall -y lxml
%pip install lxml==4.9.3
%pip install newspaper3k==0.2.8
%pip install lxml_html_clean==0.4.2


Found existing installation: lxml 5.3.2
Uninstalling lxml-5.3.2:
  Successfully uninstalled lxml-5.3.2
Collecting lxml==4.9.3
  Downloading lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.9.3


Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.2


In [2]:
# PyMuPDF (imported as `fitz`) is used to read PDF text. Pinned to the version
# this notebook was last run with; %pip installs into the running kernel's environment.
%pip install pymupdf==1.25.5


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [4]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling back
# to an interactive (non-echoing) prompt. Never paste the token into a cell:
# notebooks get shared and committed. Any token that has ever been saved in a
# notebook should be revoked in the ngrok dashboard and replaced.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from newspaper import Article
from lime.lime_text import LimeTextExplainer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import re
import nltk
import numpy as np
import requests
import fitz  # PyMuPDF for PDF reading
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pyngrok import ngrok

# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stop-word corpus), in the same order as before.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Create the Flask application and attach flask-cors to it with default settings.
app = Flask(__name__)
CORS(app)

# Preprocessing function
# The stop-word set and the stemmer do not depend on the input, so they are
# built once here instead of on every call (the corpus was previously re-read
# for each text that was cleaned).
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; remove URLs (``http(s)://...`` and ``www....``);
    remove ``<...>`` tags; drop every character that is not ``a-z`` or
    whitespace; tokenize with NLTK ``word_tokenize``; discard English stop
    words; stem each remaining token with the Porter stemmer.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Digits and punctuation (including apostrophes) are removed before
    # tokenizing, so stop-word matching happens on the punctuation-free forms.
    text = re.sub(r'[^a-z\s]', '', text)
    return " ".join(
        _STEMMER.stem(token)
        for token in word_tokenize(text)
        if token not in _STOP_WORDS
    )

# Sequence length every tokenized input is padded to before it reaches the model.
MAXLEN = 200

# Load the Keras model and the pickled tokenizer from the working directory.
# NOTE(review): unpickling runs code embedded in the file — only load a
# tokenizer.pkl that you produced yourself.
model = load_model("lstm_model.h5")
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# LIME explanation function
def explain_with_lime(model, tokenizer, text, maxlen=MAXLEN, num_features=10):
    """Return the words LIME ranks as most influential for the model on ``text``.

    LIME perturbs the raw ``text`` and queries ``predict_proba`` on each variant.
    Every variant goes through the same ``preprocess_text`` cleaning that the
    ``/analyze`` endpoint applies before tokenizing, so the explanation describes
    the pipeline that actually produced the prediction rather than the model fed
    with uncleaned text.

    Args:
        model: Model exposing ``predict(padded, verbose=0)``.
        tokenizer: Tokenizer exposing ``texts_to_sequences``.
        text: The raw (uncleaned) text to explain.
        maxlen: Length the token sequences are padded to.
        num_features: Maximum number of words to return.

    Returns:
        A list of words taken from ``exp.as_list()``, weights discarded.
    """
    # NOTE(review): these labels call column 1 (the raw model output) "real",
    # whereas analyze() maps a model output >= 0.5 to "Low" credibility. The two
    # disagree about what the output means — confirm which is right. Only the
    # word list is returned from here, so the labels do not affect the result.
    explainer = LimeTextExplainer(class_names=["fake", "real"])

    def predict_proba(texts):
        cleaned = [preprocess_text(sample) for sample in texts]
        sequences = tokenizer.texts_to_sequences(cleaned)
        padded = pad_sequences(sequences, maxlen=maxlen)
        probs = model.predict(padded, verbose=0)
        # assumes the model emits a single column of shape (n, 1) — TODO confirm;
        # LIME needs one column per class, so the complement is prepended.
        return np.hstack([1 - probs, probs])

    exp = explainer.explain_instance(text, predict_proba, num_features=num_features)
    return [word for word, _ in exp.as_list()]

# ✅ NEW: Extract text from a PDF URL
def extract_text_from_pdf(url, timeout=30):
    """Download the PDF at ``url`` and return the text of all its pages.

    The document is parsed straight from the downloaded bytes. Nothing is
    written to disk, so concurrent requests cannot overwrite each other's file
    and no temporary file is left behind.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            download cannot hang the request handler indefinitely.

    Returns:
        The text of every page, concatenated in page order.

    Raises:
        ValueError: If the server responds with a status other than 200.
        Exception: Errors from ``requests`` (e.g. timeouts, connection failures)
            or from PyMuPDF when the payload cannot be opened as a PDF.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError("Failed to download PDF.")

    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

# Main API endpoint
@app.route("/analyze", methods=["POST"])
def analyze():
    """Score the article or PDF behind a URL and return the result as JSON.

    Expects a JSON body of the form ``{"url": "<http(s) URL>"}``. URLs that end
    in ``.pdf`` (ignoring any query string or fragment) are read with
    ``extract_text_from_pdf``; everything else is downloaded and parsed with
    newspaper's ``Article``.

    Returns:
        200 with ``{"probability": float, "credibility": "High" | "Low",
        "keywords": [str, ...]}`` on success.
        400 with ``{"error": ...}`` when the body/URL is missing or invalid, the
        fetch or PDF extraction fails, or no text could be extracted.
        500 with ``{"error": ...}`` for any other failure.
    """
    # NOTE(review): the URL is supplied by the client and fetched from this host,
    # and the endpoint is exposed publicly through ngrok — consider restricting
    # which hosts may be requested.
    try:
        # silent=True turns a missing or malformed JSON body into None instead of
        # raising, so it is reported as a 400 below rather than as a 500.
        payload = request.get_json(silent=True)
        url = payload.get("url") if isinstance(payload, dict) else None
        if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
            return jsonify({"error": "Invalid or missing URL"}), 400

        # ✅ Use different parsing methods based on URL type
        # Ignore the query string and fragment so that links such as
        # ".../report.pdf?download=1" are still recognized as PDFs.
        url_without_query = url.split("#", 1)[0].split("?", 1)[0]
        if url_without_query.lower().endswith(".pdf"):
            try:
                raw_text = extract_text_from_pdf(url)
            except Exception as e:
                return jsonify({"error": f"PDF extraction failed: {str(e)}"}), 400
        else:
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36"
                )
            }
            article = Article(url, request_headers=headers)
            try:
                article.download()
                article.parse()
                raw_text = article.text
            except Exception as e:
                return jsonify({"error": f"Failed to fetch article: {str(e)}"}), 400

        if not raw_text.strip():
            return jsonify({"error": "Could not extract article text."}), 400

        # Preprocess + predict
        processed = preprocess_text(raw_text)
        sequence = tokenizer.texts_to_sequences([processed])
        padded = pad_sequences(sequence, maxlen=MAXLEN)
        # assumes the model returns one score per input, shape (1, 1) — TODO confirm
        prob = model.predict(padded, verbose=0)[0][0]
        # NOTE(review): a score below 0.5 is reported as "High" credibility, i.e.
        # the score is treated as the probability of the article being fake —
        # confirm this matches how the model was trained.
        credibility = "High" if prob < 0.5 else "Low"

        # LIME receives the raw text; it perturbs that text itself.
        keywords = explain_with_lime(model, tokenizer, raw_text)

        return jsonify({
            "probability": float(prob),
            "credibility": credibility,
            "keywords": keywords
        })

    except Exception as e:
        print(f"❌ Error occurred: {e}")
        return jsonify({"error": str(e)}), 500

# Start server using ngrok
# One constant for the port so the tunnel and the Flask server cannot drift apart.
PORT = 5000
public_url = ngrok.connect(PORT)
# ngrok.connect returns an NgrokTunnel object; formatting it directly prints
# 'NgrokTunnel: "<public>" -> "<local>"', so use its public_url attribute to
# print an address that can be copied as-is.
print(f"🔗 Your public API URL: {public_url.public_url}/analyze")
# Blocks this cell until the server is interrupted.
app.run(port=PORT)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


🔗 Your public API URL: NgrokTunnel: "https://8d54-35-221-182-198.ngrok-free.app" -> "http://localhost:5000"/analyze
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:09:29] "[33mOPTIONS / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:09:54] "[33mOPTIONS / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:10:04] "[33mOPTIONS / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:10:22] "[33mOPTIONS / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:12:57] "OPTIONS /analyze HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [19/Apr/2025 04:13:23] "POST /analyze HTTP/1.1" 200 -
