In [1]:
! pip install newspaper3k lxml_html_clean fastapi &> /dev/null

In [2]:
!pip install uvicorn nest-asyncio pyngrok transformers torch newspaper3k nltk &> /dev/null

In [3]:
import os
import re
import time
from urllib.parse import urlparse

import requests
import torch
from bs4 import BeautifulSoup
from newspaper import Article
from transformers import pipeline

In [4]:
# Module-level crawl state shared by the functions in this notebook.
visited = set()  # URLs that have already been attempted
results = []     # article dicts produced by the extractor

# GNews credentials. Create a key at https://gnews.io/dashboard and expose it to
# the kernel as the GNEWS_API_KEY environment variable. Secrets must not be
# committed with the notebook; a key that was previously hard-coded here should
# be revoked and regenerated.
API_KEY = os.environ.get("GNEWS_API_KEY", "")
if not API_KEY:
    print("Warning: GNEWS_API_KEY is not set; GNews searches will fail.")
GNEWS_API_URL = "https://gnews.io/api/v4/search"

def get_gnews_articles(topic, max_articles=5, timeout=10):
    """Search the GNews API for English-language articles about ``topic``.

    Parameters
    ----------
    topic : str
        Search query sent as the ``q`` parameter.
    max_articles : int, optional
        Value sent as the ``max`` parameter (default 5).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    list[dict]
        One dict per article with the keys ``'title'``, ``'link'`` and
        ``'pubDate'`` (empty strings for missing fields). An empty list is
        returned, and the error printed, if the request or parsing fails.
    """
    params = {
        "q": topic,
        "lang": "en",
        "max": max_articles,
        "token": API_KEY
    }

    try:
        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(GNEWS_API_URL, params=params, timeout=timeout)
        response.raise_for_status()
        articles = response.json().get("articles", [])

        # Named ``found`` so it does not shadow the module-level ``results`` list.
        found = [
            {
                'title': item.get("title", ""),
                'link': item.get("url", ""),
                'pubDate': item.get("publishedAt", "")
            }
            for item in articles
        ]

        print(f"Found {len(found)} articles for topic '{topic}'")
        return found
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller continue.
        print("GNews API search failed:", e)
        return []

def _filter_article_lines(text):
    """Drop boilerplate lines from raw article text and join the rest with spaces.

    A line is kept only if, after stripping surrounding whitespace, it is
    longer than five characters and does not start with ``ADVERTISEMENT``.
    """
    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if len(line) > 5 and not line.startswith('ADVERTISEMENT'):
            kept.append(line)
    return ' '.join(kept)


# Backward-compatible alias for the original name of the line filter. A later
# cell in this notebook defines a different function that is also called
# ``clean_text`` and replaces this binding, which is why the extractor below
# calls ``_filter_article_lines`` directly instead of going through the alias.
clean_text = _filter_article_lines


def extract_info_with_newspaper(url):
    """Download and parse one article and return its text plus basic metadata.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    dict or None
        A dict with the keys ``'url'``, ``'title'``, ``'date'`` (``YYYY-MM-DD``
        or ``"Unknown"``), ``'source'`` (network location of ``url``) and
        ``'text'`` (article text with boilerplate lines removed). ``None`` is
        returned when the article has no text or when any step fails; the
        reason is printed in both cases.
    """
    try:
        print(f"Extracting article from {url}...")
        article = Article(url)
        article.download()
        article.parse()

        text = article.text.strip()
        if not text:
            print(f"No text found in article at {url}")
            return None

        published = article.publish_date
        return {
            'url': url,
            'title': article.title or "No Title",
            'date': published.strftime("%Y-%m-%d") if published else "Unknown",
            'source': urlparse(url).netloc,
            'text': _filter_article_lines(text)
        }

    except Exception as e:
        # Best-effort crawl: a single bad article must not stop the others.
        print(f"Failed to extract article from {url}: {e}")
        return None

def crawl_and_extract(url):
    """Extract ``url`` once and store the outcome in the module-level state.

    The URL is recorded in ``visited`` before extraction is attempted, so a URL
    is never tried twice even when its extraction fails. A successful
    extraction is appended to ``results``. Nothing is returned.
    """
    # Both containers are only mutated in place, never rebound.
    if url not in visited:
        visited.add(url)
        extracted = extract_info_with_newspaper(url)
        if extracted:
            results.append(extracted)
            print(f"Crawled: {url}")

def fetch_data(topic):
    """Search GNews for ``topic`` and crawl every article that is returned.

    The extracted articles are left in the module-level ``results`` list;
    nothing is returned.
    """
    # Start every run from a clean slate. Without this, articles crawled for an
    # earlier topic stay in ``results`` and are processed again, and a repeated
    # topic yields nothing new because its URLs are still in ``visited``.
    # The containers are cleared in place so existing references stay valid.
    results.clear()
    visited.clear()

    resp = get_gnews_articles(topic, max_articles=5)

    for a in resp:
        print(f"Processing article: {a['title']}")
        crawl_and_extract(a['link'])
        print("=" * 50)

    print("Done. Crawled latest news data saved to results")

In [13]:
# Use the first CUDA GPU when one is available; -1 tells the pipelines to run
# on the CPU, so the notebook also works on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

# Load Hugging Face pipelines with proper error handling
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)
except Exception as e:
    print(f"Error loading models: {e}")
    # Fallback to the library's default model for each task
    summarizer = pipeline("summarization", device=device)
    sentiment_analyzer = pipeline("sentiment-analysis", device=device)

# Typographic punctuation and its ASCII equivalent. Applied before non-ASCII
# characters are blanked out so that words such as "India’s" are not split in two.
_ASCII_PUNCTUATION = str.maketrans({
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark / apostrophe
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2013': '-',    # en dash
    '\u2014': ' - ',  # em dash
    '\u2026': '...',  # horizontal ellipsis
    '\u00a0': ' ',    # no-break space
})


def clean_text(text):
    """Normalize text to printable ASCII with single spaces.

    Parameters
    ----------
    text : Any
        Text to clean. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        ``text`` with common typographic punctuation converted to ASCII, every
        remaining character outside printable ASCII replaced by a space, runs
        of whitespace collapsed to one space, and both ends stripped.
    """
    if not isinstance(text, str):
        return ""

    # Keep apostrophes, quotes and dashes readable instead of turning them into gaps.
    text = text.translate(_ASCII_PUNCTUATION)
    # Replace everything outside printable ASCII (and common whitespace) with a space
    text = re.sub(r'[^\x20-\x7E\n\r\t]', ' ', text)
    # Replace multiple whitespace with single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing whitespace
    return text.strip()

def safe_chunk_text(text, max_length=900):
    """Split cleaned text into word-aligned chunks bounded by a character budget.

    The text is passed through ``clean_text`` and then split on whitespace.
    Words are packed greedily: each word costs its length plus one (for the
    separating space), and a chunk is closed as soon as adding the next word
    would push the running cost above ``max_length``. A single word that is
    longer than the budget still becomes a chunk of its own.

    Returns a list of chunk strings, or ``[""]`` when there is nothing to chunk.
    """
    normalized = clean_text(text)
    if not normalized:
        return [""]

    pieces = []
    pending = []
    pending_cost = 0

    for token in normalized.split():
        cost = len(token) + 1  # the word plus its separating space
        if pending and pending_cost + cost > max_length:
            # Budget exceeded: close the current chunk and start a fresh one.
            pieces.append(' '.join(pending))
            pending = []
            pending_cost = 0
        pending.append(token)
        pending_cost += cost

    if pending:
        pieces.append(' '.join(pending))

    return pieces or [""]

def safe_summarize(text, summarizer, max_attempts=3):
    """Summarize ``text``, retrying with shorter input and falling back to plain text.

    Parameters
    ----------
    text : str
        Text to summarize.
    summarizer : callable
        Called as ``summarizer(text, max_length=..., min_length=...,
        do_sample=False, truncation=True)``; the first element of its result
        must provide ``['summary_text']``.
    max_attempts : int, optional
        Number of attempts before giving up (default 3).

    Returns
    -------
    str
        The generated summary; ``"Content too short to summarize."`` for input
        shorter than ten non-blank characters; otherwise, when every attempt
        fails, the first three sentences of ``text``.
    """
    if not text or len(text.strip()) < 10:
        return "Content too short to summarize."

    for attempt in range(max_attempts):
        # Progressive length reduction on each attempt. The floor keeps the
        # budget positive no matter how many attempts are requested.
        char_budget = max(1000 - attempt * 200, 200)
        if len(text) > char_budget:
            candidate = text[:char_budget] + "..."
        else:
            candidate = text

        word_count = len(candidate.split())
        # Derive the output bounds from the input size. The floor of 10 prevents
        # max_length from collapsing to 0 or 1 on short inputs, where no usable
        # summary could be generated.
        max_len = max(min(120, word_count // 2), 10)
        min_len = min(30, word_count // 4)

        try:
            result = summarizer(
                candidate,
                max_length=max_len,
                min_length=min_len,
                do_sample=False,
                truncation=True
            )
            return result[0]['summary_text']
        except Exception as e:
            print(f"Summarization attempt {attempt + 1} failed: {e}")

    # Final fallback: return the first few sentences of the input.
    sentences = [part.strip() for part in text.split('.') if part.strip()][:3]
    return '. '.join(sentences) + '.' if sentences else "Summarization failed."

def safe_sentiment_analysis(text, analyzer, max_attempts=3):
    """Classify the sentiment of ``text``, retrying with shorter input on failure.

    Parameters
    ----------
    text : str
        Text to classify.
    analyzer : callable
        Called as ``analyzer(text, truncation=True)``; the first element of its
        result is returned unchanged.
    max_attempts : int, optional
        Number of attempts before giving up (default 3).

    Returns
    -------
    dict
        The analyzer's first result, or ``{'label': 'NEUTRAL', 'score': 0.5}``
        when the input has fewer than five non-blank characters or every
        attempt fails.
    """
    if not text or len(text.strip()) < 5:
        return {'label': 'NEUTRAL', 'score': 0.5}

    for attempt in range(max_attempts):
        # Progressive length reduction. The floor keeps the budget positive:
        # with a budget of 0 the slice text[-0:] would be the *whole* text.
        char_budget = max(400 - attempt * 100, 100)
        if len(text) > char_budget:
            # Take from beginning and end to preserve context
            half_length = char_budget // 2
            candidate = text[:half_length] + " ... " + text[-half_length:]
        else:
            candidate = text

        try:
            result = analyzer(candidate, truncation=True)
            return result[0]
        except Exception as e:
            print(f"Sentiment analysis attempt {attempt + 1} failed: {e}")

    # Every attempt failed (or none was requested): report a neutral result.
    return {'label': 'NEUTRAL', 'score': 0.5}

def process_articles():
    """Summarize and sentiment-score every article held in the module-level ``results``.

    Each article yields exactly one output dict with the keys ``'title'``,
    ``'url'``, ``'summary'`` and ``'sentiment'``:

    * no text under ``'text'`` / ``'content'``: a placeholder summary and a
      neutral sentiment;
    * cleaned text shorter than 20 characters: a placeholder summary and a
      neutral sentiment;
    * otherwise: the text is chunked, every chunk is summarized, the chunk
      summaries are joined with spaces, and the sentiment is computed on the
      first 300 characters of the cleaned text;
    * any exception along the way: an entry describing the error with a
      neutral sentiment.

    Returns the list of output dicts, or ``[]`` when ``results`` is empty.
    """
    if not results:
        print("No results to process")
        return []

    def neutral():
        # A fresh dict per entry so that entries never share one object.
        return {'label': 'NEUTRAL', 'score': 0.5}

    def build_entry(item, summary, sentiment, fallback_title='Unknown Title'):
        return {
            'title': item.get('title', fallback_title),
            'url': item.get('url', ''),
            'summary': summary,
            'sentiment': sentiment
        }

    processed = []

    for position, article in enumerate(results, start=1):
        try:
            print(f"Processing article {position}/{len(results)}: {article.get('title', 'Unknown')[:50]}...")

            # Prefer 'text'; fall back to 'content' when 'text' is absent.
            raw_text = article.get('text', article.get('content', ''))
            if not raw_text:
                print(f"  Warning: No text content found for article {position}")
                processed.append(
                    build_entry(article, 'No content available for summarization.', neutral())
                )
                continue

            body = clean_text(raw_text)
            if len(body) < 20:
                processed.append(
                    build_entry(article, 'Content too short to process.', neutral())
                )
                continue

            # Summarize chunk by chunk, then stitch the partial summaries together.
            chunks = safe_chunk_text(body)
            chunk_total = len(chunks)
            partial_summaries = []
            for number, chunk in enumerate(chunks, start=1):
                print(f"  Summarizing chunk {number}/{chunk_total}...")
                partial_summaries.append(safe_summarize(chunk, summarizer))
            combined_summary = " ".join(partial_summaries)

            # Only the leading 300 characters are used for sentiment analysis.
            print("  Analyzing sentiment...")
            mood = safe_sentiment_analysis(body[:300], sentiment_analyzer)

            processed.append(build_entry(article, combined_summary, mood))
            print(f"  ✓ Article {position} processed successfully")

        except Exception as e:
            print(f"  ✗ Error processing article {position}: {e}")
            # Keep one output entry per input article even when processing fails.
            processed.append(
                build_entry(
                    article,
                    f'Error processing article: {str(e)[:100]}...',
                    neutral(),
                    fallback_title=f'Article {position}'
                )
            )

    print(f"\nProcessing complete. Successfully processed {len(processed)} articles.")
    return processed

Device set to use cuda:0
Device set to use cuda:0


In [14]:
!ngrok config add-authtoken 2xmOnusF7Uhy7I9ChbKLH47TOzb_635DbfWmkg2yRmCY1Vmk

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI, Request
import nest_asyncio
from pyngrok import ngrok
import uvicorn

app = FastAPI()

@app.post("/analyze/")
async def process_topic(request: Request):
    """Crawl, summarize and sentiment-score news about the topic in the request.

    Expects a JSON object with a string ``"topic"`` field. Returns
    ``{"topic": ..., "results": [...]}`` on success, or ``{"error": ...}`` when
    the body is not valid JSON, is not a JSON object, or carries no usable topic.
    """
    try:
        data = await request.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; report them to the
        # client instead of letting the handler fail.
        return {"error": "Request body must be valid JSON."}

    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object."}

    topic = data.get("topic", "")

    # Reject missing, non-string and whitespace-only topics alike.
    if not isinstance(topic, str) or not topic.strip():
        return {"error": "No topic provided."}
    topic = topic.strip()

    # Both calls are synchronous and communicate through module-level state, so
    # they are deliberately run inline rather than in a worker thread.
    fetch_data(topic)
    inter = process_articles()

    return {"topic": topic, "results": inter}

# Launch ngrok and run server
PORT = 8000  # single source of truth for the tunnel target and the server port

public_url = ngrok.connect(PORT, url="mudfish-glorious-jackal.ngrok-free.app")
# ngrok.connect returns a tunnel object; formatting the object itself prints its
# repr (NgrokTunnel: "..." -> "..."), so use its public_url attribute instead.
print(f"Public URL: {public_url.public_url}/analyze/")

# Allow uvicorn to start its event loop inside the notebook's already-running loop.
nest_asyncio.apply()
uvicorn.run(app, port=PORT)

Public URL: NgrokTunnel: "https://mudfish-glorious-jackal.ngrok-free.app" -> "http://localhost:8000"/analyze/


INFO:     Started server process [880]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Found 5 articles for topic 'Politics in India'
Processing article: Mint Primer: Can Bangladesh afford to mix politics and trade?
Processing article: Parsa Venkateshwar Rao Jr | The Conundrum of Caste Calculus in India’s Polity
Processing article: Manipur can be India’s next sporting hub. Financial security need not depend on politics
Processing article: Why Modi keeps pushing India to the brink of war with Pakistan
Processing article: Politics latest: Indian workers exempt from UK's 'jobs tax' under new trade deal
Done. Crawled latest news data saved to results
Processing article 1/5: Mint Primer: Can Bangladesh afford to mix politics...
  Summarizing chunk 1/5...
  Summarizing chunk 2/5...
  Summarizing chunk 3/5...
  Summarizing chunk 4/5...
  Summarizing chunk 5/5...
  Analyzing sentiment...
  ✓ Article 1 processed successfully
Processing article 2/5: Parsa Venkateshwar Rao Jr | The Conundrum of Caste...
  Summarizing chunk 1/7...
  Summarizing chunk 2/7...
  Summarizing chunk 3/7..