In [3]:
# AI NEWS ORCHESTRATOR (RSS VERSION) — FULL FIXED
# Requirements:
# pip install requests pandas beautifulsoup4 transformers sentencepiece spacy dateparser python-dateutil

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import dateparser
from dateparser.search import search_dates
import spacy
from transformers import pipeline
import json
from datetime import datetime
import numpy as np
import traceback

print("Libraries imported")

# -------------------------------
# STEP 1 — Fetch News via Google News RSS
# -------------------------------
def fetch_news_rss(topic, max_articles=10):
    topic_query = topic.replace(" ", "+")
    rss_url = f"https://news.google.com/rss/search?q={topic_query}"

    r = requests.get(rss_url, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "xml")

    items = soup.find_all("item")[:max_articles]

    articles = []
    for item in items:
        title = item.title.text if item.title else ""
        link = item.link.text if item.link else ""
        pub_date = item.pubDate.text if item.pubDate else None
        description = item.description.text if item.description else ""
        source = item.source.text if item.source else "Unknown"

        articles.append({
            "title": title,
            "url": link,
            "date": pub_date,
            "content": description,
            "source": source
        })

    return pd.DataFrame(articles)

# -------------------------------
# STEP 2 — Clean Article Text
# -------------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as e:
    print("spaCy model not found. Run: python -m spacy download en_core_web_sm")
    raise

def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def normalize_date(date_str):
    try:
        return dateparser.parse(date_str)
    except Exception:
        return None

# -------------------------------
# STEP 3 — Summarize Articles (HuggingFace DistilBART)
# -------------------------------
try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    print("Loaded distilbart summarizer")
except Exception as e:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    print("Loaded BART summarizer")

def summarize_article(text, max_len=120, min_len=30):
    text = text or ""
    if len(text.split()) < 20:
        return text
    try:
        words = text.split()
        if len(words) > 800:
            chunks = [" ".join(words[i:i+500]) for i in range(0, len(words), 500)]
            chunk_summaries = []
            for c in chunks:
                out = summarizer(c, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
                chunk_summaries.append(out)
            joined = " ".join(chunk_summaries)
            final = summarizer(joined, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
            return final
        else:
            return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
    except Exception as ex:
        print("Summarization failed:", ex)
        return text

# -------------------------------
# STEP 4 — Event Extraction (Robust)
# -------------------------------
def extract_events(summary_text, published_date=None):
    events = []
    doc = nlp(summary_text)
    for sent in doc.sents:
        s = sent.text.strip()
        if not s:
            continue
        # heuristic: check for event keywords
        if re.search(r"\b(announce|announced|launch|launched|attack|attacked|explode|explosion|arrest|arrived|landed|signed|confirmed|investigate|died|killed)\b", s, flags=re.I):
            dt = None
            found = search_dates(s, languages=['en'])
            if found:
                dt = found[0][1].date().isoformat()
            elif published_date:
                dt = published_date.date().isoformat()
            events.append({"event": s, "date": dt})
    # fallback: entire summary as single event
    if not events:
        events.append({"event": summary_text.strip(), "date": published_date.date().isoformat() if published_date else None})
    return events

# -------------------------------
# STEP 5 — Full Pipeline Function
# -------------------------------
def process_topic(topic, max_articles=10):
    df = fetch_news_rss(topic, max_articles)
    df['clean_content'] = df['content'].apply(clean_text)
    df['clean_date'] = df['date'].apply(normalize_date)
    df['summary'] = df['clean_content'].apply(lambda x: summarize_article(x, max_len=100, min_len=20))
    df['events_list'] = df.apply(lambda row: extract_events(row['summary'], row['clean_date']), axis=1)

    # Build timeline
    timeline = []
    for idx, row in df.iterrows():
        events = row.get('events_list') or []
        for e in events:
            if not isinstance(e, dict):
                continue
            event_text = e.get('event') or ""
            event_date = e.get('date') if e.get('date') else None
            if not event_date and pd.notnull(row['clean_date']):
                try:
                    event_date = row['clean_date'].date().isoformat()
                except:
                    event_date = None
            timeline.append({"date": event_date, "event": event_text.strip(), "source": row['source'], "url": row['url']})

    timeline_df = pd.DataFrame(timeline)
    if not timeline_df.empty:
        timeline_df['date_parsed'] = timeline_df['date'].apply(lambda x: dateparser.parse(x) if x else pd.NaT)
        timeline_df = timeline_df.sort_values('date_parsed')

    # Source reliability
    def score_source(source):
        articles = df[df['source']==source]
        if len(articles)==0: return 0.0
        return float(min(1.0, np.mean(articles['summary'].apply(lambda x: len(str(x).split())))/200.0))
    df['reliability'] = df['source'].apply(score_source)

    # Final JSON output
    final_output = {
        "event_title": topic,
        "generated_at": datetime.utcnow().isoformat() + "Z",
        "timeline": timeline_df[['date','event','source','url']].fillna("").to_dict(orient='records') if not timeline_df.empty else [],
        "summary": summarize_article(" ".join(df['summary'].dropna().tolist())) if len(df['summary'].dropna())>0 else "",
        "sources": df[['source','url','reliability']].drop_duplicates().to_dict(orient='records')
    }

    return df, timeline_df, final_output

# -------------------------------
# STEP 6 — Run Example
# -------------------------------
topic = "Delhi Bomb Blast"
df, timeline_df, final_output = process_topic(topic)

print(json.dumps(final_output, indent=2)[:2000])  # preview first 2k chars


Libraries imported


Device set to use cpu


Loaded distilbart summarizer


Your max_length is set to 100, but your input_length is only 76. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
  "generated_at": datetime.utcnow().isoformat() + "Z",


{
  "event_title": "Delhi Bomb Blast",
  "generated_at": "2025-11-15T07:59:28.268534Z",
  "timeline": [
    {
      "date": "2025-11-12",
      "event": "Delhi blast case highlights| Association of Indian Universities suspends membership of Al-Falah&nbsp;&nbsp;Deccan Herald",
      "source": "Deccan Herald",
      "url": "https://news.google.com/rss/articles/CBMigwJBVV95cUxNRGhSRm5rT3FlZi1KOHFhamJ1RWk4NTFQdnZfdFo0MnlncFZnQmNQaGk5VVZacERUdzc4a0ZLdGNVeG5IT1BVYU5SZWl3UXZFUGhVaVl4eDNIX1U3cWxOSUFpZXBwME5fQk5uRVY0R19UWlVNYWxhbEIyUmd2U09oVVByTWczYkN4Y2dTbWZ6c3FVOEZ3VTlFZmFLUlBJVkYyLUFveWhncW0tNGxsR1ZpXzFUTEFWMGlSZEd5bThCUzlwWHJ4eVctVjBaSGNjczc2NHpzRE95bjBBSXFrOHNEeG1RMWd6MFpFYWxGT01qcWRmUV9mTV90SVZuOHZ5eXZudmJz?oc=5"
    },
    {
      "date": "2025-11-12",
      "event": "Do US remarks on Delhi, Islamabad blasts reveal Trump's bias?&nbsp;&nbsp;India Today",
      "source": "India Today",
      "url": "https://news.google.com/rss/articles/CBMi_wFBVV95cUxNM3lWQU5wbTZZeFM3aWRWWGZGRjV2OTVpY2VZd1