In [None]:
!pip install feedparser newspaper3k transformers torch lxml[html_clean] tqdm

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspap

In [None]:
import feedparser
from newspaper import Article
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import time
import json
from tqdm import tqdm
import os

# 2. Define RSS feeds
# Maps a display name for each outlet to the URL of its RSS/Atom feed.
# All feeds are fetched over HTTPS so the feed content cannot be tampered with in transit.
RSS_FEEDS = {
    "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",
    "The Hindu": "https://www.thehindu.com/feeder/default.rss",
    "Indian Express": "https://indianexpress.com/feed",
    "BBC News": "https://feeds.bbci.co.uk/news/rss.xml",
    "Reuters": "https://www.reutersagency.com/feed/?best-topics=top-news",
    "NDTV": "https://feeds.feedburner.com/ndtvnews-top-stories",
    "Al Jazeera": "https://www.aljazeera.com/xml/rss/all.xml",
    "The Guardian": "https://www.theguardian.com/world/rss",
    "Times Now": "https://www.timesnownews.com/rssfeedstopstories.cms",
    "Deccan Herald": "https://www.deccanherald.com/rss-feeds/top-stories"
}

# 3. Load models
# Hugging Face pipelines take a CUDA device index; -1 selects the CPU.
device = -1 if not torch.cuda.is_available() else 0

# Abstractive summarizer, used later for both the short summary and the longer write-up.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device=device,
)

# Sentence encoder whose embeddings are used to group article titles.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ML Classifier (transformer-based)
news_classifier = pipeline(
    task="text-classification",
    model="classla/multilingual-IPTC-news-topic-classifier",
    device=device,
)


# 🧠 ML-based news classification (keyword rules are only the fallback)
def classify_article(text):
    """Return a topic label for ``text``.

    The transformer classifier (``news_classifier``) is tried first on the
    first 512 characters of the stripped text. The keyword-based classifier
    is used instead when the model fails or returns a catch-all label.

    Args:
        text: Article text (titles and/or body) to classify.

    Returns:
        The model's label string, or whatever ``classify_article_keywords``
        returns when the fallback path is taken.

    NOTE(review): ``classify_article_keywords`` is not defined in the visible
    notebook cells; it must exist in the kernel before the fallback path is
    hit, otherwise this raises ``NameError``.
    """
    try:
        text = text.strip()[:512]  # character cut keeps the input well within the tokenizer limit
        pred = news_classifier(text)
        label = pred[0]["label"]
    except Exception:
        # Only genuine errors are handled here; KeyboardInterrupt/SystemExit
        # must still be able to stop a long-running notebook loop.
        return classify_article_keywords(text)

    # Fallback to keyword-based rules if the ML label is fuzzy
    if label.lower() in ("other", "miscellaneous", "uncategorized"):
        return classify_article_keywords(text)
    return label


# 4. Collect articles
# Walk every feed, download each linked page and keep the ones with enough body text.
articles = []
for source, feed_url in tqdm(RSS_FEEDS.items(), desc="Processing feeds"):
    try:
        feed = feedparser.parse(feed_url)
        # Cap on entries taken from a single source; adjust as needed.
        for entry in feed.entries[:200]:
            title = entry.get('title', '')
            link = entry.get('link', '')
            published = entry.get('published', '') or entry.get('pubDate', '')

            # Any download/parse failure simply drops this entry.
            try:
                article = Article(link)
                article.download()
                article.parse()
                full_text = article.text
            except Exception:
                continue

            # Skip pages with no usable body (fewer than 200 characters).
            if not full_text or len(full_text) < 200:
                continue

            articles.append(dict(
                source=source,
                title=title,
                url=link,
                published=published,
                full_text=full_text,
            ))
            # Pause only after an article has been kept.
            time.sleep(0.5)
    except Exception as e:
        print(f"Error processing {source}: {e}")
print(f"Collected {len(articles)} articles.")

# 5. Group articles by event/topic using semantic similarity
# Greedy single pass: each not-yet-assigned article seeds a cluster and pulls in
# every later unassigned article whose title similarity exceeds the threshold.
titles = [a['title'] for a in articles]
clusters = []
used = set()
threshold = 0.7  # Similarity threshold; adjust as needed

# Nothing was collected -> nothing to embed; leave `clusters` empty instead of
# handing an empty batch to the encoder.
embeddings = embedder.encode(titles, convert_to_tensor=True) if titles else None

if embeddings is not None:
    # One matrix computation instead of one cosine call (and one host/device
    # sync) per pair of titles.
    sim_matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu()

    for i in range(len(titles)):
        if i in used:
            continue
        cluster = [i]
        row = sim_matrix[i].tolist()  # similarities of title i to every title
        for j in range(i + 1, len(titles)):
            if j in used:
                continue
            if row[j] > threshold:
                cluster.append(j)
                used.add(j)
        used.add(i)
        clusters.append(cluster)
print(f"Grouped into {len(clusters)} events/topics.")

# 6. For each event/topic, synthesize summary and full article
results = []
for cluster in tqdm(clusters, desc="Synthesizing events"):
    topic_articles = [articles[i] for i in cluster]
    combined_text = "\n\n".join(a['full_text'] for a in topic_articles)
    combined_titles = " | ".join(a['title'] for a in topic_articles)

    # Cheap character pre-cut. Character count is only a rough proxy for tokens,
    # so the summarizer is also asked to truncate at the model's real token limit.
    safe_text = combined_text[:2000]
    try:
        summary = summarizer(
            safe_text, max_length=150, min_length=50, do_sample=False, truncation=True
        )[0]['summary_text']
        synthesized_article = summarizer(
            safe_text, max_length=512, min_length=200, do_sample=False, truncation=True
        )[0]['summary_text']
    except Exception as e:
        # Best effort: fall back to raw text, but say so instead of failing silently.
        tqdm.write(f"Summarization failed for '{combined_titles[:80]}': {e}")
        summary = combined_text[:500]
        synthesized_article = combined_text[:2000]

    results.append({
        "event_titles": combined_titles,
        "sources": [a['source'] for a in topic_articles],
        "urls": [a['url'] for a in topic_articles],
        "category": classify_article(combined_titles + " " + summary + " " + synthesized_article),
        "summary": summary,
        "synthesized_article": synthesized_article
    })
print(f"✅ Synthesized {len(results)} multi-source event summaries.")
from google.colab import drive
drive.mount('/content/drive')

# Define output path
output_path = "/content/drive/MyDrive/New News/multi_source_event_synthesis.json"

# Make sure the target folder exists, so the results of the long synthesis run
# are not lost to a FileNotFoundError on a fresh Drive.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Delete old file if it exists
if os.path.exists(output_path):
    os.remove(output_path)
    print("🗑️ Previous JSON file deleted.")

# Save new JSON (UTF-8, non-ASCII characters written as-is)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("✅ JSON saved to Google Drive ➜ New News folder.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0
Processing feeds: 100%|██████████| 10/10 [05:41<00:00, 34.19s/it]


Collected 436 articles.
Grouped into 386 events/topics.


Synthesizing events:   0%|          | 0/386 [00:00<?, ?it/s]Your max_length is set to 512, but your input_length is only 422. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=211)
Synthesizing events:   0%|          | 1/386 [00:05<37:11,  5.80s/it]Your max_length is set to 512, but your input_length is only 429. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=214)
Synthesizing events:   1%|          | 2/386 [00:10<34:44,  5.43s/it]Your max_length is set to 512, but your input_length is only 315. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=157)
Synthesizing events:   1%|          | 3/386 [00:15<

✅ Synthesized 386 multi-source event summaries.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🗑️ Previous JSON file deleted.
✅ JSON saved to Google Drive ➜ New News folder.


In [None]:
# 🚀 Auto-Port Fallback Web Server with ngrok (Colab-safe)

# STEP 1: Install pyngrok and set authtoken (only needed once per session)
!pip install pyngrok --quiet
!ngrok config add-authtoken 302IgATicSaOpwphBxpiVwONdMs_27qxv9BJm7XC62mXZs7s

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Start server on a free port from your Drive folder
import os
import http.server
import socketserver
import threading
from pyngrok import ngrok

# 📁 Path to the folder you want to serve (should contain index.html & JSON)
serve_dir = '/content/drive/MyDrive/New News'
os.chdir(serve_dir)

# 👇 Auto-try ports from 8000 to 8010
PORT = None
for try_port in range(8000, 8010):
    try:
        class ReusableTCPServer(socketserver.TCPServer):
            allow_reuse_address = True
        Handler = http.server.SimpleHTTPRequestHandler
        httpd = ReusableTCPServer(("0.0.0.0", try_port), Handler)
        PORT = try_port
        break
    except OSError:
        continue

# ❌ Failed to find a free port
if PORT is None:
    raise RuntimeError("❌ All ports from 8000–8009 are in use. Please restart the runtime.")

# ✅ Start ngrok tunnel
public_url = ngrok.connect(PORT)
print(f"\n✅ Server running on port {PORT}")
print(f"🌐 Public URL: {public_url}/index.html")

# ✅ Start server in background thread
threading.Thread(target=httpd.serve_forever).start()
print("🚀 Do not stop this cell while using the site.")


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Server running on port 8001
🌐 Public URL: NgrokTunnel: "https://c8304829ed6c.ngrok-free.app" -> "http://localhost:8001"/index.html
🚀 Do not stop this cell while using the site.
