In [1]:
# Scratch cell: prints a fixed greeting; it defines nothing that the cells shown below use.
print("hello world")

hello world


In [2]:
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd


In [3]:
class RSSIngestor:
    """Collect articles from a list of RSS/Atom feed URLs via ``feedparser``.

    Every article is a dict with the keys ``title``, ``content``, ``link``,
    ``source`` (always ``"RSS"``) and ``published_at`` (a naive ``datetime``).
    """

    def __init__(self, feeds: list):
        """Remember the feed URLs to poll.

        Args:
            feeds: Feed URLs; each one is passed to ``feedparser.parse``.
        """
        self.feeds = feeds

    @staticmethod
    def _entry_to_article(entry) -> dict:
        """Convert one parsed feed entry into an article dict.

        Missing ``title``/``link`` become ``None`` and a missing ``summary``
        becomes ``""`` instead of raising, so a sparse entry cannot abort the
        whole fetch.

        Args:
            entry: Object exposing the feed entry fields as attributes.

        Returns:
            The article dict described on the class.
        """
        # The attribute can exist but hold None (feedparser stores None when it
        # cannot parse the date string), so test the value, not mere presence.
        parsed = getattr(entry, "published_parsed", None)
        published_at = datetime(*parsed[:6]) if parsed else datetime.utcnow()

        return {
            "title": getattr(entry, "title", None),
            "content": getattr(entry, "summary", ""),
            "link": getattr(entry, "link", None),
            "source": "RSS",
            "published_at": published_at,
        }

    def fetch(self) -> list:
        """Parse every configured feed and return all entries as article dicts.

        A feed that fails is reported with an ``RSS Error:`` line and skipped,
        so the remaining feeds are still processed.

        Returns:
            List of article dicts, in feed order then entry order.
        """
        articles = []

        for url in self.feeds:
            try:
                feed = feedparser.parse(url)

                # feedparser flags download/parse problems through ``bozo``
                # rather than raising; surface them when nothing was parsed.
                if feed.get("bozo") and not feed.entries:
                    print("RSS Error:", url, feed.get("bozo_exception"))
                    continue

                # Build the full list first so a failing entry does not leave
                # a half-ingested feed behind.
                articles.extend(
                    [self._entry_to_article(entry) for entry in feed.entries]
                )

            except Exception as e:
                print("RSS Error:", e)

        return articles


In [4]:
class APIIngestor:
    """Collect articles from JSON news API endpoints.

    Each endpoint is expected to answer with a JSON object containing an
    ``"articles"`` list whose items provide ``title``, ``content`` and ``url``.
    Every article is a dict with the keys ``title``, ``content``, ``link``,
    ``source`` (always ``"API"``) and ``published_at`` (a naive ``datetime``).
    """

    def __init__(self, endpoints: list, timeout: float = 10):
        """Remember the endpoints to query.

        Args:
            endpoints: Fully formed request URLs.
            timeout: Seconds to wait for each HTTP request.
        """
        self.endpoints = endpoints
        self.timeout = timeout

    @staticmethod
    def _parse_published_at(value):
        """Turn an ISO-8601 timestamp string into a naive UTC ``datetime``.

        Falls back to the current UTC time when ``value`` is missing or cannot
        be parsed.
        """
        if isinstance(value, str) and value:
            # ``fromisoformat`` only understands a trailing "Z" from Python
            # 3.11 onwards, so spell the UTC offset out explicitly.
            text = value[:-1] + "+00:00" if value.endswith("Z") else value
            try:
                stamp = datetime.fromisoformat(text)
            except ValueError:
                stamp = None

            if stamp is not None:
                offset = stamp.utcoffset()
                if offset is not None:
                    # Shift to UTC and drop tzinfo so values stay naive.
                    stamp = (stamp - offset).replace(tzinfo=None)
                return stamp

        return datetime.utcnow()

    def fetch(self) -> list:
        """Query every endpoint and return the listed items as article dicts.

        An endpoint that fails (network error, HTTP error status, unexpected
        payload) is reported with an ``API Error:`` line and skipped.

        Returns:
            List of article dicts, in endpoint order then item order.
        """
        articles = []

        for url in self.endpoints:
            try:
                resp = requests.get(url, timeout=self.timeout)
                # Without this an HTTP error that still carries a JSON body
                # would silently produce zero articles.
                resp.raise_for_status()
                data = resp.json()

                for item in data.get("articles") or []:
                    articles.append({
                        "title": item.get("title"),
                        # ``or ""`` also covers an explicit JSON null.
                        "content": item.get("content") or "",
                        "link": item.get("url"),
                        "source": "API",
                        # Assumes a GNews-style ``publishedAt`` field; verify
                        # the field name when adding other APIs.
                        "published_at": self._parse_published_at(
                            item.get("publishedAt")
                        ),
                    })

            except Exception as e:
                print("API Error:", e)

        return articles


In [5]:
class ScraperIngestor:
    """Scrape one article per web page using ``requests`` and BeautifulSoup.

    Every article is a dict with the keys ``title`` (text of the first
    ``<h1>``), ``content`` (text of the first ``<article>``, else the first
    ``<p>``), ``link`` (the page URL), ``source`` (always ``"SCRAPER"``) and
    ``published_at`` (the UTC time of the scrape).
    """

    def __init__(self, pages: list, timeout: float = 10):
        """Remember the pages to scrape.

        Args:
            pages: Page URLs to download.
            timeout: Seconds to wait for each HTTP request.
        """
        self.pages = pages
        self.timeout = timeout

    def fetch(self) -> list:
        """Download every page and extract at most one article from each.

        Pages lacking an ``<h1>`` or any ``<article>``/``<p>`` element are
        skipped silently. A page that fails (network error, timeout, HTTP
        error status) is reported with a ``Scraper Error:`` line and skipped.

        Returns:
            List of article dicts, in page order.
        """
        articles = []

        for url in self.pages:
            try:
                # The timeout keeps one unresponsive host from hanging the run.
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=self.timeout,
                )
                # Do not scrape the body of an error page as if it were an article.
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                title = soup.find("h1")
                content = soup.find("article") or soup.find("p")

                if title and content:
                    articles.append({
                        "title": title.get_text(strip=True),
                        "content": content.get_text(strip=True),
                        "link": url,
                        "source": "SCRAPER",
                        "published_at": datetime.utcnow()
                    })

            except Exception as e:
                print("Scraper Error:", e)

        return articles


In [6]:
class IngestionAgent:
    """Run the RSS, API and scraper ingestors and merge their results."""

    def __init__(self, rss_feeds=None, api_endpoints=None, scraper_pages=None):
        """Create one ingestor per source type.

        Args:
            rss_feeds: Feed URLs for the RSS ingestor; empty list when falsy.
            api_endpoints: URLs for the API ingestor; empty list when falsy.
            scraper_pages: Page URLs for the scraper; empty list when falsy.
        """
        self.rss = RSSIngestor(rss_feeds if rss_feeds else [])
        self.api = APIIngestor(api_endpoints if api_endpoints else [])
        self.scraper = ScraperIngestor(scraper_pages if scraper_pages else [])

    def run(self):
        """Fetch from every ingestor (RSS, then API, then scraper).

        Prints the combined article count and returns the combined list.
        """
        collected = []

        for ingestor in (self.rss, self.api, self.scraper):
            collected += ingestor.fetch()

        total = len(collected)
        print(f"Total articles fetched: {total}")
        return collected


In [None]:
# Feed URLs handed to the RSS ingestor.
rss_feeds = [
    "https://economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms",
    "https://www.moneycontrol.com/rss/latestnews.xml",
    "https://feeds.finance.yahoo.com/rss/2.0/headline?s=AAPL&region=US&lang=en-US",
    "https://www.business-standard.com/rss/latest.rss",
]

# JSON API endpoints handed to the API ingestor.
# NOTE(review): the token is embedded in the URL; "demo" looks like a
# placeholder - confirm, and avoid committing a real key here.
api_endpoints = [
    "https://gnews.io/api/v4/top-headlines?token=demo&topic=business",
]

# Pages handed to the scraper ingestor.
scraper_pages = [
    "https://www.reuters.com/markets/",
]


In [8]:
# Wire the three source lists into one agent, then run a single ingestion pass.
agent = IngestionAgent(rss_feeds=rss_feeds, api_endpoints=api_endpoints, scraper_pages=scraper_pages)
articles = agent.run()


Total articles fetched: 65


In [9]:
# Tabulate the fetched article dicts (one row per article) and preview the first rows.
df = pd.DataFrame(articles)
df.head()


Unnamed: 0,title,content,link,source,published_at
0,How small-town India gave D-St its next big IP...,Meesho's upcoming IPO highlights the rise of I...,https://economictimes.indiatimes.com/markets/i...,RSS,2025-11-28 08:51:04
1,Aequs IPO GMP jumps 21% ahead of launch next w...,Aequs Limited’s Rs 921.81-crore IPO opens on D...,https://economictimes.indiatimes.com/markets/i...,RSS,2025-11-28 08:40:18
2,4 years of fraud: How a Mumbai-based elderly c...,"A 72-year-old Mumbai resident, Bharat Harakcha...",https://economictimes.indiatimes.com/markets/s...,RSS,2025-11-28 08:37:06
3,Market lull a buying opportunity for long-term...,"Indian markets have hit fresh all-time highs, ...",https://economictimes.indiatimes.com/markets/e...,RSS,2025-11-28 07:54:16
4,Japan's Nikkei adds to weekly gain with eyes o...,"Japan's Nikkei 225 closed up 0.2% on Friday, e...",https://economictimes.indiatimes.com/markets/s...,RSS,2025-11-28 07:28:45


In [10]:
# Display the title column of the article table.
df['title']

0     How small-town India gave D-St its next big IP...
1     Aequs IPO GMP jumps 21% ahead of launch next w...
2     4 years of fraud: How a Mumbai-based elderly c...
3     Market lull a buying opportunity for long-term...
4     Japan's Nikkei adds to weekly gain with eyes o...
                            ...                        
60    Buy Jindal Drilling and Industries; target of ...
61    Buy Navneet Education; target of Rs 182: Prabh...
62    Sell Indus Tower; target of Rs 260: ICICI Secu...
63    Buy City Union Bank; target of Rs 170: ICICI S...
64    Buy Hatsun Agro Products; target of Rs 1190: I...
Name: title, Length: 65, dtype: object

In [11]:
# Count articles per ingestion source to see which ingestors actually returned data.
df['source'].value_counts()


source
RSS    65
Name: count, dtype: int64