In [3]:
# Prints a fixed greeting; presumably a leftover kernel sanity check -- confirm before removing.
print("hello world")

hello world


In [4]:
import os
from datetime import datetime
from urllib.parse import urljoin, urlsplit

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [5]:
class RSSIngestor:
    """Collect articles from a list of RSS/Atom feed URLs using ``feedparser``.

    Every article is a dict with the keys ``title``, ``content``, ``link``,
    ``source`` (always ``"RSS"``) and ``published_at`` (a naive ``datetime``).
    """

    def __init__(self, feeds: list):
        """Store the feed URLs to be polled by :meth:`fetch`."""
        self.feeds = feeds

    @staticmethod
    def _published_at(entry):
        """Return the entry's publish time, falling back to the current UTC time."""
        # The key can be present but empty when the feed's date string could
        # not be parsed, so test the value rather than mere presence.
        parsed = entry.get("published_parsed")
        if parsed:
            try:
                return datetime(*parsed[:6])
            except (TypeError, ValueError):
                pass
        return datetime.utcnow()

    def fetch(self):
        """Parse every configured feed and return the combined list of article dicts.

        Feeds that cannot be parsed at all are reported and skipped; entries
        with missing fields get empty-string placeholders instead of raising.
        """
        articles = []

        for url in self.feeds:
            feed = feedparser.parse(url)

            # feedparser reports problems through the `bozo` flag; surface a
            # failure only when nothing usable was recovered from the feed.
            if getattr(feed, "bozo", False) and not feed.entries:
                print(
                    "RSS Error:",
                    url,
                    getattr(feed, "bozo_exception", "feed could not be parsed"),
                )
                continue

            for entry in feed.entries:
                articles.append({
                    "title": entry.get("title", ""),
                    "content": entry.get("summary") or "",
                    "link": entry.get("link", ""),
                    "source": "RSS",
                    "published_at": self._published_at(entry),
                })

        return articles


In [6]:
class APIIngestor:
    """Collect articles from JSON news endpoints that return an ``articles`` list.

    Every article is a dict with the keys ``title``, ``content``, ``link``,
    ``source`` (always ``"API"``) and ``published_at`` (a naive ``datetime``).
    """

    def __init__(self, endpoints: list):
        """Store the endpoint URLs to be queried by :meth:`fetch`."""
        self.endpoints = endpoints

    @staticmethod
    def _published_at(item):
        """Return the item's ``publishedAt`` as naive UTC, or the current UTC time.

        Assumes an ISO-8601 string such as ``2025-11-29T08:42:57Z`` (the shape
        GNews documents); anything else falls back to the fetch time.
        """
        raw = item.get("publishedAt")
        if isinstance(raw, str) and raw:
            # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on.
            text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
            try:
                stamp = datetime.fromisoformat(text)
            except ValueError:
                stamp = None
            if stamp is not None:
                offset = stamp.utcoffset()
                if offset is not None:
                    # Normalise to naive UTC so values mix with the other ingestors'.
                    stamp = (stamp - offset).replace(tzinfo=None)
                return stamp
        return datetime.utcnow()

    def fetch(self):
        """Query every endpoint and return the combined list of article dicts.

        Failures are reported with an ``API Error:`` message and the endpoint
        is skipped, so one bad endpoint never aborts the whole run.
        """
        articles = []

        for url in self.endpoints:
            try:
                resp = requests.get(url, timeout=10)

                if not resp.ok:
                    # Report without the query string so API keys stay out of
                    # the notebook output.
                    print("API Error:", f"HTTP {resp.status_code} from {url.split('?', 1)[0]}")
                    continue

                data = resp.json()

                # `or []` also covers an explicit `"articles": null` payload.
                for item in data.get("articles") or []:
                    articles.append({
                        "title": item.get("title"),
                        "content": item.get("content", ""),
                        "link": item.get("url"),
                        "source": "API",
                        "published_at": self._published_at(item),
                    })

            except Exception as e:
                print("API Error:", e)

        return articles


In [7]:
class ScraperIngestor:
    """Scrape one article (first ``<h1>`` plus main text) from each configured page.

    Every article is a dict with the keys ``title``, ``content``, ``link``,
    ``source`` (always ``"SCRAPER"``) and ``published_at`` (the fetch time).
    """

    def __init__(self, pages: list, timeout: float = 10):
        """Store the page URLs and the per-request timeout in seconds."""
        self.pages = pages
        self.timeout = timeout

    def fetch(self):
        """Download every page and return the articles that could be extracted.

        Pages that fail to download, answer with an HTTP error status, or lack
        a title/body are skipped; download errors are reported with a
        ``Scraper Error:`` message.
        """
        articles = []

        for url in self.pages:
            try:
                # Without a timeout a stalled server would block the cell indefinitely.
                resp = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=self.timeout,
                )
                # Do not parse 4xx/5xx error pages as if they were articles.
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                title = soup.find("h1")
                # Prefer the <article> container; fall back to the first paragraph.
                content = soup.find("article") or soup.find("p")

                if title and content:
                    articles.append({
                        "title": title.get_text(strip=True),
                        "content": content.get_text(strip=True),
                        "link": url,
                        "source": "SCRAPER",
                        "published_at": datetime.utcnow()
                    })

            except Exception as e:
                print("Scraper Error:", e)

        return articles


In [8]:
class IngestionAgent:
    """Facade that runs the RSS, API and scraper ingestors as a single step."""

    def __init__(self, rss_feeds=None, api_endpoints=None, scraper_pages=None):
        """Create one ingestor per source type; a missing/empty list means "no sources"."""
        self.rss = RSSIngestor(rss_feeds if rss_feeds else [])
        self.api = APIIngestor(api_endpoints if api_endpoints else [])
        self.scraper = ScraperIngestor(scraper_pages if scraper_pages else [])

    def run(self):
        """Fetch from RSS, then API, then scraper, and return the merged article list.

        The total number of fetched articles is printed before returning.
        """
        collected = [
            article
            for ingestor in (self.rss, self.api, self.scraper)
            for article in ingestor.fetch()
        ]

        print(f"Total articles fetched: {len(collected)}")
        return collected


In [9]:
# RSS feeds polled by the RSS ingestor.
rss_feeds = [
    "https://economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms",
    "https://www.moneycontrol.com/rss/latestnews.xml",
    "https://feeds.finance.yahoo.com/rss/2.0/headline?s=AAPL&region=US&lang=en-US",
    "https://www.business-standard.com/rss/latest.rss"
]

# The GNews key is read from the environment so it never lands in the notebook
# source or its saved output.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# exposed and revoke/rotate it.
api_key = os.environ.get("GNEWS_API_KEY", "")

# One top-headlines endpoint per topic; left empty when no key is configured
# so the API ingestor is simply skipped instead of sending unauthenticated calls.
gnews_topics = ["business", "world", "breaking-news", "finance"]
api_endpoints = [
    f"https://gnews.io/api/v4/top-headlines?apikey={api_key}&topic={topic}&lang=en&max=10"
    for topic in gnews_topics
    if api_key
]
if not api_key:
    print("GNEWS_API_KEY is not set; no API endpoints configured")

# Landing pages handed to the scraper ingestor.
scraper_pages = [
    "https://www.reuters.com/markets/"
]


In [10]:
# Wire the three source lists into one agent and run every ingestor once;
# `articles` receives the merged list returned by run().
agent = IngestionAgent(
    rss_feeds=rss_feeds,
    api_endpoints=api_endpoints,
    scraper_pages=scraper_pages
)

articles = agent.run()


Total articles fetched: 125


In [11]:
# Load the fetched article dicts into a DataFrame and preview the first rows.
df = pd.DataFrame(articles)
df.head()


Unnamed: 0,title,content,link,source,published_at
0,Bitcoin rebounds 12% from last week’s $80K low...,Bitcoin rebounded nearly 12% from its $80K low...,https://economictimes.indiatimes.com/markets/c...,RSS,2025-11-29 08:42:57
1,"F&amp;O Talk| Nifty hits record high, but rall...",Markets extended their winning streak to a thi...,https://economictimes.indiatimes.com/markets/s...,RSS,2025-11-29 08:17:52
2,Bitcoin’s drawdown breaks old rule as volatili...,"Bitcoin's recent downturn, despite a significa...",https://economictimes.indiatimes.com/markets/c...,RSS,2025-11-29 08:02:24
3,Consumer demand strengthens ahead of festive &...,Consumer demand is strengthening ahead of the ...,https://economictimes.indiatimes.com/markets/s...,RSS,2025-11-29 07:57:38
4,Sebi imposes 7-day ban on Prabhudas Lilladher ...,Sebi has barred Prabhudas Lilladher from takin...,https://economictimes.indiatimes.com/markets/s...,RSS,2025-11-29 07:53:47


In [12]:
# Show the title of the first row.
df['title'].head(1)

0    Bitcoin rebounds 12% from last week’s $80K low...
Name: title, dtype: object

In [13]:
# Count how many rows carry each `source` label.
df['source'].value_counts()


source
RSS    85
API    40
Name: count, dtype: int64

In [18]:
!pip install requests beautifulsoup4 lxml


Collecting lxml
  Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl (4.0 MB)
Installing collected packages: lxml
Successfully installed lxml-6.0.2



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
def extract_links(url, link_selector="a", limit=20, timeout=10):
    """
    Extract article links from the homepage.

    Args:
        url: Page to download; relative links are resolved against it.
        link_selector: CSS selector used to find candidate link tags.
        limit: Maximum number of distinct links to return.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        Distinct absolute URLs, in page order, whose path contains
        "markets", "business" or "finance". Empty on any download/parse
        error (the error is printed).
    """
    links = []
    seen = set()

    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        # Do not mine links out of 4xx/5xx error pages.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Find all links
        for tag in soup.select(link_selector):
            if len(links) >= limit:
                break

            href = tag.get("href")
            if not href:
                continue

            # Resolve relative links against the page itself, so the function
            # works for any site rather than assuming one fixed host.
            full_url = urljoin(url, href)

            # De-duplicate while collecting so duplicates do not eat into `limit`.
            if full_url in seen:
                continue

            # Only take article-like URLs. Match on the path so a host name
            # that happens to contain a keyword does not let every link through.
            path = urlsplit(full_url).path
            if "markets" in path or "business" in path or "finance" in path:
                seen.add(full_url)
                links.append(full_url)

    except Exception as e:
        print("Link extraction error:", e)

    return links


In [30]:
def scrape_article(url, timeout=10):
    """
    Scrapes title + main content from an article page.

    Args:
        url: Article page to download.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A dict with ``title``, ``content``, ``link``, ``source`` ("SCRAPER")
        and ``published_at`` (the fetch time), or ``None`` when the page could
        not be downloaded or no title/content was found.
    """
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        # Do not parse 4xx/5xx error pages as if they were articles.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Try multiple title patterns; keep looking when a match has no text.
        title = None
        for selector in ["h1", ".article-title", ".headline", ".title"]:
            node = soup.select_one(selector)
            if node:
                text = node.get_text(strip=True)
                if text:
                    title = text
                    break

        # Try multiple content sections, from most to least specific.
        content = ""
        for selector in ["article", ".article-content", ".story-content", "p"]:
            section = soup.select(selector)
            if section:
                text = " ".join([p.get_text(" ", strip=True) for p in section])
                if text.strip():
                    content = text
                    break

        if title and content:
            return {
                "title": title,
                "content": content,
                "link": url,
                "source": "SCRAPER",
                "published_at": datetime.utcnow(),
            }

    except Exception as e:
        print(f"Scrape error for {url}: {e}")

    return None


In [31]:
def scrape_news_site(homepage_url, limit=20):
    """
    Gather candidate article URLs from a landing page and scrape each of them.

    Progress is printed before link extraction, after it, and once scraping
    is done. Returns the list of articles for which scrape_article() produced
    a truthy result.
    """
    print(f"Extracting links from: {homepage_url}")
    candidate_urls = extract_links(homepage_url, limit=limit)
    print(f"Found {len(candidate_urls)} potential article links")

    # Keep only the pages that actually yielded an article.
    scraped = [item for item in map(scrape_article, candidate_urls) if item]

    print(f"Scraped {len(scraped)} articles successfully")
    return scraped


In [32]:
# Landing pages to crawl with scrape_news_site(); this rebinds the
# `scraper_pages` name used by an earlier cell.
scraper_pages = [
    "https://www.reuters.com/markets/",
]

# Accumulate the scraped article dicts from every landing page.
scraped_articles = []
for url in scraper_pages:
    scraped_articles.extend(scrape_news_site(url, limit=25))

# Cell result: number of articles scraped.
len(scraped_articles)


Extracting links from: https://www.reuters.com/markets/
Found 0 potential article links
Scraped 0 articles successfully


0

In [33]:
# Landing pages from three different sites to run through scrape_news_site().
pages = [
    "https://www.reuters.com/markets/",
    "https://www.cnbc.com/markets/",
    "https://www.business-standard.com/markets",
]

# Accumulate the scraped article dicts across all sites.
all_scraped = []
for p in pages:
    all_scraped.extend(scrape_news_site(p, limit=20))

# Cell result: number of articles scraped.
len(all_scraped)


Extracting links from: https://www.reuters.com/markets/
Found 0 potential article links
Scraped 0 articles successfully
Extracting links from: https://www.cnbc.com/markets/
Found 14 potential article links
Scraped 0 articles successfully
Extracting links from: https://www.business-standard.com/markets
Found 0 potential article links
Scraped 0 articles successfully


0

In [37]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import pandas as pd


In [38]:
def extract_links(url, base=None, timeout=10):
    """Return the distinct article-like links found on a page, in page order.

    Args:
        url: Page to download.
        base: Base URL used to resolve relative links; defaults to ``url``.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        Absolute http(s) URLs whose raw ``href`` contains one of the
        article-like keywords. Empty on any download/parse error (the error
        is printed).
    """
    keywords = (
        "article", "story", "/markets/", "/business/",
        "/economy/", "/finance/", "/world/",
    )

    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        # Do not mine links out of 4xx/5xx error pages.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        links = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]

            # Ignore useless links
            if not any(keyword in href for keyword in keywords):
                continue

            # Make absolute URL
            full_url = urljoin(base or url, href)

            # mailto:/javascript:/tel: targets can contain a keyword but are
            # never fetchable pages.
            if not full_url.startswith(("http://", "https://")):
                continue

            # De-duplicate while keeping page order, so results are deterministic.
            if full_url not in seen:
                seen.add(full_url)
                links.append(full_url)

        return links

    except Exception as e:
        print("Error while extracting links:", e)
        return []


In [40]:
def scrape_article(url, timeout=10, min_length=200):
    """Scrape the first ``<h1>`` and all paragraph text from an article page.

    Args:
        url: Article page to download.
        timeout: Seconds to wait for the HTTP request.
        min_length: Minimum number of characters of paragraph text required
            for the page to count as an article.

    Returns:
        A dict with ``title``, ``content``, ``link``, ``source`` ("SCRAPER")
        and ``published_at`` (the fetch time), or ``None`` when the download
        fails, there is no ``<h1>``, or the text is shorter than ``min_length``.
    """
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        # Do not parse 4xx/5xx error pages as if they were articles.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Title
        title = soup.find("h1")
        if not title:
            return None

        title = title.get_text(strip=True)

        # Article body: every paragraph on the page, joined with spaces.
        paragraphs = soup.find_all("p")
        content = " ".join(p.get_text(strip=True) for p in paragraphs)

        # Minimum length filter
        if len(content) < min_length:
            return None

        return {
            "title": title,
            "content": content,
            "link": url,
            "source": "SCRAPER",
            "published_at": datetime.utcnow()
        }

    except Exception as e:
        print("Scraping Error:", e)
        return None


In [42]:
def scrape_from_homepage(homepage_url):
    """Extract candidate links from a landing page and scrape each one.

    Prints progress before and after link extraction and after scraping, and
    returns the articles for which scrape_article() gave a truthy result.
    """
    print("Extracting links from:", homepage_url)
    candidate_urls = extract_links(homepage_url)

    print(f"Found {len(candidate_urls)} potential article links")

    # Keep only the pages that actually yielded an article.
    scraped = [item for item in map(scrape_article, candidate_urls) if item]

    print(f"Scraped {len(scraped)} full articles")
    return scraped


In [43]:
# Reuters section pages to crawl with scrape_from_homepage(); this rebinds
# the `scraper_pages` name used by earlier cells.
scraper_pages = [
    "https://www.reuters.com/markets/",
    "https://www.reuters.com/business/",
    "https://www.reuters.com/world/"
]

# Accumulate the scraped article dicts from every section page.
scraped_articles = []

for page in scraper_pages:
    scraped_articles.extend(scrape_from_homepage(page))

# Cell result: number of articles scraped.
len(scraped_articles)


Extracting links from: https://www.reuters.com/markets/
Found 0 potential article links
Scraped 0 full articles
Extracting links from: https://www.reuters.com/business/
Found 0 potential article links
Scraped 0 full articles
Extracting links from: https://www.reuters.com/world/
Found 0 potential article links
Scraped 0 full articles


0

In [48]:
import feedparser
from datetime import datetime

def reuters_rss():
    """Fetch the Reuters business-news RSS feed and return its entries as article dicts.

    Returns:
        A list of dicts with ``title``, ``content``, ``link``, ``source``
        ("ReutersRSS") and ``published_at`` (the entry's publish time when the
        feed provides one, otherwise the fetch time). Empty when the feed
        cannot be parsed; the parser's error is printed in that case.
    """
    # NOTE(review): confirm this endpoint is still served -- an empty result
    # otherwise looks identical to "no news".
    url = "https://feeds.reuters.com/reuters/businessNews"
    feed = feedparser.parse(url)

    # feedparser reports problems through the `bozo` flag; surface a failure
    # only when nothing usable was recovered from the feed.
    if getattr(feed, "bozo", False) and not feed.entries:
        print(
            "Reuters RSS error:",
            getattr(feed, "bozo_exception", "feed could not be parsed"),
        )
        return []

    articles = []
    for entry in feed.entries:
        # The parsed date can be absent or empty, so test the value itself.
        parsed = entry.get("published_parsed")
        articles.append({
            "title": entry.get("title", ""),
            "content": entry.get("summary") or "",
            "link": entry.get("link", ""),
            "source": "ReutersRSS",
            "published_at": datetime(*parsed[:6]) if parsed else datetime.utcnow()
        })

    return articles


In [50]:
# Fetch the Reuters feed, then show the article count with the first two entries.
reuters_articles = reuters_rss()
len(reuters_articles), reuters_articles[:2]


(0, [])