In [33]:
import duckdb
import pandas as pd
import tqdm

# import concurrent.futures
from tqdm.contrib.concurrent import thread_map

In [34]:
import logging

# Notebook-wide logger: WARNING and above, timestamped, written to a stream handler.
logger = logging.getLogger("my_logger")
logger.setLevel("WARNING")

# Attach the handler only once. getLogger() returns the same object every time,
# so re-running this cell would otherwise stack another handler on it and every
# message would be emitted once per re-run.
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s [%(levelname)s] : %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

## Getting data from the database

In [35]:
# Open a connection to the on-disk DuckDB database file.
conn = duckdb.connect(database="../datasets/ticker_data.db")

In [36]:
# Rows are numbered within each source, newest date_posted first, and only the
# first two rows per source are kept.
latest_two_per_source_sql = """with ac AS(
                select date_posted, ticker, source, article_link, 
                    row_number() over (
                        partition by source order by date_posted desc
                    ) as row_number 
                from article_data
            ) 
            select ticker, source, article_link, date_posted
            from ac 
            where row_number <= 2
        """

# Run the query and materialise the result as a pandas DataFrame.
query_result = conn.execute(latest_two_per_source_sql)
articles = query_result.df()
articles.head()

Unnamed: 0,ticker,source,article_link,date_posted
0,HDFCBANK,Cafemutual,https://cafemutual.com/news/industry/32988-mee...,2024-09-03 11:33:47
1,HDFCBANK,Cafemutual,https://cafemutual.com/news/industry/32988-mee...,2024-09-03 11:33:13
2,BPCL,ET EnergyWorld,https://energy.economictimes.indiatimes.com/ne...,2024-08-30 15:49:25
3,BPCL,ET EnergyWorld,https://energy.economictimes.indiatimes.com/ne...,2024-08-30 15:30:15
4,LT,Firstpost,https://www.firstpost.com/tech/lt-to-set-up-30...,2024-09-11 11:33:30


In [37]:
# Distinct values of the "source" column, in order of first appearance.
sources = articles.loc[:, "source"].unique()
print(f"Unique Sources: {len(sources)}")

Unique Sources: 221


In [38]:
# Several tickers can point at the same article, so keep each link only once
# and renumber the resulting Series from 0.
articles_link = articles["article_link"].drop_duplicates().reset_index(drop=True)
print(f"Unique article links: {len(articles_link)}")

Unique article links: 273


## Testing the modules

### Using Newspaper4k

In [39]:
import newspaper
import newspaper.mthreading as npr

In [40]:
# For a given source, take its article links (at most 2, as selected by the query above), fetch their content with the library under test, and return the result as a dictionary keyed by source.


def get_article_content(source):
    """Fetch the articles of one source with newspaper4k and extract key fields.

    The article links are looked up in the module-level ``articles`` DataFrame
    (rows whose ``source`` column equals ``source``) and handed to
    ``newspaper.mthreading.fetch_news``. A fixed set of attributes is then read
    from every returned article object.

    Args:
        source: Value of the ``source`` column whose article links are fetched.

    Returns:
        ``{source: [ {attribute: value, ...}, ... ]}`` on success, or ``None``
        if fetching or attribute extraction raised. The failure is logged as a
        warning, so callers should skip ``None`` results.
    """
    # Log through the notebook's configured "my_logger" logger instead of the
    # root logger, so its level, handler and timestamped format actually apply.
    logger.info(f"processing {source}")

    article_attrs = [
        "title",
        "url",
        "summary",
        "text_cleaned",
        "publish_date",
        "authors",
    ]

    source_articles = articles.loc[
        articles.loc[:, "source"] == source
    ].article_link.to_list()

    try:
        result = npr.fetch_news(source_articles)
        return {
            source: [
                {attr: getattr(article_obj, attr) for attr in article_attrs}
                for article_obj in result
            ]
        }
    except Exception as e:
        # Best effort: one failing source must not abort the whole batch.
        logger.warning(f"Couldn't fetch data for {source}. \nException: {e}")
        return None

In [41]:
# Run get_article_content for every source on a thread pool with a progress bar.
results = thread_map(get_article_content, sources)

# Merge the per-source dictionaries into one mapping keyed by source.
source_articles = {}
for per_source in results:
    if per_source is None:
        # None marks a source whose fetch failed; nothing to merge.
        continue
    source_articles.update(per_source)

  0%|          | 0/221 [00:00<?, ?it/s]

Exception: Article `download()` failed with Status code 404 for url None on URL https://www.carandbike.com/videos/hero-mavrick-440-vs-bajaj-dominar-400-poweplay-comparison-review-carandbike-735787
Exception: Article `download()` failed with Status code 403 for url None on URL https://www.exchange4media.com/people-movement-news/britannias-neha-more-joins-ferrero-as-business-head-modern-trade-137219.html
Exception: Article `download()` failed with HTTPSConnectionPool(host='www.tipranks.com', port=443): Max retries exceeded with url: /legal/block (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x30c27a5f0>: Failed to establish a new connection: [Errno 61] Connection refused')) on URL https://www.tipranks.com/news/company-announcements/dr-reddys-engages-investors-in-upcoming-conferences-3
Exception: Article `download()` failed with Status code 403 for url None on URL https://www.wsj.com/livecoverage/stock-market-today-dow-sp500-nasdaq-live-09-19-2024
Exception:

In [42]:
# Spot-check the extracted article dictionaries stored for one source.
source_articles["Financial Times"]

[{'title': 'Infosys and Posti Extend Strategic Collaboration – Company Announcement',
  'url': 'https://markets.ft.com/data/announce/detail?dockey=600-202409190617PR_NEWS_EURO_ND__EN10652-1',
  'summary': '',
  'text_cleaned': 'To help Posti enhance customer experience and operational efficiency with an AI-first strategy leveraging Infosys Topaz\n\nBENGALURU, India, Sept. 19, 2024 /PRNewswire/ -- Infosys (NSE: INFY) (BSE: INFY) (NYSE: INFY), a global leader in next-generation digital services and consulting, today announced it is extending its strategic collaboration with Posti, the leading delivery and logistics services provider in Finland, Sweden, and the Baltics. As part of the engagement, Infosys will help Posti enhance customer experience and operational efficiency while continuing to innovate, scale, and grow its IT operations.\n\nInfosys will adopt an AI-driven approach powered by Infosys Topaz, an AI-first offering using generative AI technologies, to empower Posti with operat

---

## News Please

In [45]:
from newsplease import NewsPlease

In [43]:
def get_article_content_np(source_name: str) -> dict:
    """Fetch the articles of one source with news-please and extract key fields.

    The article links are looked up in the module-level ``articles`` DataFrame
    (rows whose ``source`` column equals ``source_name``) and passed to
    ``NewsPlease.from_urls``. A fixed set of attributes is read from every
    article object in the result.

    Args:
        source_name: Value of the ``source`` column whose links are fetched.

    Returns:
        ``{source_name: [ {attribute: value, ...}, ... ]}``. The list is empty
        when no URL produced an article object.
    """
    article_attrs = [
        "title",
        "url",
        "description",
        "maintext",
        "date_publish",
        "authors",
    ]

    source_articles = articles[articles["source"] == source_name].article_link.to_list()
    results = NewsPlease.from_urls(source_articles)

    articles_list = []
    for art_obj in results.values():
        # Skip entries that are not article objects. The original code already
        # skipped plain dicts (presumably the placeholder news-please stores
        # for a failed URL -- TODO confirm); None is skipped as well so that
        # getattr() cannot raise AttributeError on it.
        if art_obj is None or isinstance(art_obj, dict):
            continue
        articles_list.append(
            {attr: getattr(art_obj, attr) for attr in article_attrs}
        )

    return {source_name: articles_list}

In [1]:
# Request headers consisting of a single browser-like User-Agent string.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) "
        "Gecko/20100101 Firefox/20.0"
    ),
}

---

In [3]:
import requests

In [4]:
# URL of the page (with its "news" tab selected) requested in the cells below.
news_url = (
    "https://iinvest.cogencis.com"
    "/ine059a01026/cipl/ns/cipla/cipla_"
    "?tab=news"
)

In [2]:
# Browser-like request headers (Chrome 79 User-Agent plus the usual
# navigation/accept headers) sent with the requests below.
headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
    ),
    "Sec-Fetch-User": "?1",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9,hi;q=0.8",
}

In [9]:
# Always pass a timeout: requests applies none by default, so an unresponsive
# server would block this cell until it is interrupted by hand.
output = requests.get(news_url, headers=headers, timeout=30)

KeyboardInterrupt: 

In [10]:
# One shared Session object for the following requests.
s = requests.Session()

In [11]:
# Bound the wait with a timeout so a stalled server cannot hang the cell.
output = s.get("https://nseindia.com", headers=headers, timeout=30)

In [13]:
# Display the raw bytes of the most recent response body.
output.content

b'<!DOCTYPE html>\r\n<html lang="en">\r\n  <head>\r\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\r\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">\r\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />\r\n<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" />\r\n<title>\r\n    NSE - National Stock Exchange of India Ltd: Live Share/Stock Market News &amp; Updates, Quotes- Nseindia.com\r\n</title>\r\n<meta name="description" content="NSE India (National Stock Exchange of India Ltd) \xe2\x80\x93 LIVE Share/Stock Market Updates Today. Get all latest share market news, live charts, analysis, ipo, stock/share tips, indices, equity, currency and commodity market, derivatives, finance, budget, mutual fund, bond and corporate announcements more on NSEindia.com." />\r\n<meta name="keywords" content="NSE, National Stock Exchange, NSE India, Stock Exchange in India, stock exch

In [14]:
# Same request through the shared session; the timeout bounds the wait in case
# the server never answers.
output = s.get(news_url, headers=headers, timeout=30)

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [16]:
# Retry with only a User-Agent header (this rebinds `headers`) and an explicit
# 30-second timeout.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
}
response = requests.get(news_url, timeout=30, headers=headers)

ReadTimeout: HTTPSConnectionPool(host='iinvest.cogencis.com', port=443): Read timed out. (read timeout=30)