Imports

In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from pprint import pprint
from nsepython import *
from datetime import datetime
import time

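Get company tickers from the NSE website. The code below loads the [Nifty 50 list](https://www1.nseindia.com/content/indices/ind_nifty50list.csv); the [Nifty 500 list](https://www1.nseindia.com/content/indices/ind_nifty500list.csv) is also available.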

In [2]:
# Index constituent CSVs published by NSE. Only the Nifty 50 list is loaded
# here; the Nifty 500 URL is kept so the universe can be switched by hand.
nifty_50_ticker_url = 'https://www1.nseindia.com/content/indices/ind_nifty50list.csv'
nifty_500_ticker_url = 'https://www1.nseindia.com/content/indices/ind_nifty500list.csv'

# Full constituent table as downloaded.
tickers_file = pd.read_csv(nifty_50_ticker_url)

# Keep only the symbol and the company name; the symbol column on its own is
# what the later cells iterate over.
ticker_columns = ['Symbol', 'Company Name']
tickers_df = tickers_file[ticker_columns]
tickers = tickers_df.loc[:, 'Symbol']
tickers.head()

0      ADANIENT
1    ADANIPORTS
2    APOLLOHOSP
3    ASIANPAINT
4      AXISBANK
Name: Symbol, dtype: object

In [3]:
# Symbols singled out for exclusion from the ticker list.
# NOTE(review): the only use of this list is the np.setdiff1d cell, whose
# result is not assigned, so 'M&M' is still part of `tickers` downstream.
outliers = ['M&M']

In [5]:
# Show the complete symbol Series (not just the head).
tickers

0       ADANIENT
1     ADANIPORTS
2     APOLLOHOSP
3     ASIANPAINT
4       AXISBANK
5     BAJAJ-AUTO
6     BAJFINANCE
7     BAJAJFINSV
8           BPCL
9     BHARTIARTL
10     BRITANNIA
11         CIPLA
12     COALINDIA
13      DIVISLAB
14       DRREDDY
15     EICHERMOT
16        GRASIM
17       HCLTECH
18      HDFCBANK
19      HDFCLIFE
20    HEROMOTOCO
21      HINDALCO
22    HINDUNILVR
23          HDFC
24     ICICIBANK
25           ITC
26    INDUSINDBK
27          INFY
28      JSWSTEEL
29     KOTAKBANK
30            LT
31           M&M
32        MARUTI
33          NTPC
34     NESTLEIND
35          ONGC
36     POWERGRID
37      RELIANCE
38       SBILIFE
39          SBIN
40     SUNPHARMA
41           TCS
42    TATACONSUM
43    TATAMOTORS
44     TATASTEEL
45         TECHM
46         TITAN
47           UPL
48    ULTRACEMCO
49         WIPRO
Name: Symbol, dtype: object

In [4]:
# Preview the symbols that remain once `outliers` is removed.
# The result is only displayed, not assigned: `tickers` itself is unchanged.
np.setdiff1d(tickers, outliers)

array(['ADANIENT', 'ADANIPORTS', 'APOLLOHOSP', 'ASIANPAINT', 'AXISBANK',
       'BAJAJ-AUTO', 'BAJAJFINSV', 'BAJFINANCE', 'BHARTIARTL', 'BPCL',
       'BRITANNIA', 'CIPLA', 'COALINDIA', 'DIVISLAB', 'DRREDDY',
       'EICHERMOT', 'GRASIM', 'HCLTECH', 'HDFC', 'HDFCBANK', 'HDFCLIFE',
       'HEROMOTOCO', 'HINDALCO', 'HINDUNILVR', 'ICICIBANK', 'INDUSINDBK',
       'INFY', 'ITC', 'JSWSTEEL', 'KOTAKBANK', 'LT', 'MARUTI',
       'NESTLEIND', 'NTPC', 'ONGC', 'POWERGRID', 'RELIANCE', 'SBILIFE',
       'SBIN', 'SUNPHARMA', 'TATACONSUM', 'TATAMOTORS', 'TATASTEEL',
       'TCS', 'TECHM', 'TITAN', 'ULTRACEMCO', 'UPL', 'WIPRO'],
      dtype=object)

Scrape Article Headlines and Dates

In [None]:
# Base URL of the per-company pages; the scraping loop appends the ticker
# symbol to it to build each request URL.
news_url = 'https://ticker.finology.in/company/'

In [None]:
# Scrape the news headlines shown on each company's page.
#
# For every symbol in `tickers` the page `<news_url>/<symbol>` is fetched and
# each `.newslink` element contributes one row: [ticker, headline, date, time].
# The timestamp text must match '%d %b %Y, %I:%M%p' (e.g. '05 Jan 2023, 09:30AM');
# anything else raises ValueError from strptime.
data = []
companies_len = len(tickers)

# Tolerate a trailing '/' on news_url so the request path is 'company/SYMBOL'
# rather than 'company//SYMBOL'.
base_url = news_url.rstrip('/')
request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}

# Iterate over the symbols themselves instead of a hard-coded range, so the loop
# always matches the size of the loaded ticker list.
for position, ticker in enumerate(tickers, start=1):
    print('{}/{} {}'.format(position, companies_len, ticker))
    req = Request(url='{}/{}'.format(base_url, ticker), headers=request_headers)
    # Close the HTTP response as soon as the page is parsed; the timeout keeps a
    # stalled connection from hanging the whole run.
    with urlopen(req, timeout=30) as response:
        # Name the parser explicitly so results do not depend on which optional
        # parser libraries happen to be installed.
        html = BeautifulSoup(response, 'html.parser')
    for link in html.select('.newslink'):
        title_tag = link.find('span', class_='h6')
        stamp_tag = link.find('small')
        if title_tag is None or stamp_tag is None:
            # Entry without a headline or timestamp: nothing usable to record.
            continue
        # Split the parsed timestamp into separate date and time strings.
        date_time_obj = datetime.strptime(stamp_tag.text.strip(), '%d %b %Y, %I:%M%p')
        art_date = date_time_obj.strftime('%Y/%m/%d')
        art_time = date_time_obj.strftime('%H:%M')
        data.append([ticker, title_tag.text, art_date, art_time])

df = pd.DataFrame(data, columns=['Ticker', 'Headline', 'Date', 'Time'])

In [None]:
# Leftover scratch check: 4//200 is 0 (falsy), so nothing is printed.
if 4//200:
    print('hello')    

In [None]:
# Print the scraped rows (Ticker, Headline, Date, Time) as plain text.
pprint(df)

Sentiment Analysis

In [None]:
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Run the VADER analyzer over every scraped headline and collect the results,
# one row per headline, in the same order as `df`.
vader = SentimentIntensityAnalyzer()
scores = [vader.polarity_scores(headline) for headline in df['Headline']]
scores_df = pd.DataFrame(scores)

In [None]:
# Attach the score columns to their headlines. Both frames carry the same
# default 0..n-1 index (scores_df is built row-for-row from df['Headline']),
# so an index join lines them up directly, with no temporary 'key_0' merge
# key to create and drop.
new_df = df.join(scores_df)

In [None]:
# Inspect the headline rows with their sentiment score columns attached.
new_df

In [None]:
# Average the sentiment scores per ticker (result is indexed by 'Ticker').
# numeric_only=True restricts the mean to the numeric score columns: the text
# columns (Headline, Date, Time) cannot be averaged, and a plain .mean() raises
# TypeError on them with pandas >= 2.0.
final_df = new_df.groupby('Ticker').mean(numeric_only=True)

In [None]:
# Inspect the per-ticker mean scores.
final_df

Get Company Sector, Industry, and Market-Cap Data

In [None]:
# Look up sector, industry and market capitalisation for every scored ticker.
#
# The lookups are driven by final_df's own index (the 'Ticker' group keys), so
# each list lines up row-for-row with final_df. Driving them from `tickers`
# would not: the groupby sorts its keys, and a ticker with no scraped headlines
# has no row at all. This cell therefore expects final_df to still be indexed
# by ticker, i.e. to run before the index is reset.
sector = []
industry = []
mCap = []
for symbol in final_df.index:
    meta = nse_eq(symbol)
    ticker_sector = meta['industryInfo']['macro']
    ticker_industry = meta['industryInfo']['sector']
    # previous close * issued size, scaled by 1e9 to express it in billions.
    ticker_mcap = round((meta['priceInfo']['previousClose'] * meta['securityInfo']['issuedSize'])/1000000000, 2)

    sector.append(ticker_sector)
    industry.append(ticker_industry)
    mCap.append(ticker_mcap)

    # Progress log for the current symbol.
    print(symbol)
    pprint('Sector: {}'.format(ticker_sector))
    pprint('Industry: {}'.format(ticker_industry))
    print('market cap is Rs {}'.format(ticker_mcap))
    print('\n')

final_df['sector'] = sector
final_df['industry'] = industry
final_df['mCap (Billion)'] = mCap

In [None]:
# Dump the raw nse_eq payload for one symbol to inspect its structure.
pprint(nse_eq('APOLLOHOSP'))

In [None]:
# Move the 'Ticker' group key out of the index into a regular column so it can
# serve as a merge key. Not idempotent: running this cell a second time adds an
# extra 'index' column.
final_df = final_df.reset_index()

In [None]:
# Inspect the frame now that 'Ticker' is a regular column.
final_df

In [None]:
# Bring in each company's full name by matching 'Ticker' against the 'Symbol'
# column of tickers_df, then discard the now-redundant 'Symbol' key.
with_company_names = final_df.merge(tickers_df, left_on='Ticker', right_on='Symbol')
final_df = with_company_names.drop(columns='Symbol')

In [None]:
# Rename every column, by position, to its presentation name.
# NOTE(review): this relies on final_df having exactly these nine columns in
# this order (ticker, four score columns, sector, industry, market cap,
# company name) — confirm before re-running cells out of order.
final_df.columns = ['Symbol', 'Negative', 'Neutral', 'Positive', 'Sentiment Score', 'Sector', 'Industry', 'MCap (Billion)', 'Company Name']

In [None]:
# Inspect the final table that feeds the treemap.
final_df

Plotting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

In [None]:
# Treemap of the scored companies: tiles nest Sector -> Industry -> Symbol under
# a constant root, tile area comes from 'MCap (Billion)' and tile color from
# 'Sentiment Score' on a red / black / green scale centered at 0.
# NOTE(review): the root label says 'Nifty 500', but the ticker list earlier in
# the notebook is read from the Nifty 50 CSV — confirm which is intended.
fig = px.treemap(
    final_df, path=[px.Constant('Nifty 500'), 'Sector', 'Industry', 'Symbol'], values='MCap (Billion)', color='Sentiment Score',
    hover_data=['Company Name', 'Negative', 'Neutral', 'Positive', 'Sentiment Score'], color_continuous_scale=['#FF0000', "#000000", '#00FF00'], color_continuous_midpoint=0
    )
# Replace the trace's customdata with the per-company columns so the text
# template below can print the sentiment score (customdata[4]) under each label.
# NOTE(review): this assumes the trace's node order matches final_df's row order
# and provides no rows for the parent (sector / industry) nodes — verify the
# printed scores against the hover values.
fig.data[0].customdata = final_df[['Company Name', 'Negative', 'Neutral', 'Positive', 'Sentiment Score']]
fig.data[0].texttemplate = "%{label}<br>%{customdata[4]}"
fig.update_traces(textposition="middle center")
fig.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)
fig.show()

In [None]:
# Disabled single-ticker prototype of the scraping loop. It is wrapped in a
# string literal, so none of it executes; the cell only evaluates to that string.
'''
req = Request(url= '{}/{}'.format(news_url, tickers[0]),headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})
response = urlopen(req)
html = BeautifulSoup(response)
data = [] 
news_links = html.select('.newslink')
for link in news_links:
    title = link.find('span', class_='h6').text
    date = link.find('small').text
    data.append([title, date])

df = pd.DataFrame(data, columns=['Title', 'Date'])

print(df)
'''

References

https://blog.devgenius.io/best-way-to-speed-up-a-bulk-of-http-requests-in-python-4ec75badabed