In [1]:
!pip install pandas-market-calendars
!pip install yfinance
!pip install pyspark
!pip install spark-nlp==4.1.0



In [2]:
# utilities
import os
import re
import csv
import numpy as np
import pandas as pd
import string
import sqlite3
import requests
from datetime import datetime, time, date, timedelta

# market Data management
import yfinance as yf
import holidays
import pandas_market_calendars as mcal

# Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Get Stock Info

In [3]:
# TradingView scanner market slug -> display name. Only the slug (the key) is
# used below, to build the scanner URL.
SCREENER = {
    "india" : "India"
}

# Seconds to wait for the scanner endpoint before giving up on a request.
SCREENER_TIMEOUT = 30

stocks_list = []

# Two passes per market: one with an empty `types` filter and one that
# restricts the query to "futures".
for types in ["", "\"futures\""]:
    for market in SCREENER:
        r = requests.post(f"https://scanner.tradingview.com/{market}/scan",
                          data=f'{{"symbols":{{"tickers":[],"query":{{"types":[{types}]}}}},"columns":["description"]}}',
                          timeout=SCREENER_TIMEOUT)
        # Fail loudly on an HTTP error instead of trying to parse an error body.
        r.raise_for_status()
        # Tolerate a missing/null "data" field by treating it as "no rows".
        for res in r.json().get("data") or []:
            # "s" has the form "<exchange>:<symbol>"; "d" holds the requested
            # columns, i.e. the description, which is stored as the name.
            exchange, symbol = res["s"].split(":", 1)
            stocks_list.append({'name': res['d'][0], 'exchange': exchange, 'symbol': symbol})

# Explicit columns keep the filter below valid even when no rows came back.
stocks_df = pd.DataFrame(stocks_list, columns=['name', 'exchange', 'symbol'])
stocks_df = stocks_df.loc[stocks_df['exchange'] == 'NSE']

In [4]:
 ## content will be turned into lower case before searching
# Lower-case keyword -> Yahoo Finance ticker (".NS" suffix).
# Keywords are matched as plain substrings of the article text, in the order
# listed here, so the first matching entry wins.
# ' itc ' is deliberately padded with spaces so that it only matches the
# stand-alone word and not letters inside a longer word.
stock_info = {
    'tata motors' : 'TATAMOTORS.NS',
    'reliance' : 'RELIANCE.NS',
    'hdfc bank': 'HDFCBANK.NS',
    'bajaj auto': 'BAJAJ-AUTO.NS',
    'maruti suzuki': 'MARUTI.NS',
    'hero motocorp': 'HEROMOTOCO.NS',
    'britannia': 'BRITANNIA.NS',
    'nestle': 'NESTLEIND.NS',
    'ntpc': 'NTPC.NS',
    'titan': 'TITAN.NS',
    'cipla': 'CIPLA.NS',
    'ongc': 'ONGC.NS',
    'tcs': 'TCS.NS',
    'l&t': 'LT.NS',
    'coal india': 'COALINDIA.NS',
    'kotak mahindra bank': 'KOTAKBANK.NS',
    ' itc ': 'ITC.NS',
    'bajaj finance': 'BAJFINANCE.NS',
    'hdfc life': 'HDFCLIFE.NS',
    'wipro': 'WIPRO.NS',
    'tata steel': 'TATASTEEL.NS',
    # Spelled "indusind" (with an i); the previous "induslnd" never matched.
    'indusind bank': 'INDUSINDBK.NS',
    'bajaj finserv': 'BAJAJFINSV.NS',
    'ultratech cement': 'ULTRACEMCO.NS',
    'airtel': 'BHARTIARTL.NS',
    'hindalco': 'HINDALCO.NS',
    'shree cement': 'SHREECEM.NS',
    'tech mahindra': 'TECHM.NS'}

### Load Articles 

In [5]:
# Absolute path of the `data` folder located in the parent of the current
# working directory; every CSV read or written by this notebook lives there.
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'data'))

In [6]:
# While this is False, the article-loading cell rebuilds article_data.csv
# from article_data_raw.csv; set it to True to skip that rebuild.
ARTICLES_UPDATED = False

In [7]:
if not ARTICLES_UPDATED:
    df = pd.read_csv(os.path.join(DATA_DIR,'article_data_raw.csv'))

    # The raw file stores each article as 5 consecutive rows whose value sits
    # in the second column: title, datetime, description, content (the fifth
    # row of each group is not used).
    raw_values = df.iloc[:, 1]

    # Collect plain records first and build the frame once; concatenating a
    # one-row frame per article copies the whole table on every iteration.
    records = [
        {'Datetime': raw_values.iloc[i + 1],
         'Title': raw_values.iloc[i],
         'Description': raw_values.iloc[i + 2],
         'Content': raw_values.iloc[i + 3]}
        for i in range(0, len(df), 5)
    ]

    # Explicit columns keep the expected schema even when the raw file is empty.
    article_data = pd.DataFrame(records, columns=['Datetime', 'Title', 'Description', 'Content'])

    # change content to lower case
    article_data['Content'] = article_data['Content'].astype(str).str.lower()
    article_data.to_csv(os.path.join(DATA_DIR,'article_data.csv'),index=False)

In [8]:
# Load the consolidated article table (article_data.csv) and display it.
df_main = pd.read_csv(os.path.join(DATA_DIR,'article_data.csv'))
df_main

Unnamed: 0,Datetime,Title,Description,Content
0,2019-11-27 09:40:00+00:00,Maharashtra government formation: Uddhav Thack...,"Thackeray, to be sworn-in as CM on Thursday, w...",maharashtra cm-designate and shiv sena chief u...
1,2019-11-26 15:24:00+00:00,Maharashtra government formation | Ajit Pawar ...,The resignation comes an hour before Maharasht...,days after he was sworn-in as the deputy chief...
2,2019-11-27 22:25:00+00:00,"Aaditya Thackeray meets Sonia Gandhi, invites ...",Aaditya Thackeray had a brief meeting with Gan...,shiv sena leader aaditya thackeray on november...
3,2019-11-29 15:17:00+00:00,PM Modi announces $450 million line of credit ...,The talks focused on issues like fulfilling as...,prime minister narendra modi announced on frid...
4,2019-11-28 18:50:00+00:00,Manchester City owners acquire ISL team Mumbai...,"With acquisition of Mumbai City FC, City Footb...",the owners of premier league champions manches...
...,...,...,...,...
682,2020-01-13 15:56:00+00:00,5 things you should know about BS VI-compliant...,The company is offering two new colours with t...,royal enfield made its entry in the bs vi-comp...
683,2020-01-18 17:03:00+00:00,This week in Auto: Bajaj Chetak launced; Compa...,Here is a complete round-up of the big news fr...,bajaj finally made the chetak available again ...
684,2020-01-21 09:05:00+00:00,Secure credit and debit card usage: How RBI's ...,These are good steps towards curbing card frau...,using your credit and debit cards is about to ...
685,2020-01-13 22:38:00+00:00,Tesla offers $1 million to white hat hackers i...,While a million dollars may seem like a substa...,tesla has been heavily investing in cybersecur...


In [9]:
def find_stock_in_content(content):
    """Return the ticker of the first tracked company mentioned in `content`.

    The text is converted to a string and lower-cased here, so matching does
    not depend on the caller having normalised the case beforehand. Keywords
    from `stock_info` are tested as plain substrings in dictionary order.

    Parameters
    ----------
    content : Any
        Article text; non-string values (e.g. NaN) are stringified first.

    Returns
    -------
    str or None
        The ticker mapped to the first matching keyword, or None when no
        keyword occurs in the text.
    """
    haystack = str(content).lower()
    for keyword, ticker in stock_info.items():
        if keyword in haystack:
            return ticker
    return None

### Get stock movement after news

In [10]:
bse = mcal.get_calendar('BSE')

# Exchange holidays as plain `datetime.date` objects in a frozenset: lookups
# are O(1) and compare like-with-like against the dates passed in below.
# A dedicated name is used so the imported `holidays` module is not shadowed.
BSE_HOLIDAYS = frozenset(pd.Timestamp(day).date() for day in bse.holidays().holidays)

def next_trading_date(date):
    """Return the first trading day strictly after `date`.

    A day counts as a trading day when it is a weekday (Monday-Friday) and is
    not listed in `BSE_HOLIDAYS`.

    Parameters
    ----------
    date : datetime.date
        Reference day; the search starts on the following calendar day.

    Returns
    -------
    datetime.date
        The next weekday that is not an exchange holiday.
    """
    date += timedelta(1)
    # isoweekday(): Monday == 1 ... Sunday == 7, so > 5 means Saturday/Sunday.
    while (date in BSE_HOLIDAYS) or (date.isoweekday() > 5):
        date += timedelta(1)
    return date

In [11]:
def get_change(ticker,dt):
    """Return the open-to-close percentage move of `ticker` on the first
    session that can react to news published at `dt`.

    Session selection:
    * published after 15:00 -> the next trading day;
    * otherwise -> the publication day itself when it is a trading day, or
      the next trading day when the market is closed on that day.

    The closed-market case previously assigned the rolled-forward date to an
    unused variable, so a non-trading day was still queried; it is now applied.

    NOTE(review): `dt.time()` is compared as-is with the 15:00 cut-off. The
    timestamps shown in this notebook carry a +00:00 offset while the cut-off
    looks like an exchange-local time; confirm whether a timezone conversion
    is needed.

    Parameters
    ----------
    ticker : str
        Symbol passed to `yf.download`.
    dt : datetime.datetime
        Publication timestamp of the article.

    Returns
    -------
    float
        ((Adj Close - Open) / Open) * 100 rounded to 2 decimals for that
        session, or NaN when no price data is returned.
    """
    published_on = dt.date()

    if dt.time() > time(15,0,0):
        # Too late for today's session: measure the reaction on the next one.
        srt_date = next_trading_date(published_on)
    else:
        # next_trading_date(day - 1) yields `day` itself when it is a trading
        # day and otherwise rolls forward to the next open session.
        srt_date = next_trading_date(published_on - timedelta(1))

    # One daily bar: [srt_date, srt_date + 1 day). `start`/`end` fully define
    # the window, so no `period` is passed; `yf.download` has no intraday
    # start-time parameter.
    data = yf.download(tickers=ticker, start=srt_date, end=srt_date + timedelta(1))

    if data.empty:
        return np.nan

    # Positional access via .iloc: the frame is indexed by date, not by 0..n.
    open_price = data['Open'].iloc[0]
    close_price = data['Adj Close'].iloc[0]
    change = ((close_price - open_price) / open_price) * 100
    return round(change,2)

In [12]:
# Attach the post-news percentage move of the mentioned stock to every article.
# While CHANGE_CSV is False the values are recomputed and the CSV is rewritten.
CHANGE_CSV = False

if not CHANGE_CSV:
    change = []
    for raw_datetime, content in zip(df_main['Datetime'], df_main['Content']):
        dt = datetime.strptime(raw_datetime, '%Y-%m-%d %H:%M:%S%z')
        ticker = find_stock_in_content(content)
        # Articles that mention no tracked company get NaN.
        change.append(np.nan if ticker is None else get_change(ticker, dt))

    df_main['Change'] = change
    df_main.to_csv(os.path.join(DATA_DIR,'change_percent_data.csv'),index=False)
  

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- TATAMOTORS.NS: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*******

Processing steps -
*   cleaning
*   tokenization
*   stop word removal 
*   lemmatization
*   stemming

In [13]:
# Reload the article table that now carries the `Change` column.
df_main = pd.read_csv(os.path.join(DATA_DIR,'change_percent_data.csv'))

In [14]:
# Display the current contents of df_main.
df_main

Unnamed: 0,Datetime,Title,Description,Content,Change
0,2019-11-27 09:40:00+00:00,Maharashtra government formation: Uddhav Thack...,"Thackeray, to be sworn-in as CM on Thursday, w...",maharashtra cm-designate and shiv sena chief u...,
1,2019-11-26 15:24:00+00:00,Maharashtra government formation | Ajit Pawar ...,The resignation comes an hour before Maharasht...,days after he was sworn-in as the deputy chief...,
2,2019-11-27 22:25:00+00:00,"Aaditya Thackeray meets Sonia Gandhi, invites ...",Aaditya Thackeray had a brief meeting with Gan...,shiv sena leader aaditya thackeray on november...,
3,2019-11-29 15:17:00+00:00,PM Modi announces $450 million line of credit ...,The talks focused on issues like fulfilling as...,prime minister narendra modi announced on frid...,
4,2019-11-28 18:50:00+00:00,Manchester City owners acquire ISL team Mumbai...,"With acquisition of Mumbai City FC, City Footb...",the owners of premier league champions manches...,
...,...,...,...,...,...
682,2020-01-13 15:56:00+00:00,5 things you should know about BS VI-compliant...,The company is offering two new colours with t...,royal enfield made its entry in the bs vi-comp...,
683,2020-01-18 17:03:00+00:00,This week in Auto: Bajaj Chetak launced; Compa...,Here is a complete round-up of the big news fr...,bajaj finally made the chetak available again ...,-11.59
684,2020-01-21 09:05:00+00:00,Secure credit and debit card usage: How RBI's ...,These are good steps towards curbing card frau...,using your credit and debit cards is about to ...,
685,2020-01-13 22:38:00+00:00,Tesla offers $1 million to white hat hackers i...,While a million dollars may seem like a substa...,tesla has been heavily investing in cybersecur...,


### 1. Cleaning

In [15]:
def clean_content(content):
    """Strip punctuation and ASCII digits from `content` and squeeze spaces.

    The input is stringified first. Every character that is neither a word
    character nor whitespace is removed, then the digits 0-9 are removed, and
    finally each run of space characters is collapsed to a single space.
    Other whitespace (tabs, newlines) is left as it is.
    """
    text = re.sub(r'[^\w\s]', '', str(content))
    text = ''.join(ch for ch in text if ch not in string.digits)
    return re.sub(' +', ' ', text)

In [16]:
# Drop rows with any missing value, then clean the text of what remains.
df_main.dropna(inplace=True)
df_main['Content'] = df_main['Content'].map(clean_content)

### 2. Tokenization

In [17]:
# Download the NLTK 'punkt' and 'wordnet' packages (a no-op when they are
# already up to date, as the cell output shows).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Split each cleaned article into a list of word tokens.
df_main['Content'] = df_main['Content'].map(nltk.word_tokenize)

### 3. Stop word removal

In [19]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stop_words(content):
    """Return the tokens of `content` that are not English stop words.

    Parameters
    ----------
    content : iterable of str
        Tokens of one article.

    Returns
    -------
    list of str
        The tokens, in their original order, minus every entry of `stop_words`.
    """
    # Build a set once per call: membership tests become O(1) instead of a
    # scan of the whole stop-word list for every token, while still honouring
    # the current contents of `stop_words`.
    excluded = set(stop_words)
    return [token for token in content if token not in excluded]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Filter the stop words out of every token list.
df_main['Content'] = df_main['Content'].map(remove_stop_words)

### 4. Lemmatization

In [21]:
nltk.download('omw-1.4')
def lemmatize(content):
    """Return a list with the WordNet lemma of every word in `content`.

    A fresh `WordNetLemmatizer` is created on each call and applied to the
    words one by one, keeping their order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, content))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# Replace every token with its lemma.
df_main['Content'] = df_main['Content'].map(lemmatize)

In [23]:
## Convert the lemmatised token list into a comma-separated string
def list_to_string(ls):
    """Join the strings in `ls` with commas and strip any trailing commas.

    An empty list yields an empty string.
    """
    return ','.join(ls).rstrip(',')

# Store each article's tokens as one comma-separated string.
df_main['Content'] = df_main['Content'].map(list_to_string)

In [24]:
# Renumber the rows 0..n-1 after the earlier dropna; drop=True discards the old
# index directly instead of materialising it as a column and deleting it again.
df_main = df_main.reset_index(drop=True)

In [25]:
# Per-column check: True would mean the column still contains a missing value.
df_main.isnull().any()


Datetime       False
Title          False
Description    False
Content        False
Change         False
dtype: bool

In [26]:
# Overwrite the file (default write mode) rather than appending: with mode='a'
# every re-run of the notebook added another header line and a duplicate copy
# of all rows to preprocessed_data.csv.
df_main.to_csv(os.path.join(DATA_DIR,'preprocessed_data.csv'),index=False)