In [None]:
!pip install pandas-market-calendars
!pip install yfinance
!pip install pyspark
!pip install spark-nlp==4.1.0

In [1]:
# utilities
import re
import csv
import numpy as np
import pandas as pd
import string
import sqlite3
import requests
from datetime import datetime, time, date, timedelta

# market Data management
import yfinance as yf
import holidays
import pandas_market_calendars as mcal

# Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Get Stock Info

In [2]:
# TradingView screener regions to query: URL path segment -> display label.
SCREENER = {
    "india": "India"
}

# Seconds to wait for the screener before giving up, so the cell cannot hang.
SCREENER_TIMEOUT = 30

stocks_list = []

# One request per region and per "types" filter: an empty filter, then futures.
for types in ["", "\"futures\""]:
    for x in SCREENER:
        r = requests.post(f"https://scanner.tradingview.com/{x}/scan",
                          data=f'{{"symbols":{{"tickers":[],"query":{{"types":[{types}]}}}},"columns":["description"]}}',
                          timeout=SCREENER_TIMEOUT)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        r.raise_for_status()
        for res in r.json()["data"]:
            # res["s"] is split into exchange and symbol at the first colon;
            # res["d"][0] is the single requested column ("description").
            exchange, symbol = res["s"].split(":", 1)
            stocks_list.append({'name': res["d"][0], 'exchange': exchange, 'symbol': symbol})

# Explicit columns keep the filter below valid even when no rows came back.
stocks_df = pd.DataFrame(stocks_list, columns=['name', 'exchange', 'symbol'])
stocks_df = stocks_df.loc[stocks_df['exchange'] == 'NSE']

In [3]:
 ## content will be turned into lower case before searching
# Company name to look for in article text -> ticker symbol used for the
# price lookup.
stock_info = dict([
    ('tata motors', 'TATAMOTORS.NS'),
    ('reliance', 'RELIANCE.NS'),
    ('hdfc bank', 'HDFCBANK.NS'),
])

### Load Articles 

In [42]:
# When True, the `if not ARTICLES_UPDATED:` cell that rebuilds
# updated_data.csv from article_data.csv is skipped.
ARTICLES_UPDATED = True

In [46]:
if not ARTICLES_UPDATED:
    # article_data.csv is read in groups of 5 consecutive rows; within a group
    # the second column of rows 0..3 holds title, datetime, description and
    # content respectively.
    df = pd.read_csv('article_data.csv')

    # Collect plain records and build the frame once: concatenating inside the
    # loop re-copies every earlier row on each iteration.
    records = []
    # Stopping at len(df) - 3 guarantees row i+3 exists, so an incomplete
    # trailing group is skipped instead of raising IndexError.
    for i in range(0, len(df) - 3, 5):
        records.append({'Datetime': df.iloc[i+1, 1],
                        'Title': df.iloc[i, 1],
                        'Description': df.iloc[i+2, 1],
                        'Content': df.iloc[i+3, 1]})

    # Explicit columns so the CSV still gets its header when there are no records.
    article_data = pd.DataFrame(records, columns=['Datetime', 'Title', 'Description', 'Content'])

    article_data.to_csv('updated_data.csv', index=False)

In [54]:
# Load the reshaped articles and lower-case the article body in one step;
# the company-name lookup further down matches against lower-case names.
df_main = pd.read_csv('updated_data.csv').assign(
    Content=lambda frame: frame['Content'].astype(str).str.lower()
)

In [55]:
def find_stock_in_content(content, stocks=None):
    """Return the ticker of the first known company mentioned in ``content``.

    Parameters
    ----------
    content : object
        Article text; converted with ``str()`` and lower-cased before matching.
    stocks : dict, optional
        Mapping of company name -> ticker. Defaults to the notebook-level
        ``stock_info`` mapping.

    Returns
    -------
    str or None
        Ticker of the first mapping entry (in mapping order) whose name occurs
        in the text, or ``None`` when no name occurs.
    """
    if stocks is None:
        stocks = stock_info

    # Lower-case here as well, so the result does not depend on the caller
    # having normalised the text beforehand.
    text = str(content).lower()

    for name, ticker in stocks.items():
        # Plain substring test: 'reliance' also matches inside 'self-reliance'.
        if name.lower() in text:
            return ticker
    return None

### Get stock movement after news

In [56]:
# Exchange calendar used to recognise market holidays.
# NOTE: the assignment below rebinds the name `holidays`, hiding the
# `holidays` package imported at the top of the notebook.
bse = mcal.get_calendar('BSE')
holidays = bse.holidays().holidays


def next_trading_date(date):
    """Return the first day strictly after ``date`` that is a weekday
    (Mon-Fri) and is not contained in ``holidays``."""
    candidate = date + timedelta(1)
    while True:
        if candidate not in holidays and candidate.isoweekday() <= 5:
            return candidate
        candidate = candidate + timedelta(1)

In [57]:
def get_change(ticker, dt):
    """Return the open-to-close percentage move of ``ticker`` on the trading
    day that an article published at ``dt`` is attributed to.

    An article published after 15:00 is attributed to the next trading day.
    Any other article is attributed to its publication day when that day is a
    trading day, otherwise to the next trading day.

    Parameters
    ----------
    ticker : str
        Symbol passed straight to ``yf.download``.
    dt : datetime.datetime
        Publication timestamp. Its wall-clock time is compared as-is with the
        15:00 cut-off; no timezone conversion is applied.
        NOTE(review): the caller parses timestamps with ``%z``; if they are
        UTC, the cut-off is not in exchange-local time -- confirm.

    Returns
    -------
    float
        ``(Adj Close - Open) / Open * 100`` for that day, or ``np.nan`` when
        no price data came back.
    """
    srt_date = dt.date()

    if dt.time() > time(15, 0, 0):
        # Published after the cut-off: the reaction shows up the next session.
        srt_date = next_trading_date(srt_date)
    else:
        # next_trading_date(d - 1 day) is d itself when d is a trading day and
        # the following trading day otherwise, which covers both the
        # "before open" and the "market closed that day" cases.
        srt_date = next_trading_date(srt_date - timedelta(1))

    # Only daily bars are requested, so start/end alone define the window.
    # auto_adjust=False keeps the separate 'Adj Close' column in the result.
    data = yf.download(tickers=ticker, start=srt_date, end=srt_date + timedelta(1),
                       auto_adjust=False)

    if data.empty:
        return np.nan

    # Some yfinance versions return (field, ticker) column pairs even for a
    # single ticker; flatten them to plain field names.
    if isinstance(data.columns, pd.MultiIndex):
        data = data.droplevel(1, axis=1)

    # .iloc makes the "first row" lookup positional on the date-indexed frame.
    open_price = data['Open'].iloc[0]
    return ((data['Adj Close'].iloc[0] - open_price) / open_price) * 100

In [61]:
# Attach to every article the percentage move of the stock it mentions.
# While CHANGE_CSV is True the computation below is skipped.
CHANGE_CSV = True

if not CHANGE_CSV:
    change = []
    for published, text in zip(df_main['Datetime'], df_main['Content']):
        dt = datetime.strptime(published, '%Y-%m-%d %H:%M:%S%z')
        ticker = find_stock_in_content(text)
        # Articles that mention no known company get NaN.
        change.append(np.nan if ticker is None else get_change(ticker, dt))

    df_main['Change'] = change
    df_main.to_csv('change.csv', index=False)
  

Processing steps:
*   cleaning
*   tokenization
*   stop word removal
*   lemmatization
*   stemming

In [63]:
# Reload the frame from change.csv, the file written by the
# change-computation cell.
df_main = pd.read_csv('change.csv')

### 1. Cleaning

In [64]:
def clean_content(content):
    """Return ``content`` as a string stripped of punctuation and ASCII digits.

    Every character that is neither a word character nor whitespace is
    dropped, then the digits 0-9 are dropped, and finally each run of spaces
    is collapsed to a single space.
    """
    text = str(content)

    # Keep only word characters and whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', text)

    # Drop the ASCII digits.
    without_digits = ''.join(
        ch for ch in without_punctuation if ch not in string.digits
    )

    # Squeeze runs of spaces left behind by the removals.
    return re.sub(' +', ' ', without_digits)

In [65]:
# Run every article body through clean_content
df_main['Content'] = df_main['Content'].map(clean_content)

### 2.Tokenization

In [66]:
# Download the NLTK 'punkt' and 'wordnet' packages; the tokenization and
# lemmatization cells below use nltk.word_tokenize and WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [67]:
# Replace every article body with its list of word tokens
df_main['Content'] = df_main['Content'].map(nltk.word_tokenize)

### 3. Stop word removal

In [68]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stop_words(content):
    """Return the tokens of ``content``, in order, that are not in ``stop_words``.

    ``content`` is an iterable of string tokens; a new list is returned.
    """
    # Set membership is a hash lookup, whereas testing against the list scans
    # it for every token. The set is built per call so later edits to
    # ``stop_words`` are still honoured.
    blocked = set(stop_words)
    return [x for x in content if x not in blocked]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
# Drop the stop words from every token list
df_main['Content'] = df_main['Content'].map(remove_stop_words)

### 4. Lemmatization

In [70]:
nltk.download('omw-1.4')

def lemmatize(content):
    """Return a list with the WordNetLemmatizer lemma of each token in
    ``content``, in the original order."""
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, content))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [71]:
# Lemmatize every token list
df_main['Content'] = df_main['Content'].map(lemmatize)

In [72]:
## convert lemmatised list into comma separated string
def list_to_string(ls):
    """Join the string tokens in ``ls`` with commas.

    Any commas left at the end of the joined text (for example from trailing
    empty tokens) are stripped. An empty ``ls`` yields an empty string.
    """
    # str.join builds the result in one pass and avoids a local named
    # `string`, which would hide the imported `string` module.
    return ','.join(ls).rstrip(',')

# Store each token list as one comma-separated string
df_main['Content'] = df_main['Content'].map(list_to_string)

In [75]:
# Preview the first rows of the processed frame
df_main.head()

Unnamed: 0,Datetime,Title,Description,Content,change,Change
0,2019-01-01 20:56:00+00:00,Car sales languish as record discounts fail to...,"Tight liquidity, increased lending rates, poor...","nonetight,liquidity,increased,lending,rate,poo...",-1.201643,-1.201643
1,2019-01-01 18:07:00+00:00,"Tata Motors domestic sales fall 8% to 50,440 u...",The company said its commercial vehicle sales ...,"nonethe,company,passenger,vehicle,sale,domesti...",-1.201643,-1.201643
2,2019-01-02 09:21:00+00:00,Mutual fund performance review: The best and w...,The IT sector has been riding on good earnings...,"scheme,investing,technology,fund,saviour,inves...",,
3,2019-01-01 16:19:00+00:00,Congress president Rahul Gandhi to visit Ameth...,District Congress president Yogendra Mishra sa...,"congress,president,rahul,gandhi,twoday,tour,pa...",,
4,2019-01-02 11:03:00+00:00,Gone in 6 missed calls: Mumbai businessman che...,The money was reportedly transferred to 14 acc...,"nonethe,victim,v,shah,received,six,missed,call...",,


In [76]:
# Write the processed frame to processed.csv without the index column
df_main.to_csv('processed.csv',index=False)