In [1]:
!pip install pandas-market-calendars
!pip install yfinance
!pip install pyspark
!pip install spark-nlp==4.1.0



In [1]:
# utilities
import os
import re
import csv
import numpy as np
import pandas as pd
import string
import sqlite3
import requests
from datetime import datetime, time, date, timedelta

# market Data management
import yfinance as yf
import holidays
import pandas_market_calendars as mcal

# Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Get Stock Info

In [2]:
SCREENER = { 
    "india" : "India"
}

stocks_list = []

for types in ["", "\"futures\""]:
    for x, _ in SCREENER.items():
#         print(f"Loading screener: {x}")
        r = requests.post(f"https://scanner.tradingview.com/{x}/scan",
                          data=f'{{"symbols":{{"tickers":[],"query":{{"types":[{types}]}}}},"columns":["description"]}}')
        for res in r.json()["data"]:
            name = res['d'][0]
            exchange, symbol = res["s"].split(":")
            desc = res["d"][0]
            stocks_list.append({'name':name,'exchange': exchange, 'symbol':symbol})
   
stocks_df = pd.DataFrame(stocks_list)
stocks_df = stocks_df.loc[stocks_df['exchange'] == 'NSE']

In [3]:
 ## content will be turned into lower case before searching
stock_info = {
    'tata motors' : 'TATAMOTORS.NS', 
    'reliance' : 'RELIANCE.NS',
    'hdfc bank': 'HDFCBANK.NS',
    'bajaj auto': 'BAJAJ-AUTO.NS',
    'maruti suzuki': 'MARUTI.NS',
    'hero motocorp': 'HEROMOTOCO.NS',
    'britannia': 'BRITANNIA.NS',
    'nestle': 'NESTLEIND.NS',
    'ntpc': 'NTPC.NS',
    'titan': 'TITAN.NS',
    'cipla': 'CIPLA.NS',
    'ongc': 'ONGC.NS',
    'tcs': 'TCS.NS',
    'l&t': 'LT.NS',
    'coal india': 'COALINDIA.NS',
    'kotak mahindra bank': 'KOTAKBANK.NS',
    ' itc ': 'ITC.NS',
    'bajaj finance': 'BAJFINANCE.NS',
    'hdfc life': 'HDFCLIFE.NS',
    'wipro': 'WIPRO.NS',
    'tata steel': 'TATASTEEL.NS',
    'induslnd bank': 'INDUSINDBK.NS',
    'bajaj finserv': 'BAJAJFINSV.NS',
    'ultratech cement': 'ULTRACEMCO.NS',
    'airtel': 'BHARTIARTL.NS',
    'hindalco': 'HINDALCO.NS',
    'shree cement': 'SHREECEM.NS',
    'tech mahindra': 'TECHM.NS'}

### Load Articles 

In [84]:
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'data'))

In [85]:
ARTICLES_UPDATED = False

In [86]:
if not ARTICLES_UPDATED:
    article_data = pd.read_csv(os.path.join(DATA_DIR,'article_data_raw.csv'),header=None,)
    article_data.rename(columns = {0:'Title',1:'Datetime',2:'Description',3:'Content'}, inplace = True)
        
    # change content to lower case    
    article_data['Content'] = article_data['Content'].astype(str).str.lower()
    article_data.to_csv(os.path.join(DATA_DIR,'article_data.csv'),index=False)


In [87]:
df_main = pd.read_csv(os.path.join(DATA_DIR,'article_data.csv'))
df_main

Unnamed: 0,Title,Datetime,Description,Content
0,Michael Jordan mourns death of 'little brother...,2020-01-27 09:49:00+00:00,"In a statement, Jordan said that Bryant would ...","michael jordan on january 26, mourned the deat..."
1,A tweet from 2012 predicting Kobe Bryant's hel...,2020-01-27 09:19:00+00:00,"Bryant, who retired in 2016, died in a helicop...",nba legend kobe bryant and his 13-year-old dau...
2,"Samsung Galaxy Note10 Lite launched for Rs 38,...",2020-01-21 16:48:00+00:00,"In spite of being a ‘Lite’ variant, the Galaxy...",samsung india on january 21 launched galaxy no...
3,Samsung Galaxy Note 10 Lite vs Galaxy S10 Lite...,2020-01-24 19:24:00+00:00,In case you are finding it tough to decide whi...,samsung recently launched two new ‘lite’ smart...
4,Politics | The many layers to anti-CAA protest...,2020-01-23 15:38:00+00:00,The anti-CAA protests in Assam are steadily pr...,nazimuddin siddiquenonethe protests in assam w...
...,...,...,...,...
655,MSI Prestige 14 Review: A portable powerhouse ...,2020-02-24 08:54:00+00:00,Stick around to find out if this ultra portabl...,are you a creative professional who cannot lea...
656,Citroen to make India debut with C5 Aircross,2020-02-26 07:48:00+00:00,The C5 Aircross is a mid-size SUV that will be...,citroen is gearing up to make its india debut ...
657,SEBI Chairman Ajay Tyagi to complete term on M...,2020-02-25 20:23:00+00:00,"Under his stewardship, the regulator cracked t...",after three years at helm of the securities an...
658,"Poker is a game of skill, has tremendous scope...",2020-02-25 20:36:00+00:00,,the online gaming industry in india is touchin...


In [88]:
def find_stock_in_content(content):
    content = str(content)
    for stock in stock_info.keys():
        if stock in content:
            return stock_info[stock]

### Get stock movement after news

In [89]:
bse = mcal.get_calendar('BSE')
holidays = bse.holidays().holidays

def next_trading_date(date):
    date += timedelta(1)
    while (date in holidays) or (date.isoweekday() > 5):
        date += timedelta(1)
    return date

In [90]:
def get_change(ticker,dt):
    
    srt_date = dt.date()
    start_time = dt.time()
    
    # if after market
    if(start_time > time(15,0,0)):
        srt_date = next_trading_date(srt_date)
        start_time = time(9,30,0)
    
    # before market open
    elif(start_time < time(9,30,0)):
        srt_date = next_trading_date(srt_date - timedelta(1))
        start_time = time(9,30,0)
    
    # during market hours and market closed 
    elif(srt_date != next_trading_date(srt_date - timedelta(1))):
        start_date = next_trading_date(srt_date)
        start_time = time(9,30,0)
        
#     print(srt_date, start_time)
    data = yf.download(tickers=ticker, start=srt_date,end=srt_date+timedelta(1),start_time=start_time,period='1d')

    if data.empty:
        return np.nan
    change = ((data['Adj Close'][0] - data['Open'][0]) / data['Open'][0]) * 100
    return round(change,2)

In [91]:
# Add change percentage of stock after news
CHANGE_CSV = False

if not CHANGE_CSV:
    change = []
    for i in range (len(df_main)):
        row = df_main.iloc[i,:]
        dt = datetime.strptime(row['Datetime'], '%Y-%m-%d %H:%M:%S%z')
        content = row['Content']

        ticker = find_stock_in_content(content)
        if ticker != None:
            change.append(get_change(ticker,dt))
        else :
            change.append(np.nan)


    df_main['Change'] = change
    df_main = df_main[["Datetime",'Title','Description','Content','Change']]
    df_main.to_csv(os.path.join(DATA_DIR,'change_percent_data.csv'),index=False)
  

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- RELIANCE.NS: No data found for this date range, symbol may be delisted
[*********

Processing steps -
*   cleaning
*   tokenization
*   stop word removal 
*   lemmetization
*   stemming

In [92]:
df_main = pd.read_csv(os.path.join(DATA_DIR,'change_percent_data.csv'))

In [93]:
df_main

Unnamed: 0,Datetime,Title,Description,Content,Change
0,2020-01-27 09:49:00+00:00,Michael Jordan mourns death of 'little brother...,"In a statement, Jordan said that Bryant would ...","michael jordan on january 26, mourned the deat...",
1,2020-01-27 09:19:00+00:00,A tweet from 2012 predicting Kobe Bryant's hel...,"Bryant, who retired in 2016, died in a helicop...",nba legend kobe bryant and his 13-year-old dau...,
2,2020-01-21 16:48:00+00:00,"Samsung Galaxy Note10 Lite launched for Rs 38,...","In spite of being a ‘Lite’ variant, the Galaxy...",samsung india on january 21 launched galaxy no...,
3,2020-01-24 19:24:00+00:00,Samsung Galaxy Note 10 Lite vs Galaxy S10 Lite...,In case you are finding it tough to decide whi...,samsung recently launched two new ‘lite’ smart...,
4,2020-01-23 15:38:00+00:00,Politics | The many layers to anti-CAA protest...,The anti-CAA protests in Assam are steadily pr...,nazimuddin siddiquenonethe protests in assam w...,
...,...,...,...,...,...
655,2020-02-24 08:54:00+00:00,MSI Prestige 14 Review: A portable powerhouse ...,Stick around to find out if this ultra portabl...,are you a creative professional who cannot lea...,
656,2020-02-26 07:48:00+00:00,Citroen to make India debut with C5 Aircross,The C5 Aircross is a mid-size SUV that will be...,citroen is gearing up to make its india debut ...,
657,2020-02-25 20:23:00+00:00,SEBI Chairman Ajay Tyagi to complete term on M...,"Under his stewardship, the regulator cracked t...",after three years at helm of the securities an...,
658,2020-02-25 20:36:00+00:00,"Poker is a game of skill, has tremendous scope...",,the online gaming industry in india is touchin...,


### 1. Cleaning

In [94]:
def clean_content(content):
    content = str(content)
    
    # remove all the punctuations
    content = re.sub(r'[^\w\s]', '', content)
    
    # remove all numbers
    content = content.translate(str.maketrans('','',string.digits))
    
    # remove double spacing
    content = re.sub(' +', ' ', content)
    
    return content

In [95]:
# Clean contents
df_main.dropna(inplace=True)
df_main['Content'] = df_main['Content'].map((lambda c: clean_content(c)),na_action=None)

### 2.Tokenization

In [96]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [97]:
df_main['Content'] = df_main['Content'].map((lambda c: nltk.word_tokenize(c)),na_action=None)

### 3. Stop word removal

In [98]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stop_words(content):
      return [x for x in content if x not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [99]:
df_main['Content'] = df_main['Content'].map((lambda c: remove_stop_words(c)),na_action=None)

### 4. Lemmatization

In [100]:
nltk.download('omw-1.4')
def lemmatize(content):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in content]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chhal\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [101]:
df_main['Content'] = df_main['Content'].map((lambda c: lemmatize(c)),na_action=None)

In [102]:
## convert lemmaatised list into comma seperated string
def list_to_string(ls):
    string = ''
    for word in ls:
        string += word + ','
    return string.rstrip(',')

df_main['Content'] = df_main['Content'].map((lambda c: list_to_string(c)),na_action=None)

In [103]:
df_main = df_main.reset_index().drop(columns=['index'])

In [104]:
df_main.isnull().any()


Datetime       False
Title          False
Description    False
Content        False
Change         False
dtype: bool

In [106]:
df_main.to_csv(os.path.join(DATA_DIR,'preprocessed_data.csv'),index=False,mode='a',header=False)

In [107]:
df_main

Unnamed: 0,Datetime,Title,Description,Content,Change
0,2020-01-22 20:15:00+00:00,"Tata Altroz launched at Rs 5.29 lakh, cheaper ...","Besides Baleno, Altroz will compete against th...","nonethe,diesel,version,altroz,priced,r,lakh,to...",0.88
1,2020-01-22 09:00:00+00:00,"Preview: Tata Altroz to launch today, along wi...",Tata has given the Altroz its Impact Design 2....,"indian,carmaker,tata,motor,gearing,update,line...",-3.31
2,2020-01-24 16:34:00+00:00,Expect 'huge appetite' for Triumph bikes: Baja...,The bikes produced under the brand will be ava...,"nonethe,bike,produced,brand,available,price,st...",-10.47
3,2020-01-23 18:58:00+00:00,"Western Union, Airtel to launch real-time glob...",Airtel Payments Bank Customers can soon direct...,"western,union,bharti,airtel,limited,come,toget...",-0.08
4,2020-01-24 13:02:00+00:00,Maruti Suzuki begins export of S-Presso,"""S-Presso is a true symbol of Make in India. T...","country,largest,carmaker,maruti,suzuki,india,m...",-2.44
...,...,...,...,...,...
85,2020-02-29 15:28:00+00:00,Uday Dave | Journey from medical transcription...,"Intraday trading, that too discretionary tradi...","time,people,going,midlife,crisis,baroda,based,...",-20.39
86,2020-02-27 21:10:00+00:00,Suryoday Small Finance Bank finalises merchant...,Suryoday Small Finance Bank has kicked off the...,"mumbaiheadquartered,suryoday,small,finance,ban...",-2.35
87,2020-02-27 15:06:00+00:00,"Fiat Chrysler lines up electric cars, SUVs for...",FCA is working on battery powered vehicles at ...,"fca,india,smallest,car,maker,country,planning,...",-7.13
88,2020-02-27 18:34:00+00:00,Gross NPAs in LIC's debt portfolio touch recor...,"While net NPA stayed constant at 0.36 percent,...","nonethis,highest,gross,npa,debt,portfolio,leve...",-2.35
