#### The first cell configures the Google Colab runtime for web scraping.



In [None]:
%%shell
# Install Chromium and its matching chromedriver from the Debian "buster"
# repositories, because Ubuntu no longer distributes chromium-browser
# outside of snap.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by "signed-by" above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver.
# -y answers the confirmation prompt so the cell also works unattended
# (e.g. "Run all").
apt-get update
apt-get install -y chromium chromium-driver


#### Install required modules

In [None]:
!pip install selenium
!pip install transformers
!pip install pandas-datareader
!pip install yfinance
!pip install backtrader
!pip install pdflib

#### Import statements

In [None]:
#General
import pandas as pd
import numpy as np
import requests
import os
from glob import glob
from tqdm import tqdm

#Web Scraping and Price Data
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
import time
import yfinance as yf

#Sentiment Analysis
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pdflib import Document

#BackTrader
import backtrader as bt
import backtrader.feeds as btfeeds
import matplotlib.pyplot as plt
import datetime

#### Scrape announcement dates, links and metadata

In [None]:
def _pick_date(driver, input_id, date_str):
    """Choose ``date_str`` (``DD-MM-YYYY``) in the date picker of input ``input_id``."""
    month_map = {"01":"Jan","02":"Feb","03":"Mar","04":"Apr","05":"May",
                 "06":"Jun","07":"Jul","08":"Aug","09":"Sep","10":"Oct",
                 "11":"Nov","12":"Dec"}
    day, month, year = date_str.split("-")

    # Clicking the input opens the calendar widget.
    driver.find_element(By.XPATH,"//input[@id='{}']".format(input_id)).click()
    Select(driver.find_element(By.CLASS_NAME,'ui-datepicker-year')).select_by_visible_text(year)
    Select(driver.find_element(By.CLASS_NAME,'ui-datepicker-month')).select_by_visible_text(month_map[month])

    # The day is looked up without its leading zero ("05" -> "5"). Match the
    # link text exactly so that e.g. "1" can never resolve to "10" or "21".
    day_label = str(int(day))
    driver.find_element(
        By.XPATH,
        "//table[@class='ui-datepicker-calendar']//a[normalize-space(text())='{}']".format(day_label)
    ).click()


def get_data(company_symbol,start_date, end_date):
    """Scrape BSE corporate announcements and download prices for one company.

    Parameters
    ----------
    company_symbol : str
        Symbol typed into the BSE search box; ``".BO"`` is appended to it for
        the Yahoo Finance download.
    start_date, end_date : str
        Date range in ``DD-MM-YYYY`` form.

    Side effects
    ------------
    * Writes ``pdflinks/<symbol>_<number of announcements>.csv`` with the
      columns Date, Time, Title, Category and PDF.
    * Writes ``pricedata/<symbol>.csv`` with the downloaded prices plus
      ``positive``/``negative``/``neutral`` columns initialised to 0.

    Both output directories must already exist.
    """
    #Setup dates for yahoo finance (YYYY-MM-DD)
    start_day, start_month, start_year = start_date.split("-")
    end_day, end_month, end_year = end_date.split("-")
    start_date_for_yf = "{}-{}-{}".format(start_year, start_month, start_day)
    end_date_for_yf = "{}-{}-{}".format(end_year, end_month, end_day)

    #Setup scraping options
    chrome_options = Options()
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    chrome_options.add_argument('user-agent={0}'.format(user_agent))
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--window-size=1920,1080')

    service = Service("/usr/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Announcements whose title contains one of these phrases are skipped.
    stopwords = ["Buy Back",
                 "Newspaper",
                 "Details of Loss of",
                 "Postal Ballot",
                 "Allotment",
                 "Trading Window"]

    announcements = {"Date":[],"Time":[],"Title":[],"Category":[],"PDF":[]}
    num_of_ann = 0

    # try/finally guarantees the browser process is shut down even when a
    # page element is missing and Selenium raises.
    try:
        #Start Scraping
        driver.get("https://www.bseindia.com/corporates/ann.html")
        time.sleep(3)
        driver.find_element(By.XPATH,"//input[@id='scripsearchtxtbx']").send_keys(company_symbol)
        time.sleep(5)
        driver.find_element(By.XPATH,"//ul[@id='ulSearchQuote2']//li[1]").click()

        _pick_date(driver, "txtFromDt", start_date)
        _pick_date(driver, "txtToDt", end_date)

        driver.find_element(By.XPATH,"//input[@value='Submit']").click()
        time.sleep(5)

        #Parse Announcements
        print("Company: ",company_symbol)
        print("Fetching Data...")
        page = 1
        more_pages = True

        while more_pages:

            print("Fetching Page ",page)
            tables = driver.find_elements(By.XPATH,
                                          "//td[@id='lblann']//tbody//tr[4]//td//table")

            for row in tables:
                texts = row.find_elements(By.CLASS_NAME,"ng-binding")

                # Tables with fewer than six bound fields are not announcements.
                if len(texts) < 6:
                    continue

                title = texts[0].text
                if any(word in title for word in stopwords):
                    continue

                # find_elements returns an empty list when there is no link,
                # so no exception handling is needed for rows without a PDF.
                pdf_links = row.find_elements(By.CLASS_NAME,"tablebluelink")

                timestamp = texts[3].text
                announcements["Title"].append(title)
                announcements["Category"].append(texts[1].text)
                announcements["Date"].append(timestamp[0:10])
                announcements["Time"].append(timestamp[11:])
                # "NA" marks a missing PDF (pandas reads it back as NaN).
                announcements["PDF"].append(
                    pdf_links[0].get_attribute('href') if pdf_links else "NA")

                num_of_ann += 1

            next_buttons = driver.find_elements(By.XPATH,"//a[@ng-click='fn_NextPage()']")
            if len(next_buttons) > 0:
                next_buttons[0].click()
                page += 1
                time.sleep(5)
            else:
                more_pages = False
    finally:
        driver.quit()

    final_data = pd.DataFrame.from_dict(announcements)
    final_data.to_csv("pdflinks/{}_{}.csv".format(company_symbol,num_of_ann),
                      index=False)

    price_data = yf.download(company_symbol+".BO",
                             start=start_date_for_yf,
                             end=end_date_for_yf)
    price_data.reset_index(inplace=True)
    price_data["positive"] = 0
    price_data["negative"] = 0
    price_data["neutral"] = 0
    price_data['Date'] = price_data['Date'].dt.strftime('%d-%m-%Y')
    # The index is written on purpose: the sentiment-analysis cell drops the
    # first (unnamed) column after loading this file.
    price_data.to_csv("pricedata/{}.csv".format(company_symbol))
    print("=======================================================")

In [None]:
# Make sure the folders that get_data() writes into exist.
# exist_ok=True keeps the cell re-runnable; os.rmdir() cannot be used to reset
# them because it only removes empty directories and raises otherwise.
for output_dir in ("pricedata", "pdflinks"):
    os.makedirs(output_dir, exist_ok=True)

# symbols.csv: one company symbol per row, no header.
symbols = list(pd.read_csv("symbols.csv",header=None)[0])
start_date = "01-01-2022"   # DD-MM-YYYY
end_date = "30-12-2022"     # DD-MM-YYYY

for symbol in symbols:
  get_data(symbol,start_date,end_date)

#### Downloading PDF files from links

In [None]:
# Download every announcement PDF listed in /content/selected_stocks/*.csv into
# /content/drive/MyDrive/pdfs/<symbol>/<date>[_<n>].pdf.
pdf_root = "/content/drive/MyDrive/pdfs"

# os.mkdir (not makedirs) on purpose: if the parent folder is missing this
# fails loudly instead of silently creating a local directory tree.
if not os.path.isdir(pdf_root):
  os.mkdir(pdf_root)

for link_file in glob("/content/selected_stocks/*.csv"):
  data = pd.read_csv(link_file)
  pdf_list = np.array(data["PDF"])
  date_list = np.array(data["Date"])

  # File names look like "<symbol>_<number of announcements>.csv";
  # strip the extension and the trailing counter to recover the symbol.
  file_stem = os.path.splitext(os.path.basename(link_file))[0]
  symbol = file_stem.rsplit("_", 1)[0]

  print("Downloading files for {}".format(symbol))
  symbol_dir = os.path.join(pdf_root, symbol)
  if not os.path.isdir(symbol_dir):
    os.mkdir(symbol_dir)

  temp = 0  # running suffix for several PDFs that share one date
  headers = {'User-agent': 'Mozilla/5.0'}
  for i in tqdm(range(len(pdf_list))):

    url = pdf_list[i]
    # Announcements without a PDF are stored as "NA", which pandas reads back
    # as NaN; skip anything that is not an http(s) link.
    if not isinstance(url, str) or not url.lower().startswith("http"):
      continue

    try:
      r = requests.get(url, allow_redirects=True, headers=headers, timeout=60)
      r.raise_for_status()
    except requests.RequestException as err:
      # Best effort: report the failed link and carry on with the next one
      # instead of saving an error page as a ".pdf".
      tqdm.write("Skipping {}: {}".format(url, err))
      continue

    first_path = os.path.join(symbol_dir, "{}.pdf".format(date_list[i]))
    if os.path.exists(first_path):
      temp += 1
      target_path = os.path.join(symbol_dir, "{}_{}.pdf".format(date_list[i],temp))
    else:
      temp = 0
      target_path = first_path

    with open(target_path,'wb') as pdf:
      pdf.write(r.content)

#### Initialize FinBERT Model

In [None]:
# Load the FinBERT tokenizer and sequence classifier.
# Use the GPU when the runtime has one and fall back to the CPU otherwise,
# so the cell does not fail on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)
model.eval()  # the model is only used for inference in this notebook

#### Sentiment Analysis Code

In [None]:
# Score every downloaded announcement PDF with FinBERT and store, per trading
# day, the average sentiment of that day's documents in the price table.
sentiment_cols = ["positive", "negative", "neutral"]
# A sentence containing one of these phrases ends the scoring of its page.
skip_markers = ["Safe Harbor", "Closure", "Forward-Looking"]
# Send the inputs to whatever device the model was loaded on.
model_device = model.device

for price_file in glob("/content/prices/*.csv"):
  price_data = pd.read_csv(price_file)
  symbol = os.path.splitext(os.path.basename(price_file))[0]
  print("Running Analysis for {}".format(symbol))

  # The sentiment columns are written as integer zeros; convert them to float
  # so that fractional scores can be stored.
  price_data[sentiment_cols] = price_data[sentiment_cols].astype(float)

  pdf_paths = sorted(glob("/content/drive/MyDrive/pdfs/{}/*.pdf".format(symbol)))
  total = len(pdf_paths)

  # date -> list of (positive, negative, neutral) mean scores, one per PDF.
  scores_by_date = {}

  for count, pdf in enumerate(pdf_paths, start=1):

    # File names are "<date>.pdf" or "<date>_<n>.pdf"; take the date part from
    # the file name only, so suffixes of any length are handled.
    file_stem = os.path.splitext(os.path.basename(pdf))[0]
    date = file_stem.split("_")[0]

    if not (price_data.Date == date).any(): #Non-trading Day
      continue

    try:
      pages = list(Document(pdf))
    except Exception as err:
      # Unreadable PDF: report it and move on to the next document.
      print("Skipping {}: {}".format(pdf, err))
      continue

    positive = []
    negative = []
    neutral = []

    for page in tqdm(pages,desc="{}/{}| {}".format(count,total,date),position=0, leave=True):

      text=' \n'.join(page.lines).strip()

      for sentence in text.split(". "):

        if any(marker in sentence for marker in skip_markers):
          break

        sentence = sentence.strip().replace('\n',' ')
        if not sentence:
          continue  # nothing to score
        sentence = sentence+"."

        inputs = tokenizer(sentence, padding = True, truncation = True, return_tensors='pt').to(model_device)
        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
          outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()

        # Class order 0/1/2 is read as positive/negative/neutral
        # NOTE(review): confirm against model.config.id2label.
        positive.append(round(probabilities[0],2))
        negative.append(round(probabilities[1],2))
        neutral.append(round(probabilities[2],2))

    if len(positive) > 0:
      scores_by_date.setdefault(date, []).append(
          (np.mean(positive), np.mean(negative), np.mean(neutral)))

  # Every document published on a day contributes equally to that day's score.
  # (Truncating the stored value with int() would turn any score below 1 into
  # 0 and make a later PDF overwrite the earlier ones.)
  for date, document_scores in scores_by_date.items():
    mean_positive, mean_negative, mean_neutral = np.mean(document_scores, axis=0)
    day_mask = price_data.Date == date
    price_data.loc[day_mask,"positive"] = mean_positive
    price_data.loc[day_mask,"negative"] = mean_negative
    price_data.loc[day_mask,"neutral"] = mean_neutral

  price_data.Date=pd.to_datetime(price_data['Date'],dayfirst=True)
  price_data.set_index("Date",inplace=True)
  # Drops the first remaining column - presumably the unnamed index column
  # saved together with the price data; verify if the input format changes.
  price_data.drop(price_data.columns[0],axis = 1,inplace=True)
  price_data.sort_index(inplace=True)
  price_data.to_csv("{}_finalData.csv".format(symbol))
  print("=========================================================================================================================")


#### Strategy Development and Backtesting

In [None]:
# Extra data lines carried by the feed in addition to the PandasData ones.
lines = ('positive', 'negative', 'neutral')

# One (name, -1) parameter per extra line.
params = tuple((line_name, -1) for line_name in lines)

datafields = btfeeds.PandasData.datafields + list(lines)

# Class namespace handed to type() below.
mydict = {
    'lines': tuple(lines),
    'params': params,
    'datafields': bt.feeds.PandasData.datafields + list(lines),
}

# PandasData subclass, built dynamically, that also exposes the three
# sentiment columns.
SentimentData = type('SentimentData', (btfeeds.PandasData,), mydict)

#=============================================================================================================

class SentimentStrategy(bt.Strategy):
    """Buy on positive sentiment, sell after a fixed number of bars.

    When flat, the strategy buys with all available cash as soon as the
    ``positive`` line of the data feed is at or above ``positive_threshold``.
    When invested, it sells the position once ``holding_period`` bars have
    passed since the last completed order.

    Params:
        holding_period: bars to wait after an execution before selling.
        positive_threshold: minimum ``positive`` score that triggers a buy.
        exectype: not read by this class; kept so existing callers that pass
            it keep working.
    """

    params = (
        ('holding_period', 0),
        ('positive_threshold', 0),
        ('exectype','Market')
    )

    def log(self, txt, dt=None):
        """Print ``txt`` prefixed with ``dt`` (default: date of the current bar)."""
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        self.dataclose = self.datas[0].close
        self.order = None       # pending order, if any
        self.size = 0           # size of the last buy order
        self.bar_executed = 0   # len(self) at the last completed order

    def notify_order(self, order):
        """Log the outcome of an order and clear the pending-order marker."""
        if order.status in [order.Submitted, order.Accepted]:
            # Nothing to do until the order is executed or dropped.
            return

        if order.status in [order.Completed]:

            if order.isbuy():
                self.log('BUY EXECUTED, %.2f , Cash Remaining, %0.2f'
                         % (order.executed.price,self.broker.get_cash()))

            elif order.issell():
                self.log('SELL EXECUTED, %.2f , Cash Remaining, %0.2f'
                         % (order.executed.price,self.broker.get_cash()))

            self.bar_executed = len(self)

        elif order.status in [order.Canceled, order.Margin, order.Rejected]:
            self.log('Order Cancelled/Margin/Rejected')

        self.order = None

    def next(self):
        """Decide, once per bar, whether to open or close the position."""

        self.log('Close, %.2f' % self.dataclose[0])

        # An order is still pending: do not send a second one.
        if self.order:
            return

        if not self.position:
            if self.data.positive[0] >= self.params.positive_threshold:
                # Whole shares affordable at the current close; the close
                # price is named explicitly rather than dividing by the feed.
                self.size = int(self.broker.get_cash() / self.dataclose[0])
                self.log('BUY CREATE, %.2f, Size, %.2f' % (self.dataclose[0], self.size))
                self.order = self.buy(size=self.size)
        else:
            if len(self) >= (self.bar_executed + self.params.holding_period):
                self.log('SELL CREATE, %.2f' % self.dataclose[0])
                self.order = self.sell(size=self.size)
#=============================================================================================================

class BuyAndHold(bt.Strategy):
    """Benchmark strategy: put all cash into the asset once and keep it."""

    def __init__(self):
        self.order = None
        self.size = 0
        self.dataclose = self.datas[0].close

    def start(self):
        # Cash at the beginning of the run; used as the ROI baseline in stop().
        self.val_start = self.broker.get_cash()

    def nextstart(self):
        # Target a position worth all of the currently available cash.
        available_cash = self.broker.get_cash()
        self.order_target_value(target=available_cash)

    def stop(self):
        # Return on investment relative to the starting cash.
        gain = self.broker.get_value() - self.val_start
        self.roi = gain / (self.val_start)
        print('ROI: {:.2f}%'.format(100.0 * self.roi))
#=============================================================================================================
def _run_backtest(strategy_cls, price_data, figure_path, verbose=False, **strategy_kwargs):
    """Backtest one strategy on ``price_data``, save its plot and return the ROI."""
    cerebro = bt.Cerebro(stdstats=False)
    cerebro.addobserver(bt.observers.BuySell, barplot = True,bardist = 0.018)
    cerebro.addobserver(bt.observers.Value)
    cerebro.broker.setcash(100000.0)
    cerebro.addstrategy(strategy_cls, **strategy_kwargs)
    cerebro.adddata(SentimentData(dataname=price_data))

    starting_value = cerebro.broker.getvalue()
    if verbose:
        print('Starting Portfolio Value: %.2f' % starting_value)
    cerebro.run()
    ending_value = cerebro.broker.getvalue()
    if verbose:
        print('Final Portfolio Value: %.2f' % ending_value)

    plt.rcParams["figure.figsize"] = (13,7)
    # cerebro.plot() returns nested lists of figures; [0][0] is the first one.
    figure = cerebro.plot(ytight=False,
                 linevalues = False,
                 loc='black',
                 subtxtsize = 6,
                 grid=True,
                 style = 'line',
                 iplot=True,
                 volume=False,
                 start=datetime.date(2022, 1, 1),
                 end=datetime.date(2022, 12, 31),
                 plotmargin = 0.05)[0][0]
    figure.savefig(figure_path)

    return (ending_value-starting_value)/starting_value


def runstrat(symbol,holding_periods, thresholds,price_data):
    """Backtest buy-and-hold and every (holding period, threshold) combination.

    Parameters
    ----------
    symbol : str
        Used for the output folder and the figure file names.
    holding_periods, thresholds : iterable
        Values passed to ``SentimentStrategy`` as ``holding_period`` and
        ``positive_threshold``; every combination is run.
    price_data : pandas.DataFrame
        Data handed to ``SentimentData(dataname=...)``.

    Returns
    -------
    (results, bnh_roi)
        ``results[period][thresh]`` is the ROI of that run and ``bnh_roi`` is
        the ROI of the buy-and-hold benchmark.

    All figures are saved as JPG files in ``results/<symbol>/``.
    """
    # One folder for every artefact of this symbol. exist_ok=True keeps the
    # function re-runnable, and the strategy figures go to the same folder as
    # the benchmark figure so the target directory is guaranteed to exist.
    out_dir = "results/{}".format(symbol)
    os.makedirs(out_dir, exist_ok=True)

    bnh_roi = _run_backtest(BuyAndHold, price_data,
                            "{}/BNH.jpg".format(out_dir))

    results = {}
    for period in holding_periods:
        results[period] = {}
        for thresh in thresholds:
            results[period][thresh] = _run_backtest(
                SentimentStrategy, price_data,
                "{}/{}_{}_{}.jpg".format(out_dir,symbol,period,thresh),
                verbose=True,
                holding_period = period, positive_threshold=thresh)

            print("========================================================")

    return results,bnh_roi
#=============================================================================================================
def make_table_2(results_dict, bnh_roi):
    """Arrange backtest ROIs in a threshold-by-holding-period table.

    ``results_dict`` maps each holding period to a ``{threshold: roi}`` dict.
    The returned DataFrame has one row per threshold (taken from the first
    entry of ``results_dict``), one column per holding period and a final
    ``"Buy_and_Hold"`` column that holds ``bnh_roi`` in its first row only.
    """
    first_entry = next(iter(results_dict.values()))
    threshold_index = list(first_entry)
    column_names = [*results_dict, "Buy_and_Hold"]

    table = pd.DataFrame(index=threshold_index, columns=column_names)
    table.loc[threshold_index[0], "Buy_and_Hold"] = bnh_roi

    for period in results_dict:
        # Series assignment aligns the ROIs with the threshold index.
        table[period] = pd.Series(results_dict[period])

    return table
#====================================================================================================
if __name__ == '__main__':
    # Backtest every symbol listed in symbols.csv (one per row, no header).
    symbols = list(pd.read_csv("symbols.csv",header=None)[0])

    # Parameter grid evaluated for each symbol.
    holding_periods = [5,10,20,30,90,180]
    thresholds = [0.1,0.2,0.3,0.4,0.5]

    for symbol in symbols:
        # Load the price + sentiment table and index it by date, oldest first.
        price_data = pd.read_csv("final_data/{}_finalData.csv".format(symbol))
        price_data["Date"] = pd.to_datetime(price_data["Date"],dayfirst=True)
        price_data = price_data.set_index("Date").sort_index()

        results,bnh_roi = runstrat(symbol,holding_periods, thresholds, price_data)

        # Save and show the ROI table for this symbol.
        final_results = make_table_2(results,bnh_roi)
        final_results.to_csv("results/{}/{}_final_results.csv".format(symbol,symbol))
        print(final_results)