# Web scraping stock market news for Sentiment Analysis

## 1. Introduction

Stock market news articles from 2014 to 2022 are collected from [Investing.com](https://uk.investing.com/equities/astrazeneca-news) by dynamic web scraping: the Selenium library automates browser interaction so that the fully loaded page HTML can then be parsed with Beautiful Soup.




## 2. Install/import libraries

In [None]:
!pip install htmldate
!pip install twython
!pip3 install newspaper3k

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting htmldate
  Downloading htmldate-1.3.2-py3-none-any.whl (39 kB)
Collecting dateparser>=1.1.1
  Downloading dateparser-1.1.3-py2.py3-none-any.whl (292 kB)
[K     |████████████████████████████████| 292 kB 8.0 MB/s 
[?25hCollecting urllib3<2,>=1.26
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 41.0 MB/s 
Collecting regex!=2019.02.19,!=2021.8.27,<2022.3.15
  Downloading regex-2022.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 35.0 MB/s 
Installing collected packages: regex, urllib3, dateparser, htmldate
  Attempting uninstall: regex
    Found existing installation: regex 2022.6.2
    Uninstalling regex-2022.6.2:
      Successfully uninstalled regex-2022.6.2
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Unin

In [None]:
import pandas as pd
import numpy as np
import time
import twython
import requests
import nltk
import warnings
warnings.filterwarnings('ignore')

from htmldate import find_date
from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
from newspaper import Article

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


## 3. Data collection



In [None]:
# Set up Selenium

!pip install selenium
!apt-get update 
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.by import By
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selenium
  Downloading selenium-4.6.0-py3-none-any.whl (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 4.7 MB/s 
Collecting urllib3[socks]~=1.26
  Using cached urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
[K     |████████████████████████████████| 384 kB 48.6 MB/s 
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting sniffio
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.0.1-py3-none-any.whl (12 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
 

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Conn                                                                               Hit:4 https://developer.download.nvidi

In [None]:
def get_newslinks(company, page_number):
    """For a given company and page number, load the news page, scroll to the
    bottom so the article section is loaded into the driver, then iterate
    through the articles on the page and collect their URLs.

    Relies on the module-level Selenium ``driver``.

    :param company: name of company to scrape articles for
    :param page_number: page number on news website to iterate over

    :return: sorted array of unique article URLs (as returned by ``np.unique``)
    """
    base_url = "https://uk.investing.com"
    driver.get(f"{base_url}/equities/{company}-news/{page_number}")

    # JavaScript used to read the current vertical scroll position
    get_position_js = (
        "return (window.pageYOffset !== undefined) ?"
        " window.pageYOffset : (document.documentElement ||"
        " document.body.parentNode || document.body);")
    # JavaScript used to jump to the bottom of the page
    scroll_to_bottom_js = (
        "var scrollingElement = (document.scrollingElement ||"
        " document.body);scrollingElement.scrollTop ="
        " scrollingElement.scrollHeight;")

    # Scroll all the way to the bottom: keep scrolling until the position
    # stops changing between two consecutive reads
    old_position = 0
    new_position = None
    while new_position != old_position:
        old_position = driver.execute_script(get_position_js)
        # Sleep before scrolling
        time.sleep(1)
        driver.execute_script(scroll_to_bottom_js)
        new_position = driver.execute_script(get_position_js)

    cleaned_links = []

    # Fetch whatever articles are present (at most the first ten) instead of
    # indexing article[1]..article[10], which raises if a page has fewer
    articles = driver.find_elements(
        By.XPATH, '/html/body/div[5]/section/div[8]/article')
    for article in articles[:10]:
        article_html = article.get_attribute('innerHTML')
        soup = BeautifulSoup(article_html, "lxml")
        for link in soup.find_all('a'):
            partial_link = link.get('href')
            # Anchors without an href (or with an empty one) carry no URL
            if not partial_link:
                continue
            if 'https' in partial_link:
                cleaned_links.append(partial_link)
            # Some links are 'internal' to the site and start with '/'.
            # They already carry the leading slash, so prepend the host
            # without a trailing one to avoid a double slash in the URL.
            elif partial_link.startswith('/'):
                cleaned_links.append(base_url + partial_link)

    return np.unique(cleaned_links)

In [None]:
# Gather the article links from news pages 1 to 29 for AstraZeneca
all_company_urls = []
for page_number in range(1, 30):
    all_company_urls += get_newslinks('astrazeneca', page_number)
all_company_urls

['https://invst.ly/zbgw-',
 'https://invst.ly/zbmyz',
 'https://invst.ly/zcazg',
 'https://invst.ly/zdy3j',
 'https://invst.ly/zdyt-',
 'https://invst.ly/zdzdb',
 'https://invst.ly/zfln4',
 'https://uk.investing.com//news/stock-market-news/astrazeneca-shares-will-get-lift-from-heartburn-drug-litigation-closure-says-citi-2805533',
 'https://uk.investing.com//news/stock-market-news/astrazeneca-touts-positive-data-from-two-experimental-drugs-in-breast-cancer-settings-2797029',
 'https://uk.investing.com//news/stock-market-news/astrazenecas-beyfortus-receives-eu-approval-2811313',
 'https://uk.investing.com//news/coronavirus/astrazeneca-says-evusheldapproved-for-treatment-of-covid19-in-eu-2755721',
 'https://uk.investing.com//news/stock-market-news/astrazeneca-gets-its-third-regulatory-green-light-in-a-matter-of-weeks-for-asthma-drug-2762527',
 'https://uk.investing.com//news/stock-market-news/astrazeneca-pays-660-premium-for-gene-therapy-firm-logicbio-2769321',
 'https://uk.investing.com/

In [None]:
# AstraZeneca stock ticker
ticker = 'AZN.L'

# Column order of the resulting DataFrame
sentiment_columns = ['ticker', 'publish_date', 'title', 'body_text', 'url',
                     'neg', 'neu', 'pos', 'compound']

# Initialise the sentiment analyser once; it does not depend on the article
sid = SentimentIntensityAnalyzer()

# Collect one dict per article and build the DataFrame once at the end.
# DataFrame.append is removed in pandas 2.0 and copies the whole frame on
# every call.
article_rows = []

# Loop over all the articles
for link in all_company_urls:
    article = Article(link)

    try:
        article.download()
        article.parse()
    except Exception:
        # Best effort: skip articles that cannot be downloaded or parsed
        print("I didn't get this")
        continue

    text = article.text

    # Get positive, negative, neutral and compound scores
    polarity = sid.polarity_scores(text)

    # The publish date is not extracted here, so it is left missing
    tmpdic = {'ticker': ticker, 'publish_date': np.nan, 'title': article.title,
              'body_text': text, 'url': link}
    # Add the polarity scores to the entry
    tmpdic.update(polarity)
    # tmpdic now has all keys and values needed for one DataFrame row
    article_rows.append(tmpdic)

article_sentiments = pd.DataFrame(article_rows, columns=sentiment_columns)

In [None]:
# Show DataFrame of article sentiments data

article_sentiments

In [None]:
# Save the article sentiments DataFrame as a pickle file

article_sentiments.to_pickle("azn_article_sentiments.pkl")

In [None]:
# Also export the article sentiments DataFrame as a comma-separated, UTF-8 encoded CSV with a header row
article_sentiments.to_csv("azn_article_sentiments.csv", sep=',', encoding='utf-8', header=True)

In [None]:
# Write the body text of every article to a single text file
# NOTE(review): the leading '/' is an absolute path (filesystem root), unlike
# the relative paths used for the pickle/CSV exports - confirm this is intended
with open('/azn_body_text_2022.txt', 'w', encoding='utf-8') as f:
    for link in all_company_urls:
        article = Article(link)
        try:
            article.download()
            article.parse()
        except Exception:
            # Best effort: skip articles that cannot be downloaded or parsed
            print("I didn't get this")
            continue
        text = article.text
        f.write(text)
        # Separate articles so the last word of one does not fuse with the
        # first word of the next
        f.write('\n')