# PROG 8245 - Machine Learning Programming
# Final Assignment Part 1: Scraping the Training Data

## Part 0: Environment Set Up

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

import requests
from bs4 import BeautifulSoup

import time
import numpy as np
import pandas as pd

# from ProjectFunctions import *

### Functions

In [2]:
def get_article_links(search_term, limit = 100, max_idle_scrolls = 10, scroll_pause = 0.5):
    """Collect news-article links from the investing.com search page.

    Opens the search results for ``search_term`` in Firefox and scrolls down
    in 500-pixel steps until at least ``limit`` elements with the class
    ``articleItem`` are present, then reads the ``href`` of each item's
    ``title`` element.

    Args:
        search_term: Text inserted into the ``q`` query parameter of the
            search URL.
        limit: Minimum number of ``articleItem`` elements to wait for.
        max_idle_scrolls: Number of consecutive scrolls at or past the page
            bottom without any new item appearing after which scrolling
            stops. This keeps the loop from running forever when the search
            has fewer than ``limit`` results.
        scroll_pause: Seconds to sleep after each such idle scroll, giving
            further results a chance to load.

    Returns:
        List of ``href`` values. It can hold more than ``limit`` entries
        (items load in batches) or fewer (results ran out).
    """
    driver = webdriver.Firefox()
    try:
        url_path = f'https://www.investing.com/search/?q={search_term}&tab=news'
        driver.get(url_path)

        pos = 0
        idle_scrolls = 0
        article_items = driver.find_elements(by=By.CLASS_NAME, value="articleItem")
        while len(article_items) < limit and idle_scrolls < max_idle_scrolls:
            pos += 500
            driver.execute_script(f'window.scrollTo(0, {pos})')
            page_height = driver.execute_script('return document.body.scrollHeight') or 0

            previous_count = len(article_items)
            article_items = driver.find_elements(by=By.CLASS_NAME, value="articleItem")
            if len(article_items) > previous_count:
                idle_scrolls = 0
            elif pos >= page_height:
                # Already at the bottom and nothing new appeared: count it as
                # an idle round and wait briefly before trying again.
                idle_scrolls += 1
                time.sleep(scroll_pause)

        links = []
        for article in article_items:
            # Items whose class attribute is exactly this string are skipped.
            # NOTE(review): presumably these are entries without a usable
            # title link - confirm against the live page markup.
            if article.get_attribute("class") != "js-article-item articleItem     ":
                link = article.find_element(by=By.CLASS_NAME, value="title")
                links.append(link.get_attribute("href"))
        return links
    finally:
        # Always close the browser, even when a lookup above raises.
        driver.quit()

def get_article_links_mult_stocks(list_of_stocks):
    """Collect news-article links for several search terms.

    Calls ``get_article_links`` once per entry of ``list_of_stocks`` and
    concatenates the results.

    Args:
        list_of_stocks: Iterable of search terms, one per stock.

    Returns:
        ``[list_of_links, stock_names]``: two parallel lists of equal length,
        where ``stock_names[i]`` is the search term that produced
        ``list_of_links[i]``.
    """
    list_of_links = []
    stock_names = []
    for stock in list_of_stocks:
        links = get_article_links(stock)
        # Exactly one label per link (and none for the stock itself), so the
        # two lists stay aligned index-for-index.
        list_of_links.extend(links)
        stock_names.extend([stock] * len(links))
    return [list_of_links, stock_names]

def get_article_text(link):
    """Download one article page and extract its body text and date.

    Args:
        link: URL of the article page.

    Returns:
        ``[text, date]``. ``[None, None]`` is returned when the response
        status is not 200 or the page has no
        ``<div class="WYSIWYG articlePage">`` container. ``date`` alone is
        ``None`` when the date element cannot be found.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    # A timeout keeps one unresponsive server from stalling a long scrape.
    response = requests.get(link, timeout=30)

    if response.status_code != 200:
        return [None, None]

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('div', class_='WYSIWYG articlePage')
    if article is None:
        # Page layout differs from what is expected; report "no data"
        # the same way as for a failed request.
        return [None, None]

    # Extracting article text: drop the first occurrence of each non-content
    # element so it does not end up in the extracted text.
    for tag_name, filters in (
        ("script", {}),
        ("div", {"class_": "imgCarousel"}),
        ("div", {"class_": "relatedInstrumentsWrapper"}),
    ):
        unwanted = article.find(tag_name, **filters)
        if unwanted is not None:
            unwanted.extract()

    # The first paragraph is removed when it contains an <em> element.
    # NOTE(review): presumably an italic disclaimer/teaser line - confirm.
    paragraph = article.find("p")
    if paragraph is not None and paragraph.find("em") is not None:
        paragraph.extract()
    text_inside_div = article.get_text()

    # Extracting article date: taken from the <span> inside the second
    # "contentSectionDetails" block.
    date = None
    cont_sect_det = soup.find_all("div", class_ = "contentSectionDetails")
    if len(cont_sect_det) > 1:
        date_span = cont_sect_det[1].find("span")
        if date_span is not None:
            # Drops the first 10 and last 11 characters.
            # NOTE(review): presumably a "Published " prefix and a trailing
            # time-of-day suffix - confirm against the live page.
            date = date_span.text[10:-11]

    return [text_inside_div.strip(), date]
    
def get_article_texts(links, tracker = False):
    """Run ``get_article_text`` on every link, preserving input order.

    Args:
        links: Iterable of article URLs.
        tracker: When truthy, print ``<position> / <total>`` after each
            article has been processed.

    Returns:
        List holding the result of ``get_article_text`` for each link.
    """
    def _fetch(position, url):
        # Fetch first, report afterwards, so the counter reflects finished work.
        row = get_article_text(url)
        if tracker:
            print(position, "/", len(links))
        return row

    return [_fetch(position, url) for position, url in enumerate(links, start=1)]

def get_training_links(pages):
    """Collect article links from the stock-market news listing pages.

    Visits ``https://www.investing.com/news/stock-market-news/{page}`` for
    ``page`` = 1 .. ``pages`` in Firefox and reads the ``href`` of the
    ``title`` element of every ``articleItem`` inside the element with id
    ``leftColumn``.

    Args:
        pages: Number of listing pages to visit, starting at page 1.

    Returns:
        List of article URLs in page order. Items without a ``title``
        element or without an ``href`` value are skipped.
    """
    driver = webdriver.Firefox()
    links = []

    try:
        for page in range(1, pages + 1):

            url_path = f'https://www.investing.com/news/stock-market-news/{page}'
            driver.get(url_path)

            left_column = driver.find_element(by=By.ID, value="leftColumn")
            article_items = left_column.find_elements(by = By.CLASS_NAME, value = "articleItem")
            for article in article_items:
                try:
                    link = article.find_element(by=By.CLASS_NAME, value="title")
                    href = link.get_attribute("href")
                except Exception:
                    # Best effort: skip items that have no readable title
                    # link. Catching Exception rather than everything lets
                    # Ctrl-C / SystemExit still stop a long crawl.
                    continue
                if href:
                    # A missing href would only fail later when the link
                    # is requested, so it is dropped here.
                    links.append(href)
    finally:
        # Always close the browser, even if a page fails to load.
        driver.quit()

    return links

### Part 1: Gathering the Training Data

First, we will collect a set of article links by scraping the first 100 pages of https://www.investing.com/news/stock-market-news/{page}, where {page} takes every value from 1 to 100.

In [3]:
# Crawl the first 100 listing pages and report how many links were gathered.
training_links = get_training_links(pages=100)
print("Number of Articles:", len(training_links))

Number of Articles: 1718


Using the list of links, we will now visit each link and extract the article text and date.

In [4]:
# Fetch [text, date] for every collected link (progress is printed per
# article) and stack the rows into an array so columns can be sliced later.
training_text = np.array(get_article_texts(training_links, tracker=True))
training_text.shape

1 / 1718
2 / 1718
3 / 1718
4 / 1718
5 / 1718
6 / 1718
7 / 1718
8 / 1718
9 / 1718
10 / 1718
11 / 1718
12 / 1718
13 / 1718
14 / 1718
15 / 1718
16 / 1718
17 / 1718
18 / 1718
19 / 1718
20 / 1718
21 / 1718
22 / 1718
23 / 1718
24 / 1718
25 / 1718
26 / 1718
27 / 1718
28 / 1718
29 / 1718
30 / 1718
31 / 1718
32 / 1718
33 / 1718
34 / 1718
35 / 1718
36 / 1718
37 / 1718
38 / 1718
39 / 1718
40 / 1718
41 / 1718
42 / 1718
43 / 1718
44 / 1718
45 / 1718
46 / 1718
47 / 1718
48 / 1718
49 / 1718
50 / 1718
51 / 1718
52 / 1718
53 / 1718
54 / 1718
55 / 1718
56 / 1718
57 / 1718
58 / 1718
59 / 1718
60 / 1718
61 / 1718
62 / 1718
63 / 1718
64 / 1718
65 / 1718
66 / 1718
67 / 1718
68 / 1718
69 / 1718
70 / 1718
71 / 1718
72 / 1718
73 / 1718
74 / 1718
75 / 1718
76 / 1718
77 / 1718
78 / 1718
79 / 1718
80 / 1718
81 / 1718
82 / 1718
83 / 1718
84 / 1718
85 / 1718
86 / 1718
87 / 1718
88 / 1718
89 / 1718
90 / 1718
91 / 1718
92 / 1718
93 / 1718
94 / 1718
95 / 1718
96 / 1718
97 / 1718
98 / 1718
99 / 1718
100 / 1718
101 / 17

(1718, 2)

### Part 2: Storing data as a Dataframe and Saving

In [10]:
# Column 0 of training_text holds the article text and column 1 the date;
# pair each with its source link, in the column order Link, Date, Text.
text_data_dict = dict(zip(
    ("Link", "Date", "Text"),
    (training_links, training_text[:, 1], training_text[:, 0]),
))

text_data = pd.DataFrame(text_data_dict)
display(text_data.head())
# Persist the scraped articles so they can be reloaded without re-scraping.
text_data.to_json("trainingArticles.json")

Unnamed: 0,Link,Date,Text
0,https://www.investing.com/news/stock-market-ne...,"Dec 04, 2023","Company OverviewEnphase Energy, Inc. (NASDAQ:E..."
1,https://www.investing.com/news/stock-market-ne...,"Dec 04, 2023",SolarEdge Technologies (NASDAQ:SEDG) Inc. has ...
2,https://www.investing.com/news/stock-market-ne...,"Dec 04, 2023","In the dynamic world of technology, Apple Inc ..."
3,https://www.investing.com/news/stock-market-ne...,"Dec 10, 2023",By Anirban Sen (Reuters) - U.S. health insurer...
4,https://www.investing.com/news/stock-market-ne...,"Dec 04, 2023",In the dynamic landscape of software and digit...


At this point, we will save the data and stop the notebook here. That way, if something goes wrong in the next steps (which will be in a new notebook), we can start over by loading the saved data instead of regenerating it by re-running all of the code above.