#### **Download FinBERT model from Hugging Face**

In [None]:
import os

import subprocess

# Fetch the FinBERT model files once: the clone is skipped when the model
# binary is already present in ./finbert_model.
if not os.path.exists(os.path.abspath('./finbert_model/pytorch_model.bin')):
    # check=True raises CalledProcessError when git fails, instead of silently
    # continuing with a missing or incomplete model directory.
    subprocess.run(['git', 'lfs', 'install'], check=True)
    subprocess.run(
        ['git', 'clone', 'https://huggingface.co/ProsusAI/finbert', 'finbert_model'],
        check=True,
    )

#### **Add path to access FinBERT library**

In [None]:
import sys
import os

# Put the local ./finbert directory on the import path (presumably so the
# FinBERT library can be imported later). The membership test keeps repeated
# runs of this cell from appending the same entry again.
finbert_path = os.path.abspath('./finbert')
if finbert_path not in sys.path:
    sys.path.append(finbert_path)

#### **Download NLTK tokenizer**

In [None]:
import nltk
# Fetch the 'punkt' tokenizer resource through NLTK's downloader.
nltk.download('punkt')

#### **Import libraries**

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import datetime
from finbert.finbert import predict
from transformers import AutoModelForSequenceClassification

#### **Create data output directory**

In [None]:
# Directory under which the crawl results and predictions are written.
output_dir = './output'
# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

#### **Access to Yahoo Finance page**  
Open the Yahoo Finance page using the Chrome driver and get the main page news contents.  
Each news item is wrapped in an element with the `container` class, so first collect the elements with this class.  
Then find the `a` tag inside each element to get the link element, and read its title and link.  
Finally, the collected titles are saved to a CSV file.  

#### **TODO**  
Crawl news info from each category. Currently, this crawls only the main page news.

In [None]:
# Crawl the Yahoo Finance main page with headless Chrome, collect
# (title, link) pairs for the news entries, and save the titles to a file.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")

# Pass the options by keyword: the first positional parameter of
# webdriver.Chrome is not the same across Selenium releases.
driver = webdriver.Chrome(options=chrome_options)
# The timeout must be set before navigating, otherwise it does not apply to
# this first page load.
driver.set_page_load_timeout(30)
driver.get('https://finance.yahoo.com/')

elements = driver.find_elements(By.CLASS_NAME, 'container')

news_titles = []
for element in elements:
    anchors = element.find_elements(By.TAG_NAME, 'a')
    if not anchors:
        continue

    # Only the first link inside a container is used.
    first_anchor = anchors[0]
    link = first_anchor.get_attribute('href')
    if not link:
        # Without a URL the article cannot be opened later, so skip the entry.
        continue

    # get_attribute returns None when the attribute is absent; store an empty
    # title instead so joining the titles below cannot fail.
    title = first_anchor.get_attribute('title') or ''
    news_titles.append((title, link))

# One title per line, in a file named after the current timestamp.
output_path = os.path.abspath('./output/' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.csv')
with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(title for title, _ in news_titles))

print("Main page news loaded.")
print(f"{len(elements)} contents found.")

#### **Load each article's contents**

In [None]:
# Visit every collected link and gather the non-empty paragraph texts of the
# article body. Pages without a 'caas-body' element are skipped.
articles_index = []

for headline, url in news_titles:
    driver.get(url)
    body_candidates = driver.find_elements(By.CLASS_NAME, 'caas-body')
    if not body_candidates:
        continue

    # Only the first matching body element is read.
    paragraphs = []
    for paragraph in body_candidates[0].find_elements(By.TAG_NAME, 'p'):
        text = paragraph.text
        if text:
            paragraphs.append(text)

    articles_index.append((headline, paragraphs))
    print("Article parsed : " + headline)

#### **Save loaded article contents as file**

In [None]:
import os

# Write each parsed article to '<position>.txt' inside a new timestamped
# directory under ./output: the title, a blank line, then one paragraph per
# line.
article_base_path = os.path.abspath('./output/' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '/') + '/'
os.mkdir(article_base_path)

for position, (headline, paragraphs) in enumerate(articles_index):
    article_path = article_base_path + str(position) + '.txt'

    # The with-statement closes the file on exit.
    with open(article_path, 'w', encoding='utf-8') as f:
        f.write(headline + '\n\n')
        f.write('\n'.join(paragraphs))

#### **Predict positive/negative using FinBERT model**

In [None]:
# Load the model stored in ./finbert_model as a 3-label sequence classifier.
model_path = os.path.abspath('./finbert_model')
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, cache_dir=None)

# Run the prediction for every parsed article; predict is asked to write its
# result to '<position>.csv' in a new timestamped '_output' directory.
output_path = os.path.abspath('./output/' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '_output/') + '/'
os.mkdir(output_path)
for position, (_, paragraphs) in enumerate(articles_index):
    output_filepath = output_path + str(position) + '.csv'

    print(paragraphs)
    predict('\n'.join(paragraphs), model, write_to_csv=True, path=output_filepath)