# Libraries

In [1]:
import pandas as pd
import re
from htmldate import find_date
from newspaper import Article
from bs4 import BeautifulSoup
import requests
import nltk
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\minhn\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Implementation

In [2]:
def download_news(urls, sentiment_analysis=True):
    if sentiment_analysis:
        articles_df = pd.DataFrame({'publish_date':[], 'title': [],'body_text': [], 'url':[],
                                    'neg':[], 'neu':[], 'pos':[], 'compound':[]})
    else:
        articles_df = pd.DataFrame({'publish_date':[], 'title': [],'body_text': [], 'url':[]})  
                                
    for link in urls:
        article = Article(link)
        article.download()
        
        try:
            article.parse()
            text = article.text

        except: 
            print("I didn't get this")
            with open("output/news_collection/failed_getcontent.txt", "a") as file:
                file.write(link + "\n")
            continue
        
        try:
            date = find_date(link)
        except:
            print("Failed to get date")
            with open("output/news_collection/failed_getdate.txt", "a") as file:
                file.write(link + "\n")
            continue

        with open("output/news_collection/successful.txt", "a") as file:
            file.write(link + "\n")

        tmpdic = {'publish_date': date, 'title': article.title, 'body_text': text, 'url': link}

        if sentiment_analysis:
            # Initialise sentiment analyser  
            sid = SentimentIntensityAnalyzer()
            # Get positive, negative, neutral and compound scores
            polarity = sid.polarity_scores(text)
            # Update ticker with the new entry polarity
            tmpdic.update(polarity)
        
        # tmpdic now has all keys and values needed to populate the DataFrame
        articles_df.loc[articles_df.shape[0]] = tmpdic

    return articles_df

In [3]:
def search_for_raw_urls(ticker, page):
    raw_urls = []
    for i in page:
        search_url = f"https://www.google.com/search?q=yahoo+finance+{ticker}&tbm=nws&start={i}"
        r = requests.get(search_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        atags = soup.find_all('a')
        hrefs = [link['href'] for link in atags]
        raw_urls.extend(hrefs)
        time.sleep(5)
    return raw_urls

def clean_urls(urls, exclude_list=['maps', 'policies', 'preferences', 'accounts', 'support'],
                        include_list=["https://finance.yahoo.com/news/"]):
    val = []
    for url in urls:
        if any(exc in url for exc in include_list) and not any(exc in url for exc in exclude_list):
            res = re.findall(r'(https?://\S+html)', url)[0]
            res = res.replace("finance.yahoo.com", "www.yahoo.com")
            val.append(res)
    return list(set(val))

def get_news_urls(ticker, page, exclude_list=['maps', 'policies', 'preferences', 'accounts', 'support'],
                  include_list=["https://finance.yahoo.com/news/", "https://www.yahoo.com/news/"]):
    raw_urls = search_for_raw_urls(ticker, page)
    cleaned_urls = clean_urls(raw_urls, exclude_list, include_list)
    return cleaned_urls

# Scrapping News

In [59]:
run = True
raw_urls = None
if run:
    raw_urls = search_for_raw_urls(ticker="aapl", page=range(0, 0, 0))
raw_urls

['/?sa=X&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQOwgC',
 '/search?q=yahoo+finance+aapl&start=705&sca_esv=25cd74cf61df00be&ie=UTF-8&tbm=nws&gbv=1&sei=qF4zZqDEELOmvr0Pqb6zgAQ',
 '/search?q=yahoo+finance+aapl&sca_esv=25cd74cf61df00be&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQ_AUIBSgA',
 '/search?q=yahoo+finance+aapl&sca_esv=25cd74cf61df00be&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQ_AUIBygC',
 '/search?q=yahoo+finance+aapl&sca_esv=25cd74cf61df00be&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQ_AUICCgD',
 '/search?q=yahoo+finance+aapl&sca_esv=25cd74cf61df00be&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQ_AUICSgE',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2Baapl%26sca_esv%3D25cd74cf61df00be%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiguOy41e6FAxUzk68BHSnfDEA4wQUQiaAMCAooBQ&usg=AOvVaw3P2kvuiOr5MwDbXWIoQjyG',
 '/advance

In [60]:
run = True
cleaned_urls = None
if run:
    cleaned_urls = clean_urls(raw_urls)
cleaned_urls

[]

In [61]:
run = True
cleaned_urls_new = None
if run: 
    file_name = "output/news_collection/urls.txt"

    current_urls = []

    with open(file_name, "r") as file:
        # Read each line from the file and append it to the list
        for line in file:
            # Remove any trailing newline characters
            line = line.strip()
            current_urls.append(line)

    cleaned_urls_new = [x for x in cleaned_urls if x not in current_urls]

    with open(file_name, "a") as file:
        # Write each item in the additional content list to the file
        for item in cleaned_urls_new:
            file.write(item + "\n")

    # Specify the file name
    file_name_2 = "output/news_collection/unprocessed_urls.txt"

    # Open the file in write mode
    with open(file_name_2, "w") as file:
        # Write each item in the list to the file
        for item in cleaned_urls_new:
            file.write(item + "\n")
            
cleaned_urls_new

[]

In [62]:
run = True
news_df = None
if run:
    news_df = download_news(cleaned_urls_new)
news_df

Unnamed: 0,publish_date,title,body_text,url,neg,neu,pos,compound


In [63]:
collected_news = pd.read_csv("output/news.csv")
collected_news

Unnamed: 0,publish_date,title,body_text,url,neg,neu,pos,compound
0,2017-09-13,What really happened with Apple’s Face ID 'fai...,"Craig Federighi, Apple’s senior vice president...",https://www.yahoo.com/news/really-happened-app...,0.084,0.845,0.071,-0.9725
1,2017-11-17,Warren Buffett Bought Apple Inc. Stock. Why Ha...,Investing guru Warren Buffett has always been ...,https://www.yahoo.com/news/warren-buffett-buyi...,0.043,0.844,0.113,0.9944
2,2018-03-10,Read a job application from Steve Jobs from 3 ...,RR Auction/Getty\n\nA job application from an ...,https://www.yahoo.com/news/read-job-applicatio...,0.033,0.913,0.053,0.7140
3,2018-05-01,Calculating The Intrinsic Value Of Apple Inc (...,How far off is Apple Inc (NASDAQ:AAPL) from it...,https://www.yahoo.com/news/calculating-intrins...,0.013,0.812,0.175,0.9980
4,2018-08-02,Apple co-founder Steve Wozniak reacts to $1 tr...,Apple Inc. (AAPL) became the first publicly-tr...,https://www.yahoo.com/news/apple-co-founder-st...,0.017,0.872,0.111,0.9898
...,...,...,...,...,...,...,...,...
792,2024-05-01,Will Apple Beat on Q2 Earnings Amid iPhone Slu...,"All eyes are on technology giant Apple AAPL, w...",https://www.yahoo.com/news/apple-beat-q2-earni...,0.022,0.859,0.119,0.9988
793,2024-05-01,Apple to report Q2 earnings amid iPhone slowdo...,Apple (AAPL) will report its second quarter ea...,https://www.yahoo.com/news/apple-to-report-q2-...,0.017,0.921,0.061,0.9732
794,2024-05-02,Google’s payments to Apple reached $20 billion...,(Bloomberg) — Alphabet Inc. paid Apple Inc. $2...,https://www.yahoo.com/news/google-payments-app...,0.032,0.911,0.057,0.8443
795,2024-05-02,Google’s payments to Apple reached $20 billion...,(Bloomberg) — Alphabet Inc. paid Apple Inc. $2...,https://www.yahoo.com/news/google-payments-app...,0.032,0.911,0.057,0.8443


In [64]:
news_combine = pd.concat([collected_news, news_df], axis=0, ignore_index=True)
news_combine["publish_date"] = pd.to_datetime(news_combine["publish_date"])
news_combine = news_combine.set_index("publish_date")
news_combine = news_combine.sort_index()
news_combine

Unnamed: 0_level_0,title,body_text,url,neg,neu,pos,compound
publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-09-13,What really happened with Apple’s Face ID 'fai...,"Craig Federighi, Apple’s senior vice president...",https://www.yahoo.com/news/really-happened-app...,0.084,0.845,0.071,-0.9725
2017-11-17,Warren Buffett Bought Apple Inc. Stock. Why Ha...,Investing guru Warren Buffett has always been ...,https://www.yahoo.com/news/warren-buffett-buyi...,0.043,0.844,0.113,0.9944
2018-03-10,Read a job application from Steve Jobs from 3 ...,RR Auction/Getty\n\nA job application from an ...,https://www.yahoo.com/news/read-job-applicatio...,0.033,0.913,0.053,0.7140
2018-05-01,Calculating The Intrinsic Value Of Apple Inc (...,How far off is Apple Inc (NASDAQ:AAPL) from it...,https://www.yahoo.com/news/calculating-intrins...,0.013,0.812,0.175,0.9980
2018-08-02,Apple co-founder Steve Wozniak reacts to $1 tr...,Apple Inc. (AAPL) became the first publicly-tr...,https://www.yahoo.com/news/apple-co-founder-st...,0.017,0.872,0.111,0.9898
...,...,...,...,...,...,...,...
2024-05-01,Will Apple Beat on Q2 Earnings Amid iPhone Slu...,"All eyes are on technology giant Apple AAPL, w...",https://www.yahoo.com/news/apple-beat-q2-earni...,0.022,0.859,0.119,0.9988
2024-05-01,Apple to report Q2 earnings amid iPhone slowdo...,Apple (AAPL) will report its second quarter ea...,https://www.yahoo.com/news/apple-to-report-q2-...,0.017,0.921,0.061,0.9732
2024-05-02,Google’s payments to Apple reached $20 billion...,(Bloomberg) — Alphabet Inc. paid Apple Inc. $2...,https://www.yahoo.com/news/google-payments-app...,0.032,0.911,0.057,0.8443
2024-05-02,Google’s payments to Apple reached $20 billion...,(Bloomberg) — Alphabet Inc. paid Apple Inc. $2...,https://www.yahoo.com/news/google-payments-app...,0.032,0.911,0.057,0.8443


In [58]:
save = False
if save:
    news_combine.to_csv(r"output/news.csv")