# Libraries

In [1]:
import pandas as pd
import re
from htmldate import find_date
from newspaper import Article
from bs4 import BeautifulSoup
import requests
import nltk
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\minhn\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
def download_news(urls, sentiment_analysis=True):
    if sentiment_analysis:
        articles_df = pd.DataFrame({'publish_date':[], 'title': [],'body_text': [], 'url':[],
                                    'neg':[], 'neu':[], 'pos':[], 'compound':[]})
    else:
        articles_df = pd.DataFrame({'publish_date':[], 'title': [],'body_text': [], 'url':[]})  
                                
    for link in urls:
        article = Article(link)
        article.download()
        
        try:
            article.parse()
            text = article.text

        except: 
            print("I didn't get this")
            continue
        
        try:
            date = find_date(link)
        except:
            print("Failed to get date")
            continue

        tmpdic = {'publish_date': date, 'title': article.title, 'body_text': text, 'url': link}

        if sentiment_analysis:
            # Initialise sentiment analyser  
            sid = SentimentIntensityAnalyzer()
            # Get positive, negative, neutral and compound scores
            polarity = sid.polarity_scores(text)
            # Update ticker with the new entry polarity
            tmpdic.update(polarity)
        
        # tmpdic now has all keys and values needed to populate the DataFrame
        articles_df.loc[articles_df.shape[0]] = tmpdic

        return articles_df

In [3]:
def search_for_raw_urls(ticker, page):
    raw_urls = []
    for i in page:
        search_url = f"https://www.google.com/search?q=yahoo+finance+{ticker}&tbm=nws&start={i}"
        r = requests.get(search_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        atags = soup.find_all('a')
        hrefs = [link['href'] for link in atags]
        raw_urls.extend(hrefs)
        time.sleep(5)
    return raw_urls

def clean_urls(urls, exclude_list=['maps', 'policies', 'preferences', 'accounts', 'support'],
                        include_list=["https://finance.yahoo.com/news/"]):
    val = []
    for url in urls:
        if any(exc in url for exc in include_list) and not any(exc in url for exc in exclude_list):
            res = re.findall(r'(https?://\S+html)', url)[0]
            val.append(res)
    return list(set(val))

def get_news_urls(ticker, page, exclude_list=['maps', 'policies', 'preferences', 'accounts', 'support'],
                  include_list=["https://finance.yahoo.com/news/"]):
    raw_urls = search_for_raw_urls(ticker, page)
    cleaned_urls = clean_urls(raw_urls, exclude_list, include_list)
    return cleaned_urls

In [6]:
raw_urls = search_for_raw_urls(ticker="aapl", page=range(100, 500, 10))
raw_urls

['/?sa=X&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4ZBA7CAI',
 '/search?q=yahoo+finance+aapl&start=100&sca_esv=782705a13c1e22f4&sca_upv=1&ie=UTF-8&tbm=nws&gbv=1&sei=sOwyZvq7DdPg2roPodWDqAI',
 '/search?q=yahoo+finance+aapl&sca_esv=782705a13c1e22f4&sca_upv=1&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4ZBD8BQgFKAA',
 '/search?q=yahoo+finance+aapl&sca_esv=782705a13c1e22f4&sca_upv=1&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4ZBD8BQgHKAI',
 '/search?q=yahoo+finance+aapl&sca_esv=782705a13c1e22f4&sca_upv=1&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4ZBD8BQgIKAM',
 '/search?q=yahoo+finance+aapl&sca_esv=782705a13c1e22f4&sca_upv=1&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4ZBD8BQgJKAQ',
 '/url?q=/search%3Fq%3Dyahoo%2Bfinance%2Baapl%26sca_esv%3D782705a13c1e22f4%26sca_upv%3D1%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwj608_g6O2FAxVTsFYBHaHqACU4Z

In [7]:
cleaned_urls = clean_urls(raw_urls)
cleaned_urls

['https://finance.yahoo.com/news/youtube-spotify-won-t-launch-220212119.html',
 'https://finance.yahoo.com/news/apple-inc-aapl-posts-record-215409563.html',
 'https://finance.yahoo.com/news/how-apple-stock-became-one-of-warren-buffetts-four-jewels-204336773.html',
 'https://finance.yahoo.com/news/apple-ends-yearlong-sales-slump-215349301.html',
 'https://finance.yahoo.com/news/apple-halts-online-sales-of-its-apple-watch-series-9-and-apple-watch-ultra-2-200021325.html',
 'https://finance.yahoo.com/news/calculating-intrinsic-value-apple-inc-130019817.html',
 'https://finance.yahoo.com/news/metas-strategy-resembles-apple-130159922.html',
 'https://finance.yahoo.com/news/explainer-why-boeing-apple-chipotle-amd-fedex-and-pinterest-are-trading-at-52-week-highs-110023297.html',
 'https://finance.yahoo.com/news/forget-apple-3-magnificent-seven-114500809.html',
 'https://finance.yahoo.com/news/apple-buy-sell-hold-131500421.html',
 'https://finance.yahoo.com/news/apple-aapl-registers-bigger-fall

In [8]:
file_name = "urls.txt"

current_urls = []

with open(file_name, "r") as file:
    # Read each line from the file and append it to the list
    for line in file:
        # Remove any trailing newline characters
        line = line.strip()
        current_urls.append(line)

cleaned_urls_new = [x for x in cleaned_urls if x not in current_urls]

with open(file_name, "a") as file:
    # Write each item in the additional content list to the file
    for item in cleaned_urls_new:
        file.write(item + "\n")

# Specify the file name
file_name_2 = "unprocessed_urls.txt"

# Open the file in write mode
with open(file_name_2, "w") as file:
    # Write each item in the list to the file
    for item in cleaned_urls_new:
        file.write(item + "\n")
        
cleaned_urls_new

['https://finance.yahoo.com/news/youtube-spotify-won-t-launch-220212119.html',
 'https://finance.yahoo.com/news/how-apple-stock-became-one-of-warren-buffetts-four-jewels-204336773.html',
 'https://finance.yahoo.com/news/apple-halts-online-sales-of-its-apple-watch-series-9-and-apple-watch-ultra-2-200021325.html',
 'https://finance.yahoo.com/news/metas-strategy-resembles-apple-130159922.html',
 'https://finance.yahoo.com/news/explainer-why-boeing-apple-chipotle-amd-fedex-and-pinterest-are-trading-at-52-week-highs-110023297.html',
 'https://finance.yahoo.com/news/forget-apple-3-magnificent-seven-114500809.html',
 'https://finance.yahoo.com/news/apple-buy-sell-hold-131500421.html',
 'https://finance.yahoo.com/news/apple-aapl-registers-bigger-fall-224519111.html',
 'https://finance.yahoo.com/news/apples-vision-pro-wont-launch-215100815.html',
 'https://finance.yahoo.com/news/apple-inc-aapl-fairly-valued-161234824.html',
 'https://finance.yahoo.com/news/heres-much-youd-invested-1000-12301280

In [9]:
news_df = download_news(cleaned_urls_new)
news_df

Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to get date
Failed to ge

Unnamed: 0,publish_date,title,body_text,url,neg,neu,pos,compound
0,2024-04-23,Apple may strike a historic deal with FIFA—and...,Apple is reportedly doubling down on live spor...,https://finance.yahoo.com/news/apple-may-strik...,0.013,0.889,0.097,0.9879


In [11]:
collected_news = pd.read_csv("output/news.csv")
collected_news

Unnamed: 0,publish_date,title,body_text,url,neg,neu,pos,compound
0,2024-01-15,Apple could have a new problem: Uncle Sam,Apple (AAPL) has long avoided the government-i...,https://finance.yahoo.com/news/apple-could-hav...,0.064,0.855,0.081,0.9568
1,2024-03-13,It’s not all doom and gloom for Apple in 2024,Apple (AAPL) is having a difficult 2024. Share...,https://finance.yahoo.com/news/its-not-all-doo...,0.02,0.841,0.139,0.9988
2,2024-03-29,Huawei Profit Surges as It Takes Share From Ap...,(Bloomberg) -- Huawei Technologies Co. sustain...,https://finance.yahoo.com/news/huawei-profit-s...,0.038,0.886,0.076,0.9135
3,2024-03-30,Forget Apple: I Think This Stock Should Replac...,"The ""Magnificent Seven"" stock club is almost a...",https://finance.yahoo.com/news/forget-apple-th...,0.048,0.814,0.138,0.998
4,2024-03-31,Apple's antitrust fight could threaten its sea...,The Department of Justice antitrust lawsuit ag...,https://finance.yahoo.com/news/apples-antitrus...,0.121,0.762,0.117,-0.9001
5,2024-03-31,Will ASML Be Worth More Than Apple by 2030?,ASML (NASDAQ: ASML) and Apple (NASDAQ: AAPL) a...,https://finance.yahoo.com/news/asml-worth-more...,0.045,0.853,0.102,0.9952
6,2024-04-01,"Top Stock Reports for Apple, NVIDIA & Eli Lilly","Monday, April 1, 2024\r\n\r\n\r\n\r\nThe Zacks...",https://finance.yahoo.com/news/top-stock-repor...,0.058,0.768,0.174,0.9992
7,2024-04-01,Apple (AAPL) Falls More Steeply Than Broader M...,The latest trading session saw Apple (AAPL) en...,https://finance.yahoo.com/news/apple-aapl-fall...,0.011,0.887,0.102,0.9934
8,2024-04-04,"Apple Inc (AAPL) CEO Timothy Cook Sells 196,41...","Apple Inc (NASDAQ:AAPL), a leading technology ...",https://finance.yahoo.com/news/apple-inc-aapl-...,0.017,0.888,0.095,0.9761
9,2024-04-09,Jim Cramer has harsh words for Tim Cook and Apple,"""Mad Money"" host Jim Cramer goes back and fort...",https://finance.yahoo.com/news/jim-cramer-hars...,0.044,0.843,0.114,0.9811


In [17]:
news_combine = pd.concat([collected_news, news_df], axis=0, ignore_index=True)
news_combine = news_combine.set_index("publish_date")
news_combine = news_combine.sort_index()
news_combine

In [20]:
news_combine.to_csv(r"output/news.csv")