In [1]:
# Packages
import requests
from bs4 import BeautifulSoup
import os

In [2]:
def get_news_urls(links_site):
    """Collect the article links found on one listing page.

    Downloads ``links_site``, parses it with the ``lxml`` parser and looks
    at the ``href`` of every ``<a>`` tag.

    Args:
        links_site: URL of the page to fetch.

    Returns:
        A list of the ``href`` values that contain ``'/article/'``, in
        document order, or ``None`` when the response is not OK
        (``resp.ok`` is false).
    """
    response = requests.get(links_site)
    if not response.ok:
        return None

    page = BeautifulSoup(response.content, 'lxml')

    news_urls = []
    for anchor in page.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href yield None and are skipped; of the rest,
        # only links pointing at an article are kept.
        if href is not None and '/article/' in href:
            news_urls.append(href)

    return news_urls

In [3]:
def scrape_news_text(news_url):
    """Download a page and return the text of its paragraphs.

    Args:
        news_url: URL of the page to fetch.

    Returns:
        The text of every ``<p>`` element on the page, in document order,
        joined with newline characters.
    """
    response = requests.get(news_url)

    # Parse the raw bytes with the lxml parser.
    article_soup = BeautifulSoup(response.content, 'lxml')

    return '\n'.join(tag.text for tag in article_soup.find_all('p'))

In [4]:
def scrape_all_articles(ticker, upper_page_limit=5):
    """Scrape the text of every news article listed for ``ticker``.

    Walks the paginated headline listing
    ``http://www.nasdaq.com/symbol/<ticker>/news-headlines`` (page 1 is the
    bare URL, later pages add ``?page=N``), gathers the article links from
    each page and then downloads the text of every distinct link.

    Args:
        ticker: Symbol inserted into the listing URL.
        upper_page_limit: Highest page number to request (inclusive).

    Returns:
        A list with one text string per distinct article URL, in the order
        the URLs were first seen. Empty when the first listing page could
        not be fetched or contains no article links.
    """
    landing_site = 'http://www.nasdaq.com/symbol/' + ticker + '/news-headlines'

    # get_news_urls returns None on a non-OK response; treat that the same
    # as an empty page instead of crashing on None.
    first_page_urls = get_news_urls(landing_site)
    if not first_page_urls:
        return []

    all_news_urls = list(first_page_urls)

    # Follow the remaining pages until one fails, comes back empty, or the
    # page limit is reached.
    index = 2
    while index <= upper_page_limit:
        current_site = landing_site + '?page=' + str(index)
        current_urls_list = get_news_urls(current_site)

        # Check BEFORE concatenating: a failed page yields None, which
        # cannot be added to a list.
        if not current_urls_list:
            break

        all_news_urls.extend(current_urls_list)
        index += 1

    # Drop duplicate links while keeping first-seen order, so the result is
    # deterministic from run to run.
    all_news_urls = list(dict.fromkeys(all_news_urls))

    # With the list of links in hand, scrape the text of each article.
    return [scrape_news_text(news_url) for news_url in all_news_urls]

In [5]:
def save_articles_to_files(articles, ticker, output_dir='article_data'):
    """Write each article to its own text file.

    Article ``i`` (0-based) is written to ``<output_dir>/<ticker><i>.txt``.
    Files are opened in append mode, so writing to a file that already
    exists adds to its content rather than replacing it.

    Args:
        articles: Iterable of article text strings.
        ticker: Prefix used for the file names.
        output_dir: Directory that receives the files; created if missing.
            Defaults to ``'article_data'``.
    """
    # The target directory may not exist yet on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    for article_counter, article in enumerate(articles):
        article_file = ticker + str(article_counter) + '.txt'
        file_name = os.path.join(output_dir, article_file)

        # `with` guarantees the handle is closed even if write() raises;
        # explicit UTF-8 keeps scraped text with non-ASCII characters from
        # failing on platforms whose default encoding cannot represent it.
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(article)

        # Report a 1-based running count of files written so far.
        print("Processed Article Number: ", article_counter + 1)


In [6]:
def get_processed_articles(articles):
    """Strip the site boilerplate surrounding each scraped article.

    For every article, keeps the text between the end of the first
    occurrence of the "Join the Nasdaq Community..." intro and the start of
    the first occurrence of the "Enter up to 25 symbols..." ad string.

    Args:
        articles: Iterable of raw article text strings.

    Returns:
        A list of trimmed strings, one per input article. When the intro
        is absent the text is kept from the beginning; when the ad string
        is absent the text is kept through the end.
    """
    ad_string = "Enter up to 25 symbols separated by commas or spaces in the text box"
    intro = "Join the Nasdaq Community today and get free, instant access to portfolios, stock ratings, real-time alerts, and more!"
    processed_results = []
    for a in articles:
        # str.find returns -1 when the marker is missing; using that value
        # directly as a slice bound would start mid-text or silently drop
        # the last character, so fall back to the article's own bounds.
        intro_pos = a.find(intro)
        start_index = intro_pos + len(intro) if intro_pos != -1 else 0

        ad_string_pos = a.find(ad_string)
        end_index = ad_string_pos if ad_string_pos != -1 else len(a)

        processed_results.append(a[start_index:end_index])

    return processed_results

In [7]:
def get_all_articles(tickers, num_pages_to_read):
    """Scrape and clean the articles for several tickers.

    Args:
        tickers: Iterable of ticker symbols.
        num_pages_to_read: Page limit handed to ``scrape_all_articles``
            for each ticker.

    Returns:
        One flat list holding the processed articles of every ticker, in
        the order the tickers were supplied.
    """
    collected = []
    for symbol in tickers:
        scraped = scrape_all_articles(symbol, num_pages_to_read)
        collected.extend(get_processed_articles(scraped))
    return collected


In [8]:
def get_company_set(article, ticker_dictionary, name_to_ticker):
    """Find which tickers are mentioned in an article.

    The article is split on whitespace and each token is compared exactly
    against the known company names, so a token with attached punctuation
    (e.g. ``'Google,'``) does not count as a mention.

    Args:
        article: Article text.
        ticker_dictionary: Mapping of ticker -> set of names for that
            company.
        name_to_ticker: Mapping of name -> ticker, used to translate each
            matched token.

    Returns:
        The set of tickers whose names occur as tokens in ``article``.
    """
    # Pool every known name into one set for fast membership tests.
    known_names = set()
    for names in ticker_dictionary.values():
        known_names |= names

    return {
        name_to_ticker[token]
        for token in article.split()
        if token in known_names
    }
        
    

In [9]:
def update_graph(edge_graph, company_set):
    """Increment the counter of every edge joining two co-mentioned companies.

    Each unordered pair drawn from ``company_set`` is looked up in
    ``edge_graph`` in either orientation; the matching key's count is
    raised by one. Pairs that are not keys of the graph are ignored.

    Args:
        edge_graph: Dict mapping ``(ticker, ticker)`` tuples to counts;
            modified in place.
        company_set: Collection of tickers found together in one article.
    """
    for first, second in itertools.combinations(company_set, 2):
        forward = (first, second)
        backward = (second, first)

        # An edge is stored under a single orientation, so try both.
        if forward in edge_graph:
            edge_graph[forward] += 1
        elif backward in edge_graph:
            edge_graph[backward] += 1

In [10]:
# Ticker symbol -> every spelling under which that company is recognised
# in article text.
ticker_dictionary = {
    'GOOGL': {'Google', 'Alphabet', 'GOOGL', 'googl', 'google'},
    'NFLX': {'Netflix', 'NFLX', 'netflix', 'nflx'},
    'MSFT': {'MSFT', 'Microsoft', 'microsoft', 'MICROSOFT', 'msft'},
    'AMZN': {'AMZN', 'Amazon', 'amazon', 'amzn'},
    'TSLA': {'TSLA', 'TESLA', 'Tesla', 'tesla', 'tsla'},
}

# Reverse lookup: spelling -> ticker symbol.
name_to_ticker = {
    alias: symbol
    for symbol, aliases in ticker_dictionary.items()
    for alias in aliases
}


In [11]:
import itertools

# Page limit handed to the scraper for each ticker.
num_pages_to_read = 10

# One zero-initialised counter for every unordered pair of tickers.
edges = itertools.combinations(ticker_dictionary.keys(), 2)
edge_graph = dict.fromkeys(edges, 0)

all_articles = get_all_articles(ticker_dictionary.keys(), num_pages_to_read)

# article_num ends up holding the number of articles processed
# (0 when nothing was scraped).
article_num = 0
for article_num, article in enumerate(all_articles, start=1):
    company_set = get_company_set(article, ticker_dictionary, name_to_ticker)
    update_graph(edge_graph, company_set)

print(edge_graph)

{('GOOGL', 'NFLX'): 47, ('GOOGL', 'MSFT'): 54, ('GOOGL', 'AMZN'): 69, ('GOOGL', 'TSLA'): 16, ('NFLX', 'MSFT'): 15, ('NFLX', 'AMZN'): 43, ('NFLX', 'TSLA'): 6, ('MSFT', 'AMZN'): 61, ('MSFT', 'TSLA'): 1, ('AMZN', 'TSLA'): 5}
