In [None]:
# Packages
import itertools
import os
import string

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def get_news_urls(links_site, timeout=10):
    """Collect the news-article links found on a single web page.

    Parameters
    ----------
    links_site : str
        URL of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the whole
        scrape indefinitely.

    Returns
    -------
    list of str or None
        Every ``href`` on the page that contains ``'/article/'``, in the
        order the anchors are returned by the parser (duplicates are kept).
        ``None`` when the response status is not OK.
    """
    resp = requests.get(links_site, timeout=timeout)

    if not resp.ok:
        return None

    # Convert the raw html to a BeautifulSoup object.
    soup = BeautifulSoup(resp.content, 'lxml')

    # Anchors without an href attribute yield None and are dropped before
    # the substring test, which would otherwise raise a TypeError.
    hrefs = (link.get('href') for link in soup.find_all('a'))

    # Keep only the links that point at news articles.
    return [url for url in hrefs if url is not None and '/article/' in url]

In [None]:
def scrape_news_text(news_url, timeout=10):
    """Download one article page and return the text of its paragraphs.

    Parameters
    ----------
    news_url : str
        URL of the article to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get`` so a
        stalled connection cannot block the scrape forever.

    Returns
    -------
    str
        The text of every ``<p>`` element joined with newlines, or an empty
        string when the response status is not OK (so an error page is not
        mistaken for an article).
    """
    resp = requests.get(news_url, timeout=timeout)

    if not resp.ok:
        print("Failed to scrape: ", news_url)
        return ''

    # Convert the raw html to a BeautifulSoup object.
    news_soup = BeautifulSoup(resp.content, 'lxml')

    news_text = '\n'.join(par.text for par in news_soup.find_all('p'))
    print("Finished scrapping: ", news_url)

    return news_text

In [None]:
def scrape_all_articles(ticker, upper_page_limit=5):
    """Scrape the text of every news article listed for a ticker.

    Walks the ticker's Nasdaq news-headlines pages, starting at the landing
    page and continuing with ``?page=2``, ``?page=3``, ... until a page
    yields no article links (or fails to load) or ``upper_page_limit`` is
    reached, then downloads each distinct article once.

    Parameters
    ----------
    ticker : str
        Ticker symbol inserted into the headlines URL.
    upper_page_limit : int, optional
        Highest page number that will be requested.

    Returns
    -------
    list of str
        One entry per distinct article URL, in first-seen order. Empty when
        the landing page fails to load or lists no articles.
    """
    landing_site = 'http://www.nasdaq.com/symbol/' + ticker + '/news-headlines'

    # get_news_urls returns None for a failed request; treat that the same
    # as "no links" instead of crashing on None.copy().
    first_page_urls = get_news_urls(landing_site)
    if not first_page_urls:
        return []

    all_news_urls = list(first_page_urls)

    # Loop through each sequential page, scraping the links from each.
    page = 2
    while page <= upper_page_limit:
        current_site = landing_site + '?page=' + str(page)
        current_urls = get_news_urls(current_site)

        # Stop BEFORE concatenating: a failed page is None, and list + None
        # raises TypeError.
        if not current_urls:
            break

        all_news_urls.extend(current_urls)
        print("Done processing page: ", page)
        page += 1

    # De-duplicate while keeping first-seen order so that repeated runs
    # return the articles in a reproducible order (set() order is arbitrary).
    unique_urls = list(dict.fromkeys(all_news_urls))

    # Now that we have the list of urls, scrape the text of each one.
    return [scrape_news_text(news_url) for news_url in unique_urls]

In [None]:
def save_articles_to_files(articles, ticker):
    """Write each article to ``article_data/<ticker>/<ticker><n>.txt``.

    Parameters
    ----------
    articles : iterable of str
        Article texts; the n-th one (counting from 0) is written to the
        file ``<ticker><n>.txt``.
    ticker : str
        Ticker symbol, used for both the sub-directory and the file prefix.

    Returns
    -------
    None
    """
    target_dir = os.path.join('article_data', ticker)
    # open() does not create missing parent directories, so make sure the
    # destination exists before the first write.
    os.makedirs(target_dir, exist_ok=True)

    for article_counter, article in enumerate(articles):
        article_file = ticker + str(article_counter) + '.txt'
        file_name = os.path.join(target_dir, article_file)

        # The context manager closes the file even if write() raises.
        with open(file_name, 'w') as f:
            f.write(article)

        # Progress is reported 1-based: the number of articles written so far.
        print("Processed Article Number: ", article_counter + 1)


In [None]:
def get_processed_articles(articles):
    """Strip the site boilerplate that surrounds each scraped article.

    The body of an article is taken to be the text after the community
    sign-up blurb (``intro``) and before the symbol-lookup prompt
    (``ad_string``).

    Parameters
    ----------
    articles : iterable of str
        Raw article texts.

    Returns
    -------
    list of str
        One trimmed text per input article, in the same order. When the
        intro blurb is missing the text is kept from its beginning; when
        the prompt is missing the text is kept through its end.
    """
    ad_string = "Enter up to 25 symbols separated by commas or spaces in the text box"
    intro = "Join the Nasdaq Community today and get free, instant access to portfolios, stock ratings, real-time alerts, and more!"
    processed_results = []
    for article in articles:
        # str.find returns -1 when the marker is absent; using that value
        # directly would yield a bogus offset, so fall back to the start.
        intro_pos = article.find(intro)
        start_index = intro_pos + len(intro) if intro_pos != -1 else 0

        # Only look for the trailing prompt after the body starts. A missing
        # prompt must not become the slice end -1 (which would silently drop
        # the last character), so fall back to the end of the text.
        ad_string_pos = article.find(ad_string, start_index)
        end_index = ad_string_pos if ad_string_pos != -1 else len(article)

        processed_results.append(article[start_index:end_index])

    return processed_results

In [None]:
def get_all_articles(tickers, num_pages_to_read):
    """Scrape and clean the news articles for several tickers.

    Each ticker is passed to ``scrape_all_articles`` together with
    ``num_pages_to_read``; the raw texts are then run through
    ``get_processed_articles``.

    Returns a single flat list holding the cleaned articles of all tickers,
    in the order the tickers were given.
    """
    collected = []
    for symbol in tickers:
        cleaned = get_processed_articles(
            scrape_all_articles(symbol, num_pages_to_read)
        )
        collected.extend(cleaned)
        print("Finished processing articles for: ", symbol)
    return collected


In [None]:
def get_company_set(article, tickers):
    """Return the set of tickers that are mentioned in an article.

    The article is split on whitespace and each token is compared
    case-sensitively against ``tickers``. A token that does not match as-is
    is compared again with surrounding punctuation removed, so mentions
    such as ``(WMT)``, ``VZ,`` or ``DIS.`` are recognised too.

    Parameters
    ----------
    article : str
        Article text.
    tickers : iterable of str
        Ticker symbols to look for.

    Returns
    -------
    set of str
        The tickers found in the article (empty when there are none).
    """
    ticker_set = set(tickers)
    words_found = set()
    for word in article.split():
        # An exact match wins, which keeps tickers that themselves start or
        # end with punctuation working.
        if word in ticker_set:
            words_found.add(word)
            continue

        # Only leading/trailing punctuation is removed, so symbols with
        # interior punctuation (e.g. a dot in the middle) stay intact.
        stripped = word.strip(string.punctuation)
        if stripped and stripped in ticker_set:
            words_found.add(stripped)
    return words_found
        
    

In [None]:
def update_graph(edge_graph, company_set):
    """Increment the edge count for every pair of companies in ``company_set``.

    ``edge_graph`` maps 2-tuples of companies to counts and is modified in
    place. An edge may be stored in either orientation, so both ``(a, b)``
    and ``(b, a)`` are looked up; the orientation produced by
    ``itertools.combinations`` is preferred when both keys exist. Pairs that
    are not already a key of ``edge_graph`` are left out - no new edges are
    created.
    """
    for first, second in itertools.combinations(company_set, 2):
        forward = (first, second)
        backward = (second, first)
        if forward in edge_graph:
            edge_graph[forward] += 1
        elif backward in edge_graph:
            edge_graph[backward] += 1

In [None]:
def read_articles_from_disk(data_directories):
    """Read every file found directly inside the given directories.

    Parameters
    ----------
    data_directories : iterable of str
        Directories to read from.

    Returns
    -------
    list of str
        The contents of each regular file, directory by directory in the
        order given and, within a directory, sorted by file name.
    """
    articles = []
    for directory in data_directories:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # result reproducible from run to run.
        for file_name in sorted(os.listdir(directory)):
            path = os.path.join(directory, file_name)

            # Sub-directories cannot be opened as text files, so skip them
            # instead of crashing.
            if not os.path.isfile(path):
                continue

            with open(path, 'r') as content_file:
                articles.append(content_file.read())
    return articles
    

In [None]:
# Ticker symbols whose news articles are scraped, and the value passed as the
# page limit for each ticker's headline listing.
tickers = ['WMT', 'WBA', 'VZ', 'V', 'UTX', 'PG', 'DIS']
num_pages_to_read = 100

# Scrape and save one ticker at a time so that each ticker's articles are
# written under its own article_data/<ticker>/ directory.
# NOTE: this cell performs network requests for every ticker and page.
for t in tickers: 
    # get_all_articles iterates over a list of tickers, so wrap the single symbol.
    ticker = [t]
    all_articles = get_all_articles(ticker, num_pages_to_read)
    save_articles_to_files(all_articles, t)