In [1]:
import os 
import itertools 
import spacy

In [2]:
def read_articles_from_disk(data_directories):
    counter = 0
    articles = []
    for directory in data_directories: 
        file_list = os.listdir(directory)
        files_to_read = [os.path.join(directory, file) for file in file_list]
        for file in files_to_read: 
            with open(file, 'r') as content_file:
                content = content_file.read()
                articles.append(content)
    return articles

In [3]:
def get_paths_from_tickers(tickers):
    root_data_dir = 'article_data'
    data_directories = [os.path.join(root_data_dir, t) for t in tickers]
    return data_directories

In [4]:
def get_articles_for_tickers(tickers):
    path_names = get_paths_from_tickers(tickers)
    articles = read_articles_from_disk(path_names)
    return set(articles)

In [5]:
def get_sp500_list():
    companies = [s for s in pd.read_csv('constituents.csv', header=0)['Symbol']]
    companies.remove('A')
    companies.remove('T')
    companies.remove('GOOG')
    return companies

In [6]:
def get_tickers_in_directory():
    return ['AAPL', 'CSCO', 'GOOGL', 'INTC', 'MCD', 'NFLX', 'TRV', 'VZ', 'AXP', 'CVX', 'GS', 'JNJ', 'MMM', 'NKE', 'UNH', 'WBA', 'BA', 'DIS', 'HD', 'JPM', 'MRK', 'PFE', 'UTX', 'WMT', 
           'CAT', 'DOW', 'IBM', 'KO', 'MSFT', 'PG', 'V', 'XOM']

In [7]:
def get_company_set(article, tickers):
    words_found = set([])
    ticker_set = set(tickers)
    article = article.split()
    for word in article:
        if word in ticker_set: 
            words_found.add(word)
    return words_found

In [8]:
def get_all_sentences(articles):
    nlp = spacy.load("en_core_web_sm")
    all_sentences = []
    counter = 0
    
    for a in articles:
        print('Counter at: ', counter)
        doc = nlp(a)
        for s in doc.sents:
            all_sentences.append(s)
        counter += 1
    return all_sentences

In [9]:
tickers = get_tickers_in_directory()

all_articles = get_articles_for_tickers(tickers)

In [10]:
counts_dict = {t:0 for t in tickers}
for a in all_articles: 
    for key in counts_dict: 
        if key in a: 
            counts_dict[key] += 1

In [11]:
rankings = sorted([(v, k) for k, v in counts_dict.items()], reverse=True)

In [13]:
rankings

[(14645, 'V'),
 (4146, 'BA'),
 (2567, 'AAPL'),
 (1620, 'JPM'),
 (1607, 'MSFT'),
 (1592, 'PG'),
 (1590, 'GS'),
 (1585, 'GOOGL'),
 (1459, 'DIS'),
 (1403, 'NFLX'),
 (1373, 'KO'),
 (1334, 'WMT'),
 (1300, 'CAT'),
 (1278, 'IBM'),
 (1203, 'DOW'),
 (1196, 'HD'),
 (1193, 'XOM'),
 (1140, 'MCD'),
 (1134, 'CVX'),
 (1114, 'INTC'),
 (1105, 'CSCO'),
 (1093, 'JNJ'),
 (1063, 'PFE'),
 (1028, 'VZ'),
 (986, 'MRK'),
 (962, 'MMM'),
 (957, 'NKE'),
 (956, 'AXP'),
 (904, 'UNH'),
 (881, 'WBA'),
 (877, 'TRV'),
 (861, 'UTX')]

In [14]:
pairs_dict = {}
tripples_dict = {}

for a in all_articles: 
    cs = get_company_set(a, tickers)
    pairs = itertools.combinations(cs, 2)
    tripples = itertools.combinations(cs, 3)
    
    for p in pairs: 
        pp = sorted(p)
        pp = tuple(pp)
        if pp not in pairs_dict:
            pairs_dict[pp] = 0
        pairs_dict[pp] += 1
    
    for t in tripples:
        tt = sorted(t)
        tt = tuple(tt)
        if tt not in tripples_dict:
            tripples_dict[tt] = 0
        tripples_dict[tt] += 1

In [18]:
rankings_pair = sorted([(v, k) for k, v in pairs_dict.items()], reverse=True)
rankings_pair

for p in rankings_pair: 
    if 'NKE' in list(p[1]):
        print(p)

(63, ('AAPL', 'NKE'))
(41, ('NKE', 'WMT'))
(40, ('MCD', 'NKE'))
(36, ('KO', 'NKE'))
(34, ('DIS', 'NKE'))
(32, ('NFLX', 'NKE'))
(32, ('GOOGL', 'NKE'))
(26, ('BA', 'NKE'))
(25, ('NKE', 'V'))
(23, ('HD', 'NKE'))
(19, ('MSFT', 'NKE'))
(18, ('INTC', 'NKE'))
(17, ('NKE', 'VZ'))
(16, ('NKE', 'WBA'))
(15, ('JPM', 'NKE'))
(14, ('IBM', 'NKE'))
(12, ('NKE', 'UNH'))
(12, ('GS', 'NKE'))
(12, ('CAT', 'NKE'))
(11, ('NKE', 'PG'))
(10, ('MMM', 'NKE'))
(9, ('JNJ', 'NKE'))
(9, ('AXP', 'NKE'))
(8, ('NKE', 'UTX'))
(8, ('NKE', 'TRV'))
(8, ('MRK', 'NKE'))
(8, ('CSCO', 'NKE'))
(7, ('CVX', 'NKE'))
(5, ('NKE', 'XOM'))
(4, ('NKE', 'PFE'))
(3, ('DOW', 'NKE'))


In [None]:
rankings_tripple = sorted([(v, k) for k, v in tripples_dict.items()], reverse=True)
rankings_tripple

In [None]:
sentences = get_all_sentences(all_articles)

In [None]:

print(len(sentences))

In [None]:
def get_financial_metric_dict(tickers, metric):
    result_dict = {}
    for t in tickers: 
        data = financial_data.get_quarterly_data(t)
        if t not in result_dict:
            if data:
                result_dict[t] = float(data[metric])
    return result_dict
import financial_data_api as fd
inspect_set = get_tickers_in_directory()
ebit_data = get_financial_metric_dict(inspect_set, 'EPS Growth')