In [None]:
import os
import pandas as pd
import itertools
import networkx as nx
import financial_data_api as fd
import numpy as np
import matplotlib.pyplot as plt 

# Module-level FinancialData instance; get_financial_metric_dict calls its
# get_quarterly_data method for each ticker.
financial_data = fd.FinancialData()

In [None]:
def read_articles_from_disk(data_directories):
    """Read every regular file found directly inside the given directories.

    Args:
        data_directories: iterable of directory paths to scan (non-recursive).

    Returns:
        list of str: the full text of each file, in the order the directories
        were given and, within a directory, in ``os.listdir`` order.

    Raises:
        OSError: if a directory cannot be listed or a file cannot be opened.
    """
    articles = []
    for directory in data_directories:
        for file_name in os.listdir(directory):
            path = os.path.join(directory, file_name)
            # Skip sub-directories and other non-file entries: calling open()
            # on them raises and would abort the whole load.
            if not os.path.isfile(path):
                continue
            # Text mode with the platform default encoding, as before.
            with open(path, 'r') as content_file:
                articles.append(content_file.read())
    return articles

In [None]:
def get_paths_from_tickers(tickers, root_data_dir='article_data'):
    """Map ticker symbols to the directories that hold their articles.

    Args:
        tickers: iterable of ticker strings; each one is used as a
            sub-directory name.
        root_data_dir: directory containing one sub-directory per ticker.
            Defaults to ``'article_data'``, the location that used to be
            hard-coded here.

    Returns:
        list of str: ``root_data_dir/<ticker>`` for every ticker, in input
        order.
    """
    return [os.path.join(root_data_dir, ticker) for ticker in tickers]

In [None]:
def get_articles_for_tickers(tickers):
    """Load the distinct article texts stored on disk for ``tickers``.

    The per-ticker directories come from ``get_paths_from_tickers`` and their
    contents are read with ``read_articles_from_disk``. The texts are returned
    as a set, so identical articles are kept only once.
    """
    article_dirs = get_paths_from_tickers(tickers)
    return set(read_articles_from_disk(article_dirs))

In [None]:
def get_sp500_list():
    """Return the symbols listed in ``constituents.csv`` minus a fixed exclusion list.

    Reads the ``Symbol`` column of ``constituents.csv`` (resolved relative to
    the current working directory) and drops 'A', 'T' and 'GOOG'.

    NOTE(review): presumably 'A' and 'T' are excluded because they collide
    with ordinary words when articles are tokenised, and 'GOOG' because
    'GOOGL' is already present — confirm.

    Returns:
        list: the remaining symbols, in file order.
    """
    excluded = {'A', 'T', 'GOOG'}
    symbols = pd.read_csv('constituents.csv', header=0)['Symbol']
    # Filter instead of list.remove(): remove() raises ValueError as soon as
    # one of the excluded symbols is missing from the constituents file.
    return [symbol for symbol in symbols if symbol not in excluded]

In [None]:
def get_company_set(article, tickers):
    """Return the tickers that occur as whitespace-delimited words in ``article``.

    Args:
        article: article text.
        tickers: iterable of ticker strings to look for.

    Returns:
        set of str: every ticker equal to one of the tokens of
        ``article.split()``. Matching is exact, so a token with attached
        punctuation such as ``"AAPL,"`` does not match ``"AAPL"``.
    """
    return set(article.split()).intersection(tickers)


def get_occurrence_dict(articles, tickers=None):
    """Count, for every pair of tickers, how many articles mention both.

    Args:
        articles: iterable of article texts.
        tickers: iterable of ticker strings to search for. When omitted, the
            module-level ``sp_500_list`` is used, which was the previous
            hard-wired behaviour.

    Returns:
        dict: maps ``(ticker_a, ticker_b)``, a tuple in sorted order, to the
        number of articles in which both tickers appear.
    """
    if tickers is None:
        tickers = sp_500_list
    # Build the lookup set once rather than once per article.
    ticker_set = set(tickers)
    counts_dict = {}
    for article in articles:
        # Sorting first makes every combination come out as a canonical,
        # already-ordered pair, so (A, B) and (B, A) share one key.
        companies = sorted(get_company_set(article, ticker_set))
        for pair in itertools.combinations(companies, 2):
            counts_dict[pair] = counts_dict.get(pair, 0) + 1
    return counts_dict

In [None]:
def get_k_closest_neighbors(company_graph, node_source, num_neighbors):
    """Return up to ``num_neighbors`` nodes closest to ``node_source``.

    Closeness is the weighted shortest-path length over the ``'weight'`` edge
    attribute, as computed by ``nx.shortest_path_length``.

    Args:
        company_graph: graph passed straight to ``nx.shortest_path_length``.
        node_source: node whose neighbourhood is wanted.
        num_neighbors: maximum number of nodes to return; values <= 0 give an
            empty list.

    Returns:
        list: reachable nodes other than ``node_source``, nearest first.
    """
    distances = dict(
        nx.shortest_path_length(company_graph, source=node_source, weight='weight')
    )
    # Rank by distance explicitly instead of trusting the iteration order of
    # the returned mapping, and drop the source by identity rather than by
    # assuming it is the first entry. sorted() is stable, so ties keep the
    # order in which the library reported them.
    ranked = sorted(
        (item for item in distances.items() if item[0] != node_source),
        key=lambda item: item[1],
    )
    return [node for node, _ in ranked[:max(num_neighbors, 0)]]

In [None]:
def get_financial_metric_dict(tickers, metric):
    """Fetch one quarterly metric per ticker from the module-level ``financial_data``.

    Args:
        tickers: iterable of ticker strings.
        metric: key to read from the object returned by
            ``financial_data.get_quarterly_data``.

    Returns:
        dict: maps ticker to ``float(data[metric])``. Tickers whose lookup
        returns a falsy value are omitted.
    """
    result_dict = {}
    for ticker in tickers:
        # Check for an existing entry before fetching, so a repeated ticker
        # does not trigger a second, discarded API call.
        if ticker in result_dict:
            continue
        data = financial_data.get_quarterly_data(ticker)
        if data:
            result_dict[ticker] = float(data[metric])
    return result_dict

In [None]:
# Tickers whose article directories are read from disk. The earlier
# single-ticker assignment was overwritten immediately and has been dropped.
tickers = [
    'BA', 'AAPL', 'GOOGL', 'CVX', 'MSFT', 'NFLX', 'XOM', 'GS', 'CAT', 'MMM',
    'KO', 'DOW', 'HD', 'CSCO', 'AXP', 'TRV', 'MRK', 'UNH', 'PFE', 'NKE',
    'MCD', 'JPM', 'JNJ', 'INTC', 'IBM',
]

# Distinct article texts for those tickers, and the symbol universe that is
# searched for inside each article.
all_articles = get_articles_for_tickers(tickers)
sp_500_list = get_sp500_list()

In [None]:
# Pairwise co-mention counts across the loaded articles.
co_occurrence_dict = get_occurrence_dict(all_articles)
# Every ticker that takes part in at least one co-mentioned pair.
node_set = {ticker for ticker_pair in co_occurrence_dict for ticker in ticker_pair}

In [None]:
# Undirected graph with one node per ticker seen in any co-mentioned pair.
co_occurrence_graph = nx.Graph()
co_occurrence_graph.add_nodes_from(node_set)

# Highest co-mention count; used to scale every count into (0, 1].
max_score = max(co_occurrence_dict.values())

# Turn counts into distances: the most frequently co-mentioned pair gets
# weight 0, and rarer pairs get weights closer to 1.
for (company_A, company_B), score in co_occurrence_dict.items():
    distance = 1 - (score / max_score)
    co_occurrence_graph.add_edge(company_A, company_B, weight=distance)

In [None]:
# Number of closest graph neighbours gathered per node (passed to
# get_k_closest_neighbors).
k = 10

In [None]:
# Ticker -> 'EPS Growth' value for the graph's tickers; tickers with no data
# are absent from the dict.
# NOTE(review): the variable name says EBIT but the metric requested is
# 'EPS Growth' — confirm which one is intended.
ebit_data = get_financial_metric_dict(node_set, 'EPS Growth')

In [None]:
results = []  # (node, standard deviation of its neighbours' metric) tuples
n_dict = {}   # node -> list of its k closest neighbours
for node in co_occurrence_graph.nodes():
    closest = get_k_closest_neighbors(co_occurrence_graph, node, k)
    n_dict[node] = closest
    # Neighbours without financial data are left out of the spread.
    neighbor_stats = [ebit_data[name] for name in closest if name in ebit_data]
    results.append((node, np.std(neighbor_stats)))

In [None]:
# Display the (node, deviation) tuples collected in the previous cell.
results

In [None]:
# Inspect the neighbours recorded for 'VZ'. n_dict is keyed by graph node, so
# this raises KeyError if 'VZ' never appeared in a co-mentioned pair.
n_dict['VZ']

In [None]:
# Tickers to show in the chart (this looks like the Dow 30 list — confirm).
# 'JBM' was a typo for 'JPM', which silently dropped JPM from the plot.
inspect_set = {
    'MMM', 'AXP', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO', 'KO', 'DOW', 'XOM',
    'GS', 'HD', 'IBM', 'INTC', 'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'NKE',
    'PFE', 'PG', 'TRV', 'UNH', 'UTX', 'VZ', 'V', 'WMT', 'WBA', 'DIS',
}

# Keep only the inspected tickers, ordered by ascending deviation.
top = sorted([r for r in results if r[0] in inspect_set], key=lambda x: x[1])

# zip(*[]) yields nothing to unpack, so fall back to empty sequences when no
# inspected ticker has a result.
labels, ys = zip(*top) if top else ((), ())
xs = np.arange(len(labels))
width = 1

plt.bar(xs, ys, width, align='center')

# Put one tick under each bar and label it with the ticker symbol.
plt.xticks(xs, labels, rotation='vertical')

plt.savefig('netscore.png')


In [None]:
# Show the same (ticker, deviation) entries as `top`, ordered alphabetically
# by ticker.
sorted(top, key=lambda x:x[0])