### This notebook analyses the correlation between companies, builds a graph-like visualisation and stores the resulting data. The process works with any list of companies: insert your own list of tickers and follow the notebook to produce the visualisation and the .csv results.

### Steps: 
- download historical quotes for each company ticker
- combine them into a single dataframe
- calculate the correlation between companies
- convert the correlation table into pairs of correlated companies

#### Import all necessary libraries

In [None]:
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from alpha_vantage.timeseries import TimeSeries
import csv
import time
# Alpha Vantage API key passed to get_daily_quotes().
# Prefer the ALPHAVANTAGE_API_KEY environment variable so the credential does
# not have to live in the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): this key is stored in plain text - consider rotating it.
key = os.environ.get('ALPHAVANTAGE_API_KEY', '7WKARC4DTBTJVW54')

In [None]:
def remove_spaces(text_list):
    """Return the entries of ``text_list`` that are not the empty string.

    Order is preserved. Whitespace-only entries are kept: only ``''`` itself
    is dropped.
    """
    return [entry for entry in text_list if entry != '']

#### Parse wikipedia S&P page to get the latest index information

In [None]:
def get_sp500_data(path='data/sp500.csv'):
    """Scrape the S&P 500 constituents table from Wikipedia into a CSV file.

    The first table row is written as the CSV header, every following row as
    one record. Cells are read one by one so that an empty cell stays an empty
    field instead of shifting the remaining columns to the left.

    Args:
        path: Destination CSV file. The file is overwritten on every call, so
            re-running the cell does not append a second copy of the table.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the constituents table is not found in the page.
    """
    sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(sp500_url, timeout=30)
    response.raise_for_status()

    sp500_soup = BeautifulSoup(response.text, 'lxml')
    sp500_table = sp500_soup.find('table', {'class': 'wikitable sortable'})
    if sp500_table is None:
        raise ValueError('S&P 500 table not found at ' + sp500_url)

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # newline='' is what the csv module expects; without it every record is
    # followed by a blank line on Windows.
    with open(path, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        for row in sp500_table.find_all('tr'):
            cells = [cell.get_text(strip=True)
                     for cell in row.find_all(['th', 'td'])]
            if cells:
                writer.writerow(cells)
    print('Done!')


get_sp500_data()

#### Get list of SP tickers


In [None]:
def get_tickers_list(tickers, path='sp500.csv'):
    """Return ``tickers`` followed by the symbols stored in the S&P 500 CSV.

    The symbol is the first field of every record. The first non-empty row is
    treated as the header and skipped, as is any later row identical to it
    (which happens when the file was written more than once in append mode).
    Blank rows are ignored.

    Args:
        tickers: Symbols to put in front of the ones read from the file
            (normally an empty list). The argument itself is not modified.
        path: CSV file to read.

    Returns:
        A new list of ticker symbols.
    """
    # NOTE(review): get_sp500_data() writes to 'data/sp500.csv' by default,
    # while this default reads 'sp500.csv' - confirm which one is intended.
    symbols = list(tickers)
    header = None
    with open(path, 'r', encoding='utf-8', newline='') as fp:
        for row in csv.reader(fp):
            if not row:
                continue
            if header is None:
                header = row
                continue
            if row == header:
                continue
            symbols.append(row[0])
    return symbols

# Load the ticker symbols, starting from an empty list.
tickers = get_tickers_list([])

In [None]:
tickers

#### Download quotes

In [None]:
def get_daily_quotes(key, tickers, start=None, end=None, save=True, pause=14):
    """Download full daily quote history for each ticker from Alpha Vantage.

    Args:
        key: Alpha Vantage API key.
        tickers: A single symbol or a collection of symbols.
        start: Optional first index label to keep. ``None`` means no lower
            bound.
        end: Optional last index label to keep. ``None`` means no upper
            bound. It is honoured even when ``start`` is not given.
        save: When true, write each frame to
            ``data/historical_daily_quotes/<ticker>.csv``.
        pause: Seconds to wait after every request (presumably to stay under
            the API's request-rate limit - confirm against your plan).

    Returns:
        dict mapping each successfully downloaded ticker to its DataFrame.
    """
    if not isinstance(tickers, (list, pd.Series)):
        tickers = pd.Series(tickers)

    out_dir = 'data/historical_daily_quotes'
    if save:
        os.makedirs(out_dir, exist_ok=True)

    # One client is enough; it does not depend on the ticker.
    ts = TimeSeries(key, output_format='pandas')
    quotes = {}
    for ticker in tickers:
        print(ticker)
        try:
            core, _meta = ts.get_daily(symbol=ticker, outputsize='full')
        except ValueError as err:
            # NOTE(review): alpha_vantage is expected to raise ValueError when
            # the API rejects a request - confirm for the installed version.
            print('{}  not found! ({})'.format(ticker, err))
        else:
            if start is not None or end is not None:
                # A None bound leaves that side of the slice open.
                core = core.loc[start:end]
            if save:
                core.to_csv('{}/{}.csv'.format(out_dir, ticker))
            quotes[ticker] = core
            print(ticker + ' historical quotes were downloaded!')
        time.sleep(pause)
    return quotes


get_daily_quotes(key, tickers)

In [None]:
tickers

#### Combine files to get one dataframe

In [None]:
# Expected columns of every quote file:
# date,1. open,2. high,3. low,4. close,5. volume
def compile_data(tickers, quotes_dir='data/historical_daily_quotes1',
                 output_path='sp500_joined.csv'):
    """Combine the daily highs of all tickers into one date-indexed table.

    For every ticker the file ``<quotes_dir>/<ticker>.csv`` is read and its
    ``'2. high'`` column becomes a column named after the ticker. The columns
    are aligned on the union of all dates (outer join) and the result is
    written to ``output_path``.

    Args:
        tickers: Ticker symbols or file names; anything after the first ``.``
            is discarded (so ``'AAPL.csv'`` becomes ``'AAPL'``).
        quotes_dir: Directory that holds the per-ticker CSV files.
        output_path: Destination of the joined CSV.

    Returns:
        The joined DataFrame (empty when no file could be read).
    """
    # NOTE(review): get_daily_quotes() saves into
    # 'data/historical_daily_quotes' (no trailing 1) - confirm which directory
    # is intended as the default here.
    columns = {}
    for count, ticker in enumerate(tickers):
        # NOTE(review): this also shortens dotted symbols, e.g. 'BRK.B'
        # becomes 'BRK'.
        ticker = ticker.split('.')[0]
        print(ticker)
        if ticker in columns:
            print('{}: duplicate ticker skipped'.format(ticker))
            continue
        path = os.path.join(quotes_dir, '{}.csv'.format(ticker))
        try:
            quotes = pd.read_csv(path, index_col='date')
            columns[ticker] = quotes['2. high'].rename(ticker)
        except (OSError, KeyError, ValueError) as err:
            # Missing file, missing column or unparsable content: report it
            # and carry on with the remaining tickers.
            print('{}: skipped ({})'.format(ticker, err))
            continue

        if count % 5 == 0:
            print(count)

    if columns:
        # A single concat replaces the repeated pairwise outer joins.
        main_df = pd.concat(list(columns.values()), axis=1, sort=True)
        main_df.index.name = 'date'
    else:
        main_df = pd.DataFrame()
    print(main_df.head())
    main_df.to_csv(output_path)
    return main_df
    
# Build sp500_joined.csv from the per-ticker quote files.
compile_data(tickers)        

In [None]:
# Reload the joined table from disk; without index_col the dates come back
# as an ordinary 'date' column (it is set as the index in a later cell).
main_df = pd.read_csv('sp500_joined.csv')
main_df.tail()

In [None]:
# Keep the rows from position 4500 onwards and renumber them from 0;
# drop=True discards the old row labels instead of adding an 'index' column
# that would have to be removed again.
main_df = main_df.iloc[4500:].reset_index(drop=True)

In [None]:
main_df

#### calculate matrix of correlation

In [None]:
# Index the prices by date on a new frame. `df = main_df` followed by an
# in-place set_index would also change main_df, and the cell would then fail
# with a KeyError on 'date' when run a second time.
df = main_df.set_index('date')
# Correlate the day-over-day percentage changes rather than the raw prices.
df_corr = df.pct_change().corr()
df_corr.head()

In [None]:
# Relabel the correlation matrix with node ids: the row position of each
# ticker in the `sp` table (the table that is written to nodes.csv).
# `sp` must already be loaded; in this notebook that happens in the last cell.
names = sp['Symbol'].tolist()
node_ids = {}
for position, symbol in enumerate(names):
    # Keep the first position of a symbol, as names.index(symbol) would.
    node_ids.setdefault(symbol, position)

# Drop tickers that have no row in `sp`; otherwise the list of ids would be
# shorter than the matrix and the label assignment below would raise.
known = [ticker for ticker in df_corr.index if ticker in node_ids]
df_corr = df_corr.loc[known, known]

a = [node_ids[ticker] for ticker in known]
df_corr.index = a
df_corr.columns = a

#### extract pairs of correlated companies

In [None]:
# Flatten the square matrix into a Series keyed by (column label, row label).
s = df_corr.unstack()
# Order the pairs by correlation value, ascending.
so = s.sort_values(kind="quicksort")

In [None]:
so

In [None]:
# Keep each unordered pair once and drop the diagonal by requiring
# first label < second label. Taking every second row of the sorted series
# only works while the two mirrored entries of a pair are adjacent, which
# ties between different pairs or a diagonal value that is not exactly 1.0
# would break.
first = so.index.get_level_values(0)
second = so.index.get_level_values(1)
# Only strongly correlated (or anti-correlated) pairs become edges.
so = so[(so.abs() > 0.5) & (first < second)]

In [None]:
# Build the edge table in one step from the pair index and the values.
# Filling the columns with `edges['Source'][i] = ...` is chained assignment
# and may write to a temporary copy, leaving the columns empty.
edges = pd.DataFrame({
    'Source': so.index.get_level_values(0),
    'Target': so.index.get_level_values(1),
    'Weight': so.to_numpy(),
})
# The running row number is written as the 'Index' column of the CSV.
edges.index.name = 'Index'
edges.to_csv('edges1.csv')

In [None]:
edges

In [None]:
# Node table: one row per company; the row number written by to_csv is the
# node id. Read relative to the working directory, like get_tickers_list(),
# instead of from a user-specific absolute path.
sp = pd.read_csv('sp500.csv')
# The column set comes from the scraped page, so ignore names that are absent
# rather than failing with a KeyError.
sp.drop(
    columns=['SEC filings', 'Headquarters Location', 'CIK', 'Founded',
             'Date first added'],
    errors='ignore',
    inplace=True,
)
sp.to_csv('nodes.csv')