In [None]:
import glob
import json
from collections import OrderedDict, defaultdict
from pathlib import Path
from typing import Dict, Optional, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Every archive in data/raw is named "<TICKER>.zip"; the file stem is the ticker symbol.
tickers = sorted(Path(raw_path).stem for raw_path in glob.glob("data/raw/*.zip"))

# Common calendar for all tickers: business days only.
business_days = pd.date_range(start="1999-01-05", end="2021-03-01", freq="B", name="Date")

# NOTE(security): read_pickle can execute arbitrary code stored in the file,
# so only load archives from a trusted source.
# Gaps inside each ticker's own series are forward-filled before alignment.
returns_by_ticker = {
    ticker: pd.read_pickle("data/raw/" + ticker + ".zip")["Log Return"].ffill()  # needs pickle5 compression (python 3.8)
    for ticker in tickers
}

# Build the frame in a single call: each series is aligned to the business-day
# index (dates a ticker does not cover become NaN), and the frame is not
# fragmented by inserting one column at a time.
daily_returns = pd.DataFrame(returns_by_ticker, index=business_days)

daily_returns

In [None]:
# GENERATES A HEATMAP OF CORRELATIONS
# but it's not useful since we have 1846x1846 correlations
# just uncomment everything once to try it out

# vegetables = tickers
# farmers = tickers

# harvest = daily_returns.corr(method="pearson")

# fig, ax = plt.subplots()
# im = ax.imshow(harvest)

# # We want to show all ticks...
# # ax.set_xticks(np.arange(len(farmers)))
# # ax.set_yticks(np.arange(len(vegetables)))
# # ... and label them with the respective list entries
# # ax.set_xticklabels(farmers)
# # ax.set_yticklabels(vegetables)

# # Rotate the tick labels and set their alignment.
# plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
#          rotation_mode="anchor")

# # Loop over data dimensions and create text annotations.
# for i in range(len(vegetables)):
#     for j in range(len(farmers)):
#         text = ax.text(j, i, harvest[i, j],
#                        ha="center", va="center", color="w")

# ax.set_title("Harvest of local farmers (in tons/year)")
# fig.tight_layout()
# plt.show()

In [None]:
# Pairwise Pearson correlation between every pair of ticker columns in daily_returns.
# The result is a square frame indexed by ticker on both axes.
corr = daily_returns.corr(method="pearson")

In [None]:
# Correlation cut-off read by get_top_pairs when it runs in threshold mode.
th = 0.834 #highest threshold for 20 pairs

def add_value(pairs_dict, key, value):
    """Append ``value`` to the list stored under ``key`` in ``pairs_dict``.

    The list is created on first use, so callers never need to initialise a key.
    The dictionary is modified in place; nothing is returned.
    """
    pairs_dict.setdefault(key, []).append(value)

def get_top_pairs(useThreshold, threshold: Optional[float] = None):
    """Collect, for every ticker, its (up to) five most correlated other tickers.

    Reads the module-level ``tickers`` list and ``corr`` correlation matrix.

    Args:
        useThreshold: When falsy, return ``{ticker: {peer: corr, ...}}`` with the
            top five peers of every ticker, in ascending order of correlation.
            When truthy, return ``{ticker: [{peer: corr}, ...]}`` containing only
            the top-five peers whose correlation is strictly above the cut-off;
            tickers with no such peer are omitted.
        threshold: Cut-off used in threshold mode. Defaults to the module-level
            ``th`` when not given.

    Returns:
        The dictionary described above.
    """
    cutoff = None
    if useThreshold:
        cutoff = th if threshold is None else threshold

    pairs_dict = {}
    for ticker in tickers:
        # Drop the ticker's own entry by label and discard NaN correlations
        # before ranking. Slicing the sorted column positionally is unsafe:
        # sort_values puts NaN last, which pushes the self-correlation into
        # the "top" window and lets NaN leak into the result.
        top5 = (
            corr[ticker]
            .drop(labels=ticker, errors="ignore")
            .dropna()
            .sort_values()
            .tail(5)
        )

        # TOP 5 FOR EVERY TICKER
        if not useThreshold:
            pairs_dict[ticker] = top5.to_dict()
            continue

        # JUST ADD THOSE ABOVE THRESHOLD
        for peer, peer_corr in top5[top5 > cutoff].items():
            pairs_dict.setdefault(ticker, []).append({peer: peer_corr})

    return pairs_dict


def top_n_pairs(n: int=5) -> Dict[str, Dict[str, float]]:
    """Return, for every ticker, its ``n`` most correlated other tickers.

    Reads the module-level ``tickers`` list and ``corr`` correlation matrix.
    The signature annotations are evaluated when the function is defined, so
    ``typing.Dict`` must already be imported by the time this cell runs.

    Args:
        n: Maximum number of peers to keep per ticker.

    Returns:
        ``{ticker: {peer: correlation, ...}}`` with peers in ascending order of
        correlation. A ticker with fewer than ``n`` valid correlations gets a
        shorter (possibly empty) inner dictionary.
    """
    pairs_dict: Dict[str, Dict[str, float]] = {}
    for ticker in tickers:
        # Remove the ticker itself by label and ignore NaN correlations before
        # ranking; sort_values places NaN last, so a positional "skip the last
        # row" slice would keep the self-correlation whenever NaN is present.
        ranked = corr[ticker].drop(labels=ticker, errors="ignore").dropna().sort_values()
        pairs_dict[ticker] = ranked.tail(n).to_dict()

    return pairs_dict


In [None]:
# Display the five most correlated peers of every ticker.
top_n_pairs(n=5)

In [None]:
# Top five peers for every ticker, regardless of correlation strength.
top5alltickers = get_top_pairs(useThreshold=False)
# Only the peers whose correlation exceeds the threshold `th`.
top20pairs = get_top_pairs(useThreshold=True)

In [None]:
# SAVES THE JSON TO THE DESIRED PATH
top5alltickersJSON = json.dumps(top5alltickers, sort_keys=False, indent=4, separators=(',', ': '))
save_path = "data/info/corellations.json"
# The context manager flushes and closes the file even if the write fails.
with open(save_path, "w") as json_file:
    json_file.write(top5alltickersJSON)
print(top5alltickersJSON)

In [None]:
# SAVES THE JSON TO THE DESIRED PATH
top20pairsJSON = json.dumps(top20pairs, sort_keys=False, indent=4, separators=(',', ': '))
save_path = "data/info/top20corellations.json"
# The context manager flushes and closes the file even if the write fails.
with open(save_path, "w") as json_file:
    json_file.write(top20pairsJSON)
print(top20pairsJSON)

In [None]:
from typing import Dict, Tuple
def match_tickers(corr_dict: Dict[str, Dict[str, float]], min_corr=0, debug=False,
                  max_iterations: int = 100) -> Dict[Tuple[str, str], float]:
    """Pair tickers one-to-one, preferring the most strongly correlated partners.

    Uses a modified, non-optimal variant of Gale-Shapley: candidate pairs are
    scanned repeatedly; two free tickers are matched immediately, and an existing
    match is broken whenever a candidate pair is strictly better correlated than
    the current matches of BOTH of its tickers.

    Args:
        corr_dict: ``{ticker_a: {ticker_b: correlation, ...}, ...}``.
        min_corr: Pairs with a correlation below this value are ignored.
        debug: When true, print progress for every pass and every re-match.
        max_iterations: Upper bound on the number of passes over the candidates.

    Returns:
        ``{(ticker_x, ticker_y): correlation}`` for every matched pair, with the
        tuple sorted alphabetically. Tickers that could not be matched are absent.
    """
    # iterate over all pairs, and add the SORTED tuple of tickers with their correlations
    # sorting ensures (A, B) and (B, A) will not both be added
    all_pairs: Dict[Tuple[str, str], float] = {}
    for ticker_a, ticker_a_dict in corr_dict.items():
        for ticker_b, a_b_corr in ticker_a_dict.items():
            # A ticker must never be paired with itself. NaN correlations fail
            # the comparison below and are skipped as well.
            if ticker_a != ticker_b and min_corr <= a_b_corr:
                all_pairs[tuple(sorted([ticker_a, ticker_b]))] = a_b_corr
    all_tickers: Set[str] = {ticker for pair in all_pairs for ticker in pair}

    # ticker -> current partner, stored in both directions; a ticker is unmatched
    # exactly when it has no entry here.
    matches: Dict[str, str] = {}

    def current_pair(ticker: str) -> Tuple[str, str]:
        """Sorted (ticker, partner) key; the partner is "" while unmatched."""
        return tuple(sorted([ticker, matches.get(ticker, "")]))

    iteration = 0
    progressed = True
    # A pass that changes nothing would be repeated verbatim by every later
    # pass, so stop as soon as that happens instead of exhausting the cap.
    while progressed and len(matches) < len(all_tickers) and iteration < max_iterations:
        if debug:
            print(f"Iteration: {iteration}, number matched: {len(matches)}")
        progressed = False
        for (ticker_a, ticker_b), a_b_corr in all_pairs.items():
            if ticker_a in matches or ticker_b in matches:
                a_pair = current_pair(ticker_a)
                b_pair = current_pair(ticker_b)
                # an unmatched ticker has no pair entry, so it counts as 0.0
                a_corr = all_pairs.get(a_pair, 0.0)
                b_corr = all_pairs.get(b_pair, 0.0)
                # only re-match if a and b are better correlated than their current matches
                if not (a_corr < a_b_corr and b_corr < a_b_corr):
                    continue
                if debug:
                    print(f"pair: {(ticker_a, ticker_b)} better than {a_pair} and {b_pair}")
                # release the partners that are about to be displaced
                for ticker in (ticker_a, ticker_b):
                    displaced = matches.get(ticker)
                    if displaced is not None:
                        del matches[displaced]

            # match ticker_a and ticker_b
            matches[ticker_a] = ticker_b
            matches[ticker_b] = ticker_a
            progressed = True
        iteration += 1

    # Each match is stored in both directions; the sorted key collapses them.
    matched_pairs: Dict[Tuple[str, str], float] = {}
    for ticker, partner in matches.items():
        pair = tuple(sorted([ticker, partner]))
        matched_pairs[pair] = all_pairs.get(pair)
    return matched_pairs

In [None]:
matched_tickers = match_tickers(top5alltickers, min_corr=0.75)
# tuple to tuple string representation for keys, save as JSON
# (JSON object keys must be strings, so each tuple key is stringified)
matched_tickers_JSON = json.dumps(
    {str(k): v for k,v in matched_tickers.items()}, 
    sort_keys=False, indent=4, separators=(',', ': '))
save_path = "data/info/pairs.json"
# The context manager flushes and closes the file even if the write fails.
with open(save_path, "w") as json_file:
    json_file.write(matched_tickers_JSON)
print(matched_tickers)