In [1]:
import os
import pandas as pd
import numpy as np
from communities.algorithms import louvain_method
import time
import yfinance as yf
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.linear_model import LinearRegression
import threading

# get correlation matrix file names
data_folder = "Data/Corr_Mat"
output_folder = "Data/Ticker_List"

## Download Data

In [2]:
def download_stock_pool_data():
    # get data directory path
    cur_path = os.path.dirname(__file__)
    data_directory_path = os.path.relpath('../Data', cur_path)

    # get latest russell 1000 constituents
    russell1000_info = pd.read_excel(io=data_directory_path + "\\Russell_1000_Constituents_20221007.xlsx",
                                     sheet_name="Holdings", skiprows=range(7))
    # get list of tickers
    stocks_pool_list = list(russell1000_info.Ticker.values)
    # get historical market data of current Russel 1000 constituents
    stocks_pool_data = yf.Tickers(stocks_pool_list).history(start="2012-01-01")["Close"]

    # get historical market data of S&P500
    sp500_data = pd.DataFrame(yf.Ticker("^GSPC").history(start="2012-01-01")["Close"])
    # rename S&P 500 data column
    sp500_data.columns = ["SP500"]

    # merge two dataframes
    raw_data = stocks_pool_data.join(sp500_data)

    # drop stocks with more than 1000 NaNs
    raw_data = raw_data.dropna(axis="columns", thresh=2500)

    # save stock pool data into csv
    raw_data.to_csv(data_directory_path + "\\Raw_Data_20221007.csv")

# # get stock pool data from yahoo finance
# download_stock_pool_data()

## Generate Correlation Matrices

In [3]:
def generate_residual_matrices():
    # get data directory path
    cur_path = os.path.dirname(__file__)
    data_directory_path = os.path.relpath('../Data', cur_path)
    # read data from file
    raw_data = pd.read_csv(data_directory_path + "\\Raw_Data_20221007.csv", index_col=0)

    # set rebalancing frequency: every month
    rebalance_freq_period = relativedelta(months=1)
    # set business day convention for rebalancing
    business_day_convention = "Modified Following"
    # training set length
    train_set_length_period = relativedelta(months=6)
    # set date range
    first_date = datetime.strptime(raw_data.index[0], "%Y-%m-%d")
    last_date = datetime.strptime(raw_data.index[-1], "%Y-%m-%d")

    # initialize date range
    train_start_date = first_date
    train_end_date = train_start_date + train_set_length_period - relativedelta(days=1)

    test_start_date = train_end_date + relativedelta(days=1)
    test_end_date = test_start_date + rebalance_freq_period

    # traverse the data set
    while test_end_date < last_date:
        # do regression
        temp_train_data = raw_data.loc[train_start_date.__str__()[:10]:test_start_date.__str__()[:10], :]
        # save the residuals
        temp_residuals = pd.DataFrame(index=temp_train_data.index, columns=temp_train_data.columns.drop("SP500"))
        for ticker in temp_residuals.columns:
            # get data
            y_x = temp_train_data[[ticker, "SP500"]]
            # drop nas
            y_x = y_x.dropna(axis="index", how="any")
            # rename columns
            y_x.columns = ["y", "x"]
            # whether there's sufficient trading days
            if len(y_x.index) < len(temp_train_data) * 0.9:
                continue
            else:
                # calculate returns
                y_x = np.log(y_x).diff().dropna(axis="index", how="any")
                y = np.array(y_x["y"])
                x = np.array(y_x["x"]).reshape(-1, 1)
                # do regression
                reg = LinearRegression(fit_intercept=True).fit(x, y)
                # calculate residual
                y_x.loc[:, "res"] = np.subtract(y, (reg.intercept_ - reg.coef_[0] * x)[:, 0])
                # add residual to temp_residuals
                temp_residuals.loc[y_x.index, ticker] = y_x.res
        # drop nans in dataframe
        temp_residuals = temp_residuals.dropna(axis="index", how="all")
        temp_residuals = temp_residuals.dropna(axis="columns", how="any")
        # calculate correlations
        temp_residuals = temp_residuals.astype(float)
        temp_corr_matrix = temp_residuals.corr()

        # save the matrix to file
        temp_corr_matrix.to_csv(data_directory_path + "\\Corr_Mat\\" + train_start_date.__str__()[:10] + ".csv")

        # update dates
        train_start_date += rebalance_freq_period
        test_start_date += rebalance_freq_period
        test_end_date += rebalance_freq_period

# # calculate correlation between residuals of stocks during 6M period
# generate_residual_matrices()

## Split Graph Into Communities

In [4]:
def get_communities(file_name:str, input_folder:str = data_folder):
    print(file_name)
    # get correlation matrix
    corr_mat = pd.read_csv(input_folder+"/"+file_name, index_col=0)
    # convert to adjacency matrix
    adj_mat = np.abs(corr_mat - np.diag(np.diag(corr_mat)))
    # using Louvain method to split the graph into 20 communities
    communities, _ = louvain_method(adj_mat.values, 20)
    # get ticker list of each community
    cluster_list = []
    for community in communities:
        cluster = list(community)
        cluster_list.append(adj_mat.columns[cluster])
    # save the list to txt
    global output_folder
    with open(output_folder + "/"+file_name.replace(".csv",".txt"), 'w') as f:
        for cluster in cluster_list:
            f.write(str(list(cluster)) + "\n")

In [5]:
class MyThread(threading.Thread):

    def __init__(self, func, arg):
        super().__init__()
        self.func = func
        self.arg = arg

    def run(self):
        self.func(*self.arg)

In [6]:
def get_all_communities(file_list:list[str] = os.listdir(data_folder), input_folder:str = data_folder, thread_num:int = 8):
    # create 8 threads
    for i in range(0, len(file_list), thread_num):
        thread_list = []
        print("Iteration {} start, current time = {}".format(i//8+1, datetime.now()))
        for j in range(thread_num):
            file_name = file_list[i + j]
            thread = MyThread(get_communities, (file_name, input_folder))
            thread.start()
            thread_list.append(thread)
        for thread in thread_list:
            thread.join()

# generate clusters of tickers and save them to txt file
file_name_list = []
for file in os.listdir(data_folder):
    if file > "2013-11-03.csv":
        file_name_list.append(file)

# file_name_list = os.listdir(data_folder)
get_all_communities(file_name_list, data_folder, 8)

Iteration 1 start, current time = 2022-10-14 03:25:55.570050
2013-12-03.csv
2014-01-03.csv
2014-02-03.csv
2014-03-03.csv
2014-04-03.csv
2014-05-03.csv
2014-06-03.csv
2014-07-03.csv
Iteration 2 start, current time = 2022-10-14 05:34:21.051339
2014-08-03.csv2014-09-03.csv

2014-10-03.csv
2014-11-03.csv
2014-12-03.csv
2015-01-03.csv
2015-02-03.csv
2015-03-03.csv
Iteration 3 start, current time = 2022-10-14 07:44:36.677298
2015-04-03.csv
2015-05-03.csv
2015-06-03.csv
2015-07-03.csv
2015-08-03.csv
2015-09-03.csv
2015-10-03.csv
2015-11-03.csv
Iteration 4 start, current time = 2022-10-14 10:00:09.037116
2015-12-03.csv
2016-01-03.csv
2016-02-03.csv2016-03-03.csv

2016-04-03.csv
2016-05-03.csv2016-06-03.csv

2016-07-03.csv
Iteration 5 start, current time = 2022-10-14 12:44:14.068682
2016-08-03.csv
2016-09-03.csv
2016-10-03.csv
2016-11-03.csv
2016-12-03.csv
2017-01-03.csv
2017-02-03.csv
2017-03-03.csv
Iteration 6 start, current time = 2022-10-14 15:33:07.077394
2017-04-03.csv
2017-05-03.csv
2017

IndexError: list index out of range

## Get Centroids of Clusters