# Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import seaborn as sns
import yfinance as yf
from numpy import linalg as LA
from sklearn.cluster import KMeans, AgglomerativeClustering
from statsmodels.tsa.stattools import coint
from itertools import combinations
import matplotlib.dates as mdates
from datetime import datetime
import random
import matplotlib.gridspec as gridspec
from collections import Counter
from pprint import pprint
import matplotlib.colors as mcolors
import matplotlib.collections as mcollections
from scipy.cluster.hierarchy import dendrogram, cut_tree
from ISLP.cluster import compute_linkage

In [2]:
# Absolute locations of the market-cap and price CSV files.
cap_path = '/Users/tuckeringlefield/Desktop/Data_Science/Math_4920/Stocks_Data/cap_data_from_shardar.csv'
price_path = "/Users/tuckeringlefield/Desktop/Data_Science/Math_4920/Stocks_Data/price_data_from_shardar.csv"

# Load both tables, using their 'date' column as the row index.
prices_df = pd.read_csv(price_path, index_col='date')
caps_df = pd.read_csv(cap_path, index_col='date')

# Run each index through pd.to_datetime so both frames carry a datetime index.
for _frame in (prices_df, caps_df):
    _frame.index = pd.to_datetime(_frame.index)

In [3]:
# Keep roughly the first half of the rows as the training data.
num_rows = len(prices_df)
print(f'Original Length: {num_rows}')

# Position of the cut. np.round sends exact .5 ties to the nearest even integer.
split_row = np.round(num_rows / 2).astype(int)
train_df = prices_df[:split_row]
# NOTE(review): the cut is sized from prices_df; this assumes caps_df has the
# same rows in the same order -- confirm.
caps_df = caps_df[:split_row]

# From here on num_rows refers to the training frame.
num_rows = len(train_df)
print(f'Train Length: {num_rows}')

Original Length: 5787
Train Length: 2894


In [4]:
# Identify the stocks (columns of train_df) that contain no null values.
complete_stock_list = train_df.columns

# Count the nulls of every column in a single vectorised pass instead of
# looping over the columns one at a time, then keep the names whose count
# is zero. The result preserves the original column order.
null_counts = train_df.isnull().sum()
non_null_stocks = null_counts[null_counts == 0].index.tolist()

print(len(complete_stock_list))
print(len(non_null_stocks))

10810
2467


# Sectors

In [5]:
# Starts empty; the following cell fills it with one symbol list per sector.
Market_dict = {}

# Read the sector listing and keep only the symbol and sector columns.
file_path = '/Users/tuckeringlefield/Desktop/Data_Science/Math_4920/Stocks_Data/Nasdaq_sectors.csv'
cols_to_keep = ['Symbol', 'Sector']
nasdaq_sectors = pd.read_csv(file_path).loc[:, cols_to_keep]

In [6]:
# Fill Market_dict: sector name -> symbols listed under that sector that also
# appear in complete_stock_list.
for Sector in nasdaq_sectors['Sector'].unique().tolist():
    if pd.isna(Sector):
        continue  # rows with a missing sector are not assigned to any key
    temp_df = nasdaq_sectors[nasdaq_sectors['Sector'] == Sector]
    Market_dict[Sector] = [
        stk
        for stk in temp_df['Symbol'].unique().tolist()
        if stk in complete_stock_list
    ]

In [7]:
# Print each sector name followed by the number of symbols stored for it.
for Sector, sector_symbols in Market_dict.items():
    print(Sector)
    print(len(sector_symbols))

Industrials
343
Finance
430
Real Estate
101
Health Care
679
Consumer Discretionary
630
Technology
414
Basic Materials
19
Consumer Staples
75
Energy
100
Miscellaneous
24
Utilities
73
Telecommunications
35


In [8]:
# Snapshot of the complete sector -> symbols mapping. The dict and every
# symbol list are copied, because a plain assignment would only create a
# second name for the same object and any later in-place change to
# Market_dict would silently change this "full" version as well.
full_market_dict = {sector: list(symbols) for sector, symbols in Market_dict.items()}

# Functions

### Clustering

In [9]:
def get_corr_matrix(DataFrame):
    """Return the pairwise correlation matrix of the columns of ``DataFrame``.

    Thin wrapper around ``DataFrame.corr()`` called with its default arguments.
    """
    correlations = DataFrame.corr()
    return correlations

In [10]:
def K_mean_clustering(DataFrame, num_clusters, n_init, random_state=None):
    """Cluster the columns of ``DataFrame`` with k-means on their correlations.

    The correlation matrix of the columns is computed and each of its rows is
    used as the feature vector of the corresponding column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Data whose columns (the stocks) are to be clustered.
    num_clusters : int
        Forwarded to ``KMeans`` as ``n_clusters``.
    n_init : int or str
        Forwarded to ``KMeans`` as ``n_init``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to get reproducible assignments.

    Returns
    -------
    dict
        Maps each cluster label to the list of column names assigned to it.

    Notes
    -----
    Null values only trigger a printed warning here; nothing is dropped or
    imputed before the data is handed to ``KMeans``.
    """
    if DataFrame.isnull().values.any():
        print('Warning: Null/NaN values found in clustering data')
    X = get_corr_matrix(DataFrame)
    if X.isnull().values.any():
        print('Warning: Null/NaN values found in correlation data')
    kmeans = KMeans(
        n_clusters=num_clusters, n_init=n_init, random_state=random_state
    ).fit(X)
    # Read the names from the matrix that was actually clustered, so label i
    # always belongs to name i even if corr() returned fewer columns than the
    # input frame has.
    stock_names = X.columns
    cluster_dict = {}
    for cluster_number, stock_name in zip(kmeans.labels_, stock_names):
        cluster_dict.setdefault(cluster_number, []).append(stock_name)
    return cluster_dict

In [11]:
def spectral_clustering(df, num_clusters, random_state=None):
    """Cluster the columns of ``df`` by spectral clustering of their correlations.

    The absolute correlation matrix ``A`` is used as a weighted adjacency
    matrix, the unnormalised graph Laplacian ``L = D - A`` is formed (``D`` is
    the diagonal matrix of row sums of ``A``), every column is embedded with
    the eigenvectors belonging to the ``num_clusters`` smallest eigenvalues of
    ``L``, and k-means (``n_init=20``) is run on that embedding.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns (the stocks) are to be clustered.
    num_clusters : int
        Number of eigenvectors kept and number of k-means clusters.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to get reproducible assignments.

    Returns
    -------
    dict
        Maps each cluster label to the list of column names assigned to it.
    """
    corr = df.corr()
    A = np.abs(corr.values)
    D = np.diag(A.sum(axis=1))
    L = D - A
    # L is real and symmetric, so use eigh: it returns real eigenvectors with
    # the eigenvalues in ascending order. eig gives no ordering guarantee, so
    # slicing its leading columns does not reliably select the eigenvectors of
    # the smallest eigenvalues that the embedding needs.
    _, eigenvectors = LA.eigh(L)
    X = eigenvectors[:, :num_clusters]
    kmeans = KMeans(
        n_clusters=num_clusters, n_init=20, random_state=random_state
    ).fit(X)
    cluster_dict = {}
    # Names come from the correlation matrix so they line up with its rows.
    for cluster_number, stock_name in zip(kmeans.labels_, corr.columns):
        cluster_dict.setdefault(cluster_number, []).append(stock_name)
    return cluster_dict

In [12]:
def Hierarchical_clustering(df, num_clusters):
    """Cluster the columns of ``df`` by average-linkage agglomerative clustering.

    The correlation matrix ``C`` of the columns is converted into the distance
    matrix ``sqrt((1 - C) / 2)`` and that matrix is given to
    ``AgglomerativeClustering`` as precomputed pairwise distances.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns (the stocks) are to be clustered.
    num_clusters : int
        Forwarded to ``AgglomerativeClustering`` as ``n_clusters``.

    Returns
    -------
    dict
        Maps each cluster label to the list of column names assigned to it.
    """
    corr = get_corr_matrix(df)
    # Correlation distance. The clip keeps a correlation that exceeds 1 by
    # floating-point round-off from producing NaN under the square root.
    X = ((1 - corr) / 2.).clip(lower=0) ** .5
    # metric='precomputed' makes the estimator treat X as pairwise distances.
    # With the default Euclidean metric each row of the distance matrix would
    # be treated as a feature vector and the distances themselves never used.
    HC_clustering = AgglomerativeClustering(
        n_clusters=num_clusters, metric='precomputed', linkage='average'
    )
    labels = HC_clustering.fit(X).labels_
    cluster_dict = {}
    for cluster, member in zip(labels, X.columns.tolist()):
        cluster_dict.setdefault(cluster, []).append(member)
    return cluster_dict

### Trading

### Analysis

# Testing