In [13]:
import pandas as pd
import numpy as np
import platform
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px

from utils import *

In [4]:
# Load data
# The Windows branch uses a raw string: '\d' and '\s' are not valid escape
# sequences, so a plain literal triggers an invalid-escape warning on current
# Python versions. The resulting path text is unchanged.
if platform.system() == 'Windows':
    path = r'.\data\stock_port.csv'
else:
    path = './data/stock_port.csv'
df = load_data(path)

# Convert df to sparse matrix
# NOTE(review): row_ind_dict / col_ind_dict presumably map labels to the
# row / column positions of sp_matrix -- confirm against utils.
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# Basic Info
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]

# Calculate shareholding % by stock_code:
# every row is divided by its own row total (sum over axis 1).
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Calculate shareholding % by shareholder:
# every column is divided by its own column total (sum over axis 0).
sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim)

# Element-wise multiply two matrix
# NOTE(review): `*` is element-wise only for plain ndarrays; for np.matrix or
# scipy sparse operands it is a matrix product -- confirm the type returned by
# convert_data_sparse_matrix.
sp_matrix_stock_shareholder = sp_matrix_stock * sp_matrix_shareholder

Dimension of sparse_matrix is  (2020, 709)


In [19]:
# Cluster the per-stock shareholding rows with DBSCAN.
# The estimator is built first and fitted in place; `fit` returns the same
# estimator object, so `clustering` is the fitted model either way.
clustering = DBSCAN(eps=0.3, min_samples=10)
clustering.fit(sp_matrix_stock)

In [20]:
# Visualize the result
# Project the per-stock shareholding rows to 2-D with t-SNE. The seed is fixed
# so the layout is reproducible across kernel restarts.
X_embedded = TSNE(
    n_components = 2,
    perplexity = 100,
    learning_rate = 200,
    random_state = 42,
).fit_transform(sp_matrix_stock)
df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])

# Invert row_ind_dict once (value -> key) instead of rebuilding the inverted
# dict for every row, then look up each row position of the embedding.
# A missing position still raises KeyError, as before.
ind_to_stock_code = {ind: stock_code for stock_code, ind in row_ind_dict.items()}
df_tsne['stock_code'] = [ind_to_stock_code[pos] for pos in df_tsne.index]

In [21]:
# Attach the DBSCAN cluster label of each row to the embedding frame.
df_tsne['label'] = clustering.labels_

# Scatter of the 2-D embedding, coloured by cluster label; hovering a point
# shows its stock_code.
fig = px.scatter(
    df_tsne,
    x = 'X1',
    y = 'X2',
    hover_name = 'stock_code',
    color = 'label',
)
fig.show()

In [None]:
# Reference video (YouTube):
# https://www.youtube.com/watch?v=h53WMIImUuc

# Publish paper:
# 