In [1]:
import pandas as pd
import numpy as np

from utils import *

In [2]:
# Load data
# Forward slashes avoid the invalid "\d" / "\s" escape sequences of the old
# literal and resolve on Windows as well as POSIX systems.
path = './data/stock_port.csv'
df = load_data(path)

In [None]:
# Convert df to sparse matrix
# convert_data_sparse_matrix is not defined in this notebook (presumably it
# comes from `from utils import *`). It hands back three values: the matrix
# and what look like row-label / column-label index lookups -- TODO confirm
# their exact types and orientation against utils.
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

In [None]:
# Report the matrix shape and keep its first two dimensions for later cells
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim, col_dim = sp_matrix.shape[:2]

In [67]:
# Calculate shareholding % by stock_code
# Every entry is divided by the total of its column (sum over axis 0, reshaped
# to a single row of col_dim values so it lines up with the columns) and then
# scaled by 100.
# NOTE(review): this is "by stock_code" only if the columns of sp_matrix are
# stock codes -- confirm against convert_data_sparse_matrix.
# NOTE(review): relies on col_dim still being sp_matrix.shape[1]; a later cell
# rebinds col_dim. A column whose total is 0 would be divided by zero.
sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim) * 100

array([[0.24146842, 0.03759383, 0.00995955, ..., 0.00031838, 0.01668916,
        0.00159974],
       [0.24146842, 0.03759383, 0.00995955, ..., 0.00031838, 0.01668916,
        0.00159974],
       [0.24146842, 0.03759383, 0.00995955, ..., 0.00031838, 0.01668916,
        0.00159974],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [30]:
# Re-read the raw CSV and turn each stock code into a 5-character string:
# left-padded with zeros, keeping only the last five characters.
df = pd.read_csv('stock_port.csv')
df['stock_code'] = df['stock_code'].map(lambda code: str(code).rjust(5, '0')[-5:])

In [31]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,name_of_ccass_participant,stock_code,shareholding
0,THE HONGKONG AND SHANGHAI BANKING,5,2622440839
1,CHINA SECURITIES DEPOSITORY AND CLEARING,5,1864613717
2,BANK OF CHINA (HONG KONG) LTD,5,1003610032
3,CITIBANK N.A.,5,661483817
4,HANG SENG BANK LTD,5,541534277


In [32]:
# List the distinct CCASS participant names
df['name_of_ccass_participant'].unique()

array(['THE HONGKONG AND SHANGHAI BANKING',
       'CHINA SECURITIES DEPOSITORY AND CLEARING',
       'BANK OF CHINA (HONG KONG) LTD', 'CITIBANK N.A.',
       'HANG SENG BANK LTD', 'STANDARD CHARTERED BANK (HONG KONG) LTD',
       'HANG SENG SECURITIES LTD', 'UBS SECURITIES HONG KONG LTD',
       'BOCI SECURITIES LTD', 'JPMORGAN CHASE BANK, NATIONAL',
       'HSBC BROKING SECURITIES (HONG KONG) LTD',
       'CMB WING LUNG BANK LTD', 'THE BANK OF EAST ASIA LTD',
       'SHANGHAI COMMERCIAL BANK LTD',
       'BANK OF COMMUNICATIONS TRUSTEE LTD', 'DBS BANK (HONG KONG) LTD',
       'DEUTSCHE BANK AG', 'BNP PARIBAS SECURITIES SERVICES',
       'NANYANG COMMERCIAL BANK LTD', 'OCBC WING HANG BANK LTD',
       'UOB KAY HIAN (HONG KONG) LTD', 'EAST ASIA SECURITIES CO LTD',
       'GOLDMAN SACHS (ASIA) SECURITIES LTD', 'DAH SING SECURITIES LTD',
       'ICBC (ASIA) SECURITIES LTD', 'CHONG HING SECURITIES LTD',
       'CHINA CITIC BANK INTERNATIONAL LTD',
       'CHIYU BANKING CORPORATION LTD', '

In [33]:
# Size a dense, zero-filled matrix: one row per distinct stock code and
# one column per distinct CCASS participant name
row_dim, col_dim = (
    len(df['stock_code'].unique()),
    len(df['name_of_ccass_participant'].unique()),
)

print('Row dimension: ', row_dim)
print('Column dimension: ', col_dim)

sparse_matrix = np.zeros(shape=(row_dim, col_dim))


Row dimension:  2020
Column dimension:  709


In [37]:
# Build label -> position lookups; positions follow the sorted label order.
# One lookup is keyed by stock_code, the other by CCASS participant name.

stock_code_ind = {
    code: position
    for position, code in enumerate(sorted(df['stock_code'].unique().tolist()))
}
shareholder_ind = {
    participant: position
    for position, participant in enumerate(
        sorted(df['name_of_ccass_participant'].unique().tolist())
    )
}

In [38]:
# Translate each label into its matrix position via the lookup dicts.
# Both dicts were built from these same columns, so every value has a key.
df['stock_code_ind'] = df['stock_code'].map(stock_code_ind)
df['shareholder_ind'] = df['name_of_ccass_participant'].map(shareholder_ind)

In [39]:
# Display the frame to inspect the stock_code_ind / shareholder_ind columns
df

Unnamed: 0,name_of_ccass_participant,stock_code,shareholding,stock_code_ind,shareholder_ind
0,THE HONGKONG AND SHANGHAI BANKING,00005,2622440839,4,600
1,CHINA SECURITIES DEPOSITORY AND CLEARING,00005,1864613717,4,106
2,BANK OF CHINA (HONG KONG) LTD,00005,1003610032,4,27
3,CITIBANK N.A.,00005,661483817,4,133
4,HANG SENG BANK LTD,00005,541534277,4,273
...,...,...,...,...,...
375745,CITIC SECURITIES BROKERAGE (HK) LTD,04612,100,2018,134
375746,BANK OF COMMUNICATIONS TRUSTEE LTD,04612,40,2018,28
375747,CMB WING LUNG BANK LTD,04612,40,2018,138
375748,BOCI SECURITIES LTD,04612,30,2018,41


In [46]:

# Fill the dense matrix: add every row's shareholding at position
# [stock_code_ind, shareholder_ind].
# Reset first so that re-running this cell does not double-count holdings.
sparse_matrix.fill(0)

# np.add.at is unbuffered, so repeated (stock, participant) pairs are summed
# just like a row-by-row `+=`, without a Python-level loop over every row.
# Reading the index columns straight from df also leaves the stock_code_ind /
# shareholder_ind lookup dicts intact; reusing those names as loop variables
# would rebind them to plain integers.
np.add.at(
    sparse_matrix,
    (df['stock_code_ind'].to_numpy(), df['shareholder_ind'].to_numpy()),
    df['shareholding'].to_numpy(),
)
    

8730059064570.0