In [1]:
import pandas as pd
import numpy as np

def convert_data_sparse_matrix(df, row_label = 'stock_code', col_label = 'name_of_ccass_participant', value_label = 'shareholding'):
    """
        Pivot table
    """
    try:
        # Prepare zero matrix
        row_dim = len(df[row_label].unique())
        col_dim = len(df[col_label].unique())
        sparse_matrix = np.zeros((row_dim, col_dim))

        # Prepare label to index dictionaries
        row_ind_dict = {label: ind for ind, label in enumerate(sorted(df[row_label].unique().tolist()))}
        col_ind_dict = {label: ind for ind, label in enumerate(sorted(df[col_label].unique().tolist()))}

        # Transform row_label column and col_label column to index
        df['row_ind'] = df[row_label].apply(lambda x: row_ind_dict[x])
        df['col_ind'] = df[col_label].apply(lambda x: col_ind_dict[x])

        for ind, row in df.iterrows():
            # Get index and shareholding
            row_ind = row['row_ind']
            col_ind = row['col_ind']
            value = row[value_label]
            
            # Assign to sparse matrix
            sparse_matrix[row_ind, col_ind] += value

        return sparse_matrix, row_ind_dict, col_ind_dict
    except Exception as e:
        print(e)
        return None

def load_data(data_path):

    # Read csv files
    df = pd.read_csv(data_path)

    # Convert stock code to formatted string
    df['stock_code'] = df['stock_code'].apply(lambda x: ('00000' + str(x))[-5:])

    return df

def f_score(y_truth, y_pred, beta = 1):
    
    try:
        # Run confusion_matrix
        tn, fp, fn, tp = confusion_matrix(y_truth, y_pred).ravel()
        
        precision_value = precision(tp, fp)
        recall_value = recall(tp, fn)
        # print recall
        print('True positive: {}, True Negative: {}, False Positive: {}, False Negative: {}'.format(tp, tn, fp, fn))
        print('Precision is ', format(precision_value * 100, '.2f'), '%')
        print('Recall is ', format(recall_value * 100, '.2f'), '%')
        
        return (1 + beta**2) * (precision_value * recall_value) / ((beta**2 * precision_value + recall_value))
    except Exception as e:
        print(e)
        return None

def precision(tp, fp):
    return tp / (tp + fp)

def recall(tp, fn):
    return tp / (tp + fn)

def get_truth_label(path, threshold = 0.3):
    # Load dataset
    df = pd.read_csv(path)

    # preprocess the data in order to get a proper data structure
    df = df.set_index('Unnamed: 0').transpose().dropna()
    df = df.reset_index()
    df['index'] = df['index'].apply(lambda x: retrieve_stock_code(x))
    df = df.set_index('index')

    # Define col_dim and empty dataframe
    col_dim = len(df.columns)
    temp = pd.DataFrame()

    # Create a list of column name without the first element
    first_dim = df.columns[0]
    col_list = df.columns.to_list()
    col_list.remove(first_dim)

    for col in col_list:
        # Assign the col to second_dim, as current date
        second_dim = col

        # Calculate the daily % change of stock price
        temp[col] = (df[second_dim] - df[first_dim]) / df[first_dim]

        # Assign the col to first dim, as previous date
        first_dim = col

    result = np.sum(temp > threshold, axis = 1)

    return {stock_code:1 if count > 0 else 0 for stock_code, count in result.items()}

def retrieve_stock_code(x):
    d = re.search('[0-9]*', x)
    if d:
        return ('00000' + d.group(0))[-5:]
    else:
        return None

In [3]:

import sys
import platform
import pandas as pd

# Define data_path
if platform.system() == 'Windows':
    data_path = '.\data\stock_port.csv'
else:
    data_path = '/Users/CliffordMan/Downloads/project/stock_port.csv'

def main():

    # load data
    df = load_data(data_path)

    # Convert df to sparse_matrix
    sparse_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

    print('Total shareholding: ', sparse_matrix.sum())

if __name__ == '__main__':
    main()

Total shareholding:  4416219264060.0


In [3]:
import pandas as pd
import numpy as np
import platform
from sklearn.manifold import TSNE


# Load data
if platform.system() == 'Windows':
    path = '.\data\stock_port.csv'
else:
    path = './data/stock_port.csv'
df = load_data(path)

# Convert df to sparse matrix
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# Basic Info
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]

# Calculate shareholding % by stock_code
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Calculate shareholding % by shareholder
# sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)
sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim)

# Element-wise multiply two matrix
sp_matrix_stock_shareholder = sp_matrix_stock * sp_matrix_shareholder

sp_matrix
row_ind_dict
col_ind_dict
# Apply TSNE to sp_matrix_stock_shareholder
X_embedded = TSNE(n_components = 2, perplexity = 1000, learning_rate = 20000).fit_transform(sp_matrix_stock)
# X_embedded = TSNE(n_components = 2, perplexity = 100, learning_rate = 200).fit_transform(sp_matrix_shareholder)
# X_embedded = TSNE(n_components = 2, perplexity = 100, learning_rate = 200).fit_transform(sp_matrix_stock_shareholder)
X_embedded

df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])
df_tsne['stock_code'] = df_tsne.reset_index()['index'].apply(lambda x: {j:i for i,j in row_ind_dict.items()}[x])

import plotly.express as px

fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code')
fig.show()

# TSNE
# skit-learn: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# PAPER https://towardsdatascience.com/t-distributed-stochastic-neighbor-embedding-t-sne-bb60ff109561


Dimension of sparse_matrix is  (2020, 709)


KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
import platform
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px

# Load data
if platform.system() == 'Windows':
    path = '.\data\stock_port.csv'
else:
    path = './data/stock_port.csv'
df = load_data(path)

# Convert df to sparse matrix
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# Basic Info
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]

# Calculate shareholding % by stock_code
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Calculate shareholding % by shareholder
# sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)
sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim)

# Element-wise multiply two matrix
sp_matrix_stock_shareholder = sp_matrix_stock * sp_matrix_shareholder

# Apply DBSCAN

# Apply DBSCAN
clustering = DBSCAN(eps=0.5, min_samples=1000).fit(sp_matrix_stock)
clustering.labels_

# Visualize the result
X_embedded = TSNE(n_components = 2, perplexity = 100, learning_rate = 200).fit_transform(sp_matrix_stock)
df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])
df_tsne['stock_code'] = df_tsne.reset_index()['index'].apply(lambda x: {j:i for i,j in row_ind_dict.items()}[x])
df_tsne['label'] = clustering.labels_

fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

# Predictive Algorithm
def cluster_predict(label, min_pts = 'auto'):
    """
        Input: an array of clsutered label for each instance
        return: an array of anomal label for each instance
    """
    try:
        # Get Unqiue label and its counts
        (unique, counts) = np.unique(label, return_counts = True)
        print('Unique Labels: ', unique)
        print('Count of Unique Labels', counts)
    
        # Define minimum points that it should have in a cluster, if auto, it will take the min count
        if min_pts == 'auto':
            min_pts = min(counts)
            print('Minimum points of a cluster among the clusters: ', min_pts)
        else:
            min_pts = int(min_pts)

        # Prepare label_dict for mapping
        label_dict = {label: 0 if count > min_pts else 1 for label, count in zip(unique, counts)}

        # Map label_dict to label
        return np.array([label_dict[i] for i in label])
    except Exception as e:
        print(e)
        return None

cluster_predict(clustering.labels_)

# youtube:
# https://www.youtube.com/watch?v=h53WMIImUuc

# Publish paper:
# Original Paper
# https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf

Dimension of sparse_matrix is  (2020, 709)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Unique Labels:  [-1  0]
Count of Unique Labels [ 340 1680]
Minimum points of a cluster among the clusters:  340


array([0, 0, 0, ..., 0, 0, 0])

In [12]:
import pandas as pd
import numpy as np
import platform
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
import plotly.express as px



# Load data
if platform.system() == 'Windows':
    path = '.\data\stock_port.csv'
else:
    path = '/Users/CliffordMan/Downloads/project/stock_port.csv'
df = load_data(path)

# Convert df to sparse matrix
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# Basic Info
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]

# Calculate shareholding % by stock_code
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Calculate shareholding % by shareholder
# sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)
sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim)

# Element-wise multiply two matrix
sp_matrix_stock_shareholder = sp_matrix_stock * sp_matrix_shareholder

clf = IsolationForest(n_estimators=150, random_state=0).fit(sp_matrix_stock)
label = clf.predict(sp_matrix_stock)

# Visualize the result
X_embedded = TSNE(n_components = 2, perplexity = 100, learning_rate = 200).fit_transform(sp_matrix_stock)
df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])
df_tsne['stock_code'] = df_tsne.reset_index()['index'].apply(lambda x: {j:i for i,j in row_ind_dict.items()}[x])

df_tsne['label'] = label
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

# Sklearn documentation
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

# Original Paper
# https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/tkdd11.pdf

Dimension of sparse_matrix is  (2020, 709)


In [None]:
import pandas as pd
import numpy as np
import platform
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import plotly.express as px



# Load data
if platform.system() == 'Windows':
    path = '.\data\stock_port.csv'
else:
    path = '/Users/CliffordMan/Downloads/project/stock_port.csv'
df = load_data(path)

# Convert df to sparse matrix
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# Basic Info
print('Dimension of sparse_matrix is ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]

# Calculate shareholding % by stock_code
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Calculate shareholding % by shareholder
# sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)
sp_matrix_shareholder = sp_matrix / np.sum(sp_matrix, axis = 0).reshape(-1, col_dim)

# Element-wise multiply two matrix
sp_matrix_stock_shareholder = sp_matrix_stock * sp_matrix_shareholder

# Apply K-Mean
kmeans = KMeans(n_clusters=2, random_state=0).fit(sp_matrix_stock)

kmeans.labels_

# Visualize the result
X_embedded = TSNE(n_components = 2, perplexity = 100, learning_rate = 200).fit_transform(sp_matrix_stock)
df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])
df_tsne['stock_code'] = df_tsne.reset_index()['index'].apply(lambda x: {j:i for i,j in row_ind_dict.items()}[x])

df_tsne['label'] = kmeans.labels_
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

# Predictive Algorithm
def cluster_predict(label, min_pts = 'auto'):
    """
        Input: an array of clsutered label for each instance
        return: an array of anomal label for each instance
    """
    try:
        # Get Unqiue label and its counts
        (unique, counts) = np.unique(label, return_counts = True)
    
        # Define minimum points that it should have in a cluster, if auto, it will take the min count
        if min_pts == 'auto':
            min_pts = min(counts)
            print('Minimum points of a cluster among the clusters: ', min_pts)
        else:
            min_pts = int(min_pts)

        # Prepare label_dict for mapping
        label_dict = {label: 0 if count > min_pts else 1 for label, count in zip(unique, counts)}

        # Map label_dict to label
        return np.array([label_dict[i] for i in label])
    except Exception as e:
        print(e)
        return None


cluster_predict(kmeans.labels_)


Dimension of sparse_matrix is  (2020, 709)


In [8]:
import pandas as pd
import numpy as np
import platform
import re
from collections import Counter

if platform.system() == 'Windows':
    path = '.\data\STOCK.csv'
else:
    path = '/Users/CliffordMan/Downloads/project/STOCK.csv'

def get_truth_label(path, threshold = 0.4):
    # Load dataset
    df = pd.read_csv(path)

    # preprocess the data in order to get a proper data structure
    df = df.set_index('Unnamed: 0').transpose().dropna()
    df = df.reset_index()
    df['index'] = df['index'].apply(lambda x: retrieve_stock_code(x))
    df = df.set_index('index')

    # Define col_dim and empty dataframe
    col_dim = len(df.columns)
    temp = pd.DataFrame()

    # Create a list of column name without the first element
    first_dim = df.columns[0]
    col_list = df.columns.to_list()
    col_list.remove(first_dim)

    for col in col_list:
        # Assign the col to second_dim, as current date
        second_dim = col

        # Calculate the daily % change of stock price
        
        temp[col] = (df[second_dim] - df[first_dim]) / df[first_dim]

        # Assign the col to first dim, as previous date
        first_dim = col

    result = np.sum(temp > threshold, axis = 1)

    return {stock_code:1 if count > 0 else 0 for stock_code, count in result.items()}

def retrieve_stock_code(x):
    d = re.search('[0-9]*', x)
    if d:
        return ('00000' + d.group(0))[-5:]
    else:
        return None
truth_label = get_truth_label(path)
res = Counter(truth_label.values())
print(res)
data = list(res.values())



Counter({0: 1946, 1: 74})


In [14]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix

prediction =cluster_predict(clustering.labels_).counts
truth_label=data



# print precision, recall and return F_score
def f_score(y_truth, y_pred, beta = 1):
    
    try:
        # Run confusion_matrix
        tn, fp, fn, tp = confusion_matrix(y_truth, y_pred).ravel()
        
        precision_value = precision(tp, fp)
        recall_value = recall(tp, fn)
        # print recall
        print('True positive: {}, True Negative: {}, False Positive: {}, False Negative: {}'.format(tp, tn, fp, fn))
        print('Precision is ', format(precision_value * 100, '.2f'), '%')
        print('Recall is ', format(recall_value * 100, '.2f'), '%')
        
        return (1 + beta**2) * (precision_value * recall_value) / ((beta**2 * precision_value + recall_value))
    except Exception as e:
        print(e)
        return None

# return precision
def precision(tp, fp):
    return tp / (tp + fp)

# return recall():
def recall(tp, fn):
    return tp / (tp + fn)
    
# display ROC curve and return AUC
def ROC_AUC(y_truth, y_pred):
    pass

# The higher the f-score, the better the model
# input the truth label first and then your prediction
f_score(truth_label, prediction, 1)

too many values to unpack (expected 4)
