In [5]:
import sys
import platform
import re

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix

import plotly.express as px



def convert_data_sparse_matrix(df, row_label='stock_code', col_label='name_of_ccass_participant',
                               value_label='shareholding'):
    """
        Pivot table
    """
    try:
        # Prepare zero matrix
        row_dim = len(df[row_label].unique())
        col_dim = len(df[col_label].unique())
        sparse_matrix = np.zeros((row_dim, col_dim))

        # Prepare label to index dictionaries
        row_ind_dict = {label: ind for ind, label in enumerate(sorted(df[row_label].unique().tolist()))}
        col_ind_dict = {label: ind for ind, label in enumerate(sorted(df[col_label].unique().tolist()))}

        # Transform row_label column and col_label column to index
        df['row_ind'] = df[row_label].apply(lambda x: row_ind_dict[x])
        df['col_ind'] = df[col_label].apply(lambda x: col_ind_dict[x])

        for ind, row in df.iterrows():
            # Get index and shareholding
            row_ind = row['row_ind']
            col_ind = row['col_ind']
            value = row[value_label]

            # Assign to sparse matrix
            sparse_matrix[row_ind, col_ind] += value

        return sparse_matrix, row_ind_dict, col_ind_dict
    except Exception as e:
        print(e)
        return None


def load_data(data_path):
    # Read csv files
    df = pd.read_csv(data_path)

    # Convert stock code to formatted string
    df['stock_code'] = df['stock_code'].apply(lambda x: ('00000' + str(x))[-5:])

    return df
# Define data_path
if platform.system() == 'Windows':
    data_path = '.\data\stock_port.csv'
else:
    data_path = '/Users/CliffordMan/Downloads/project/stock_port.csv'
    
# Load dataset from data path
df = load_data(data_path)

# Convert df to sparse matrix
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# List Basic Info
print('Dimension of sparse_matrix: ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]
print('How many stock do we have in the dataset: ', row_dim)
print('How many unique shareholder do we have in the dataset', col_dim)
# Calculate shareholding % by stock_code
sp_matrix_stock = sp_matrix / np.sum(sp_matrix, axis = 1).reshape(row_dim, -1)

# Sum of column should result one
np.sum(sp_matrix_stock, axis = 1)

# Apply TSNE to sp_matrix_stock
dim = 2
perplexity = 100
learning_rate = 200
X_embedded = TSNE(n_components = dim, perplexity = perplexity, \
                  learning_rate = learning_rate, init='pca').fit_transform(sp_matrix_stock)

# Visualize the result with stcok code label
df_tsne = pd.DataFrame(X_embedded, columns = ['X1', 'X2'])
df_tsne['stock_code'] = df_tsne.reset_index()['index']\
                        .apply(lambda x: {j:i for i,j in row_ind_dict.items()}[x])
    
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code')
fig.show()

# Apply K-Mean to sp_matrix_stock
n_clusters =120
kmeans = KMeans(n_clusters = n_clusters, random_state=0).fit(sp_matrix_stock)


# Map the label back to df_tsne in order to visualize the result in two dimensional space
df_tsne['label'] = kmeans.labels_
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

# Predictive Algorithm
def cluster_predict(label, min_pts = 'auto'):
    """
        Input: an array of clsutered label for each instance
        return: an array of anomal label for each instance
    """
    try:
        # Get Unqiue label and its counts
        (unique, counts) = np.unique(label, return_counts = True)
    
        # Define minimum points that it should have in a cluster, if auto, it will take the min count
        if min_pts == 'auto':
            min_pts = min(counts)
            print('Minimum points of a cluster among the clusters: ', min_pts)
        else:
            min_pts = int(min_pts)

        # Prepare label_dict for mapping
        label_dict = {label: 0 if count > min_pts else 1 for label, count in zip(unique, counts)}

        # Map label_dict to label
        return np.array([label_dict[i] for i in label])
    except Exception as e:
        print(e)
        return None

min_pts = 70
prediction = cluster_predict(kmeans.labels_, min_pts = min_pts)

# Define the path
if platform.system() == 'Windows':
    truth_path = '.\data\STOCK.csv'
else:
    truth_path = '/Users/CliffordMan/Downloads/project/STOCK.csv'
    
def get_truth_label(path, threshold = 0.3):
    # Load dataset
    df = pd.read_csv(path)

    # preprocess the data in order to get a proper data structure
    df = df.set_index('Unnamed: 0').transpose().dropna()
    df = df.reset_index()
    df['index'] = df['index'].apply(lambda x: retrieve_stock_code(x))
    df = df.set_index('index')

    # Define col_dim and empty dataframe
    col_dim = len(df.columns)
    temp = pd.DataFrame()

    # Create a list of column name without the first element
    first_dim = df.columns[0]
    col_list = df.columns.to_list()
    col_list.remove(first_dim)

    for col in col_list:
        # Assign the col to second_dim, as current date
        second_dim = col

        # Calculate the daily % change of stock price
        temp[col] = (df[second_dim] - df[first_dim]) / df[first_dim]

        # Assign the col to first dim, as previous date
        first_dim = col

    result = np.sum(temp > threshold, axis = 1)

    return {stock_code:1 if count > 0 else 0 for stock_code, count in result.items()}

def retrieve_stock_code(x):
    d = re.search('[0-9]*', x)
    if d:
        return ('00000' + d.group(0))[-5:]
    else:
        return None
    # Get the truth_label
truth_label = get_truth_label(truth_path)

# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except:
        # For the exception that we could not get that stock's price
        ind_label_dict[ind] = 0
        
# Map back to prediction
kmeans_result = np.array([(prediction, ind_label_dict[ind]) for ind, prediction \
          in enumerate(list(prediction))])

y_pred = kmeans_result[:,0]
y_truth = kmeans_result[:,1]

print('Number of positive examples in our prediction: ', y_pred.sum())
print('Number of positive examples in our truth label: ', y_truth.sum())



# print precision, recall and return F_score
def f_score(y_truth, y_pred, beta = 1):
    
    try:
        # Run confusion_matrix
        tn, fp, fn, tp = confusion_matrix(y_truth, y_pred).ravel()
        
        precision_value = precision(tp, fp)
        recall_value = recall(tp, fn)
        # print recall
        print('True positive: {}, True Negative: {}, False Positive: {}, False Negative: {}'.format(tp, tn, fp, fn))
        print('Precision is ', format(precision_value * 100, '.2f'), '%')
        print('Recall is ', format(recall_value * 100, '.2f'), '%')
        
        return (1 + beta**2) * (precision_value * recall_value) / ((beta**2 * precision_value + recall_value))
    except Exception as e:
        print(e)
        return None

# return precision
def precision(tp, fp):
    return tp / (tp + fp)

# return recall():
def recall(tp, fn):
    return tp / (tp + fn)
    
# display ROC curve and return AUC
def ROC_AUC(y_truth, y_pred):
    pass

f_score(y_truth, y_pred)

Dimension of sparse_matrix:  (2020, 709)
How many stock do we have in the dataset:  2020
How many unique shareholder do we have in the dataset 709


Number of positive examples in our prediction:  1930
Number of positive examples in our truth label:  120
True positive: 116, True Negative: 86, False Positive: 1814, False Negative: 4
Precision is  6.01 %
Recall is  96.67 %


0.11317073170731708

In [8]:
# Apply DBSCAN
clustering = DBSCAN(eps=0.5, min_samples=100).fit(sp_matrix_stock)
clustering.labels_

df_tsne['label'] = clustering.labels_
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

min_pts = 'auto'
prediction = cluster_predict(clustering.labels_, min_pts=min_pts)

# Get the truth_label
truth_label = get_truth_label(truth_path)

# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except:
        # For the exception that we could not get that stock's price
        ind_label_dict[ind] = 0
        
# Map back to prediction
dbscan_result = np.array([(prediction, ind_label_dict[ind]) for ind, prediction \
          in enumerate(list(prediction))])

y_pred = dbscan_result[:,0]
y_truth = dbscan_result[:,1]

print('Number of positive examples in our prediction: ', y_pred.sum())
print('Number of positive examples in our truth label: ', y_truth.sum())

f_score(y_truth, y_pred)

Minimum points of a cluster among the clusters:  210
Number of positive examples in our prediction:  210
Number of positive examples in our truth label:  120
True positive: 19, True Negative: 1709, False Positive: 191, False Negative: 101
Precision is  9.05 %
Recall is  15.83 %


0.11515151515151514

In [10]:
clf = IsolationForest(n_estimators=150, random_state=0).fit(sp_matrix_stock)
label = clf.predict(sp_matrix_stock)

df_tsne['label'] = label
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()

prediction = [1 if i == -1 else 0 for i in label]

# Get the truth_label
truth_label = get_truth_label(truth_path)
 
# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except:
        # For the exception that we could not get that stock's price
        ind_label_dict[ind] = 0
        
# Map back to prediction
iso_result = np.array([(prediction, ind_label_dict[ind]) for ind, prediction \
          in enumerate(list(prediction))])

y_pred = iso_result[:,0]
y_truth = iso_result[:,1]

f_score(y_truth, y_pred)

True positive: 0, True Negative: 1900, False Positive: 0, False Negative: 120
Precision is  nan %
Recall is  0.00 %



invalid value encountered in long_scalars



nan

In [4]:

# Take a look where is the truth label
df_tsne['label'] = y_truth
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()