In [63]:
import sys
import platform
import re

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest

import plotly.express as px

from utils import *

# Data Preparation
- This step prepares a matrix that is suitable for training the models
- The matrix is sparse, i.e. most of its elements are zero
- Each row of the matrix represents a stock code, while each column represents a shareholder
- The value at (i, j) is the number of shares of stock i held by shareholder j

| Example | Shareholder1 | Shareholder2 |
| --- | --- | --- |
| Stock1 | shares | shares |
| Stock2 | shares | shares |
| Stock3 | shares | shares |
| ... | ... | ... |

In [64]:
# Define data_path.
# The Windows path is a raw string: in a normal literal '\d' and '\s' are
# invalid escape sequences, which newer Python versions warn about.
if platform.system() == 'Windows':
    data_path = r'.\data\stock_port.csv'
else:
    data_path = './data/stock_port.csv'

# Load dataset from data path
df = load_data(data_path)

# Convert df to the stock-by-shareholder matrix.
# row_ind_dict maps stock_code -> row index (it is iterated that way further down);
# col_ind_dict is presumably the equivalent lookup for shareholders - TODO confirm in utils.
sp_matrix, row_ind_dict, col_ind_dict = convert_data_sparse_matrix(df)

# List basic info
print('Dimension of sparse_matrix: ', sp_matrix.shape)
row_dim = sp_matrix.shape[0]
col_dim = sp_matrix.shape[1]
print('How many stock do we have in the dataset: ', row_dim)
print('How many unique shareholder do we have in the dataset', col_dim)

Dimension of sparse_matrix:  (2020, 709)
How many stock do we have in the dataset:  2020
How many unique shareholder do we have in the dataset 709


# Data Preprocessing

- This step normalizes each row by converting the number of shares held into the shareholding % of that stock
- The sum over each row (one row per stock) should therefore be one

In [65]:
# Shareholding % per stock: divide every row by the total shares of that row.
# The row totals are reshaped to a column so the division broadcasts row-wise.
row_totals = np.sum(sp_matrix, axis=1).reshape(row_dim, -1)
sp_matrix_stock = sp_matrix / row_totals

# Sanity check: every row (stock) should now sum to one
np.sum(sp_matrix_stock, axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

# Data Visualization

- This step visualizes what the data looks like
- High-dimensional data cannot be plotted directly, but there are techniques that project it into a low-dimensional space while preserving its structure
- We use t-SNE to project the high-dimensional data into 2-D space

In [66]:
# Apply TSNE to sp_matrix_stock to obtain a 2-D embedding for plotting.
# t-SNE is stochastic, so random_state is fixed (as for the KMeans and
# IsolationForest fits in this notebook) to make the embedding, and every
# scatter plot built on it, reproducible across runs.
dim = 2
perplexity = 100
learning_rate = 200
X_embedded = TSNE(
    n_components=dim,
    perplexity=perplexity,
    learning_rate=learning_rate,
    init='pca',
    random_state=0,
).fit_transform(sp_matrix_stock)

In [67]:
# Visualize the result with stock code label
df_tsne = pd.DataFrame(X_embedded, columns=['X1', 'X2'])

# row_ind_dict maps stock_code -> row index. Invert it once, instead of
# rebuilding the inverted dict for every single row, then look up each row's code.
ind_stock_dict = {ind: stock_code for stock_code, ind in row_ind_dict.items()}
df_tsne['stock_code'] = [ind_stock_dict[ind] for ind in df_tsne.index]

fig = px.scatter(df_tsne, 'X1', 'X2', hover_name='stock_code')
fig.show()

# Clustering - K-Mean

- After visualizing the data, we fit three different kinds of models to it: K-means, DBSCAN and Isolation Forest.
- Since K-means and DBSCAN are clustering techniques, they have no predictive ability of their own, so we have derived an algorithm to predict whether a stock is an anomaly
- We then visualize the clustering result on the 2-D embedding from t-SNE

In [68]:
# Cluster the normalized shareholding matrix with K-Means.
# The seed is fixed so the cluster assignment is repeatable.
n_clusters = 30
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(sp_matrix_stock)

In [69]:
# Attach the K-Means cluster id of every stock to the t-SNE frame,
# then colour the 2-D embedding by cluster.
df_tsne['label'] = kmeans.labels_
fig = px.scatter(
    df_tsne,
    'X1',
    'X2',
    color='label',
    hover_name='stock_code',
)
fig.show()

# Prediction

- Since K-means is a clustering technique, it doesn't have the ability to predict; to detect anomalous stocks, we have derived a simple algorithm for prediction
- The predictive algorithm treats every data point in a cluster as an anomaly when the number of data points in that cluster is equal to or below a pre-set threshold
- By default, the threshold is set to the number of data points in the smallest cluster

In [70]:
# Cluster-size threshold handed to cluster_predict for the K-Means labels
min_pts = 25
prediction = cluster_predict(
    kmeans.labels_,
    min_pts=min_pts,
)

# Define the Truth label
- Before evaluating how well the model performs, we have to define the truth label
- We have scraped the daily closing price of each stock from 2020-09-01 to 2020-11-11
- We define a stock as an 'anomaly' if its price rose suddenly by 30% within a single day

In [71]:
# Define the path of the price file used to build the truth labels.
# The Windows path is a raw string: in a normal literal '\d' and '\S' are
# invalid escape sequences, which newer Python versions warn about.
if platform.system() == 'Windows':
    truth_path = r'.\data\STOCK.csv'
else:
    truth_path = './data/STOCK.csv'

In [72]:
# Get the truth_label (looked up by stock_code below)
truth_label = get_truth_label(truth_path)

# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except LookupError:
        # No truth label for this stock (its price could not be obtained):
        # fall back to label 0. Only lookup failures are handled here so that
        # unrelated errors (or a keyboard interrupt) are no longer swallowed.
        ind_label_dict[ind] = 0

# Pair every prediction with the truth label of the same matrix row.
# The loop variable is named `pred` so it does not shadow `prediction`.
kmeans_result = np.array([(pred, ind_label_dict[ind])
                          for ind, pred in enumerate(list(prediction))])

y_pred = kmeans_result[:, 0]
y_truth = kmeans_result[:, 1]

print('Number of positive examples in our prediction: ', y_pred.sum())
print('Number of positive examples in our truth label: ', y_truth.sum())

Number of positive examples in our prediction:  260
Number of positive examples in our truth label:  120


# Confusion Matrix
- After we get y_pred and y_truth, we evaluate the model using a confusion matrix
- As the task is anomaly detection, in which the positive label (y = 1) is relatively rare, the dataset is in fact an imbalanced dataset, so a confusion matrix is needed to evaluate it
- The F-score is used as a single summary measure of the confusion matrix in order to optimize the hyperparameters of the model

In [73]:
# Evaluate the K-Means predictions against the truth labels; the value
# returned by f_score is displayed as the cell output.
f_score(y_truth, y_pred)

True positive: 19, True Negative: 1659, False Positive: 241, False Negative: 101
Precision is  7.31 %
Recall is  15.83 %


0.09999999999999999

# DBSCAN
- We repeat the process by fitting DBSCAN

In [74]:
# Fit DBSCAN on the normalized shareholding matrix
clustering = DBSCAN(eps=0.2, min_samples=20)
clustering.fit(sp_matrix_stock)

# Show the cluster id assigned to each stock
clustering.labels_

array([ 0,  0,  0, ..., -1, -1, -1])

In [75]:
# Replace the label column with the DBSCAN cluster ids and redraw the embedding
df_tsne['label'] = clustering.labels_
fig = px.scatter(
    df_tsne,
    'X1',
    'X2',
    color='label',
    hover_name='stock_code',
)
fig.show()

In [76]:
# Cluster-size threshold handed to cluster_predict for the DBSCAN labels
min_pts = 30
prediction = cluster_predict(clustering.labels_, min_pts=min_pts)

# Get the truth_label (looked up by stock_code below)
truth_label = get_truth_label(truth_path)

# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except LookupError:
        # No truth label for this stock (its price could not be obtained):
        # fall back to label 0. Only lookup failures are handled here so that
        # unrelated errors (or a keyboard interrupt) are no longer swallowed.
        ind_label_dict[ind] = 0

# Pair every prediction with the truth label of the same matrix row.
# The loop variable is named `pred` so it does not shadow `prediction`.
dbscan_result = np.array([(pred, ind_label_dict[ind])
                          for ind, pred in enumerate(list(prediction))])

y_pred = dbscan_result[:, 0]
y_truth = dbscan_result[:, 1]

print('Number of positive examples in our prediction: ', y_pred.sum())
print('Number of positive examples in our truth label: ', y_truth.sum())

Number of positive examples in our prediction:  340
Number of positive examples in our truth label:  120


In [77]:
# Evaluate the DBSCAN predictions against the truth labels; the value
# returned by f_score is displayed as the cell output.
f_score(y_truth, y_pred)

True positive: 29, True Negative: 1589, False Positive: 311, False Negative: 91
Precision is  8.53 %
Recall is  24.17 %


0.12608695652173915

# Isolation Forest
- We repeat the process by fitting Isolation Forest
- Isolation Forest does not need a predictive algorithm since it has the ability to predict

In [78]:

# Fit an Isolation Forest on the normalized shareholding matrix, then score
# the same rows. The seed is fixed so the forest is repeatable.
clf = IsolationForest(
    n_estimators=200,
    max_features=100,
    contamination=0.1,
    max_samples=256,
    random_state=0,
)
clf.fit(sp_matrix_stock)
label = clf.predict(sp_matrix_stock)


Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.



In [79]:
# Replace the label column with the Isolation Forest output and redraw the embedding
df_tsne['label'] = label
fig = px.scatter(
    df_tsne,
    'X1',
    'X2',
    color='label',
    hover_name='stock_code',
)
fig.show()

In [80]:
# Convert the Isolation Forest output to 0/1: a label of -1 becomes a
# positive (anomaly) prediction, every other label becomes 0.
prediction = [1 if i == -1 else 0 for i in label]

# Get the truth_label (looked up by stock_code below)
truth_label = get_truth_label(truth_path)

# Convert to index-to-label dict
ind_label_dict = {}
for stock_code, ind in row_ind_dict.items():
    try:
        ind_label_dict[ind] = truth_label[stock_code]
    except LookupError:
        # No truth label for this stock (its price could not be obtained):
        # fall back to label 0. Only lookup failures are handled here so that
        # unrelated errors (or a keyboard interrupt) are no longer swallowed.
        ind_label_dict[ind] = 0

# Pair every prediction with the truth label of the same matrix row.
# The loop variable is named `pred` so it does not shadow `prediction`.
iso_result = np.array([(pred, ind_label_dict[ind])
                       for ind, pred in enumerate(list(prediction))])

y_pred = iso_result[:, 0]
y_truth = iso_result[:, 1]

In [81]:
# Evaluate the Isolation Forest predictions against the truth labels; the
# value returned by f_score is displayed as the cell output.
f_score(y_truth, y_pred)

True positive: 15, True Negative: 1705, False Positive: 195, False Negative: 105
Precision is  7.14 %
Recall is  12.50 %


0.09090909090909091

In [82]:
# Take a look where is the truth label
df_tsne['label'] = y_truth
fig = px.scatter(df_tsne,'X1','X2', hover_name = 'stock_code', color = 'label')
fig.show()