In [1]:
# Execute the companion script get_vocab.py from the notebook's working directory.
# NOTE(review): presumably this builds ./Data/stocks_symbols.vocab, which the
# final cell passes to generate_d2gpo_embedding.py — confirm against the script.
%run get_vocab.py

In [2]:
import pandas as pd
import numpy as np
import os
import subprocess

# Base directory for the input CSV; later cells also write their outputs under ./Data.
data_dir = './Data'

In [2]:
# Load the raw data and preview the first five rows.
#
# The path is built with os.path.join: plain concatenation
# (data_dir + './data.csv') yields './Data./data.csv', i.e. a directory
# named 'Data.' that does not exist on POSIX filesystems.
rawdata = pd.read_csv(os.path.join(data_dir, 'data.csv'))
rawdata.head()

Unnamed: 0.1,Unnamed: 0,gt_val,gt_sym,lt_val,lt_sym
0,2019-01-03,"[0.8706872487325694, 0.6136363069873223, 0.495...","['PHUN', 'NBSE', 'OCGN', 'BOXL', 'KTOV', 'BTAI...","([-0.3673469462229876, -0.26984126683776033, -...","['SESN', 'CCCL', 'SELB', 'ATRA', 'LTRPB', 'SFE..."
1,2019-01-04,"[1.0265407170232148, 0.6296295293346674, 0.327...","['PHUN', 'CGIX', 'EPZM', 'DRRX', 'TMDI', 'SUNW...","([-0.1925466206425025, -0.1573959338395985, -0...","['RBZ', 'TVIX', 'ELGX', 'ZBIO', 'BOXL', 'SQQQ'..."
2,2019-01-07,"[1.612167143201626, 0.4767024881329074, 0.4323...","['AXSM', 'ADXS', 'INSM', 'SAGE', 'PDSB', 'RBZ'...","([-0.5494845390218467, -0.5019011841153402, -0...","['CFRX', 'LJPC', 'NHTC', 'AXGN', 'CUTR', 'GSUM..."
3,2019-01-08,"[1.0444301396706108, 0.4708994668947917, 0.364...","['CLPS', 'MBOT', 'YI', 'PHUN', 'PLXP', 'ORGO',...","([-0.7539682502138672, -0.2285713847802603, -0...","['PROF', 'BRPAR', 'CFRX', 'ECOR', 'GWGH', 'AEY..."
4,2019-01-09,"[5.127232220625748, 1.5882352941176472, 1.4078...","['ORGO', 'PHUN', 'WINS', 'SAEX', 'VIVE', 'ATOS...","([-0.2503917207560298, -0.16546764132411507, -...","['SGH', 'MBOT', 'CLPS', 'AXSM', 'SGMO', 'MRIN'..."


In [3]:
# number of words kept from one stringified symbol list (one "long sentence")
n_top = 500

# number of words in each sentence of the embedding training set
n_similar = 20

# number of sentences in the embedding training set
n_sentence = 50000


def get_top(x, n=n_top):
    """Parse a stringified list of symbols and return its first ``n`` entries.

    Parameters
    ----------
    x : str
        Text such as ``"['PHUN', 'NBSE', 'OCGN']"``.
    n : int
        Maximum number of symbols to keep (defaults to ``n_top``).

    Returns
    -------
    list of str
        The leading symbols in their original order; ``[]`` for ``"[]"``.
    """
    cleaned = x
    # strip the list punctuation so that only whitespace-separated symbols remain
    for char in "[]',":
        cleaned = cleaned.replace(char, "")
    # split() without an argument drops empty tokens, so an empty list string
    # gives [] (not ['']) and repeated spaces do not create '' "symbols"
    return cleaned.split()[0:n]


def get_similar(x, n=n_similar):
    """Draw ``n`` distinct items from ``x`` using NumPy's global random state.

    Parameters
    ----------
    x : sequence
        Pool of words to sample from.
    n : int
        Number of items to draw without replacement (defaults to ``n_similar``).

    Returns
    -------
    list
        The sampled items, in the order they were drawn.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n`` exceeds ``len(x)``.
    """
    # sample positions rather than values so that x may be any indexable sequence
    ind = np.random.choice(len(x), n, replace=False)
    return [x[i] for i in ind]

In [4]:
# Build one long sentence per row from each symbol column (gt_sym, then lt_sym)
# and show how many long sentences there are in total.

gt_top_list = [get_top(symbols, n_top) for symbols in rawdata['gt_sym']]
lt_top_list = [get_top(symbols, n_top) for symbols in rawdata['lt_sym']]
total_list = [*gt_top_list, *lt_top_list]
len(total_list)

502

In [5]:
# Pick n_sentence long sentences (with replacement) out of total_list,
# then shrink each pick to a short sentence of n_similar words.

np.random.seed(1993)

ind = np.random.choice(len(total_list), n_sentence, replace=True)

output_list = []
for sentence_id in ind:
    # one get_similar() call per pick, in pick order, so the random stream is consumed in the same sequence
    output_list.append(get_similar(total_list[sentence_id], n_similar))

print(len(output_list))
output_list[0]

50000


['MDGL',
 'RAND',
 'CCCL',
 'VOXX',
 'RIOT',
 'MBCN',
 'ATLO',
 'SRRK',
 'JRJC',
 'BVSN',
 'THMO',
 'LIND',
 'NEWA',
 'HX',
 'ANTE',
 'BIB',
 'JRSH',
 'SES',
 'ALDX',
 'MOGO']

In [6]:
# Flatten the short sentences into one word stream: the embedding training set.

output = []
for sentence in output_list:
    output.extend(sentence)

unique_stocks = set(output)
print(len(output))
print(len(unique_stocks))

1000000
2928


In [7]:
# Write the word stream to a text file, each word followed by a single space.
out_dir = './Data/stocks_emb_train.txt'
with open(out_dir, 'w', encoding='utf-8') as fout:
    fout.writelines(word + ' ' for word in output)


In [11]:
# Train skip-gram embeddings with the fastText command-line tool.
#
# The command no longer redirects its own output: with "1>... 2>&1" inside the
# command string nothing reached capture_output, so the prints below were
# always empty, and the redirect pointed at "data/" (lowercase) while every
# other path in this notebook uses "Data/".

cmd = "./fasttext skipgram -epoch 5 -minCount 0 -dim 100 -thread 12 -ws 5 -neg 5 -input Data/stocks_emb_train.txt -output Data/stocks_emb"

result = subprocess.run(cmd, capture_output=True, text=True, shell=True)

# Keep a log on disk, written from Python so it lands next to the other outputs.
with open('Data/stocks_emb.log', 'w', encoding='utf-8') as flog:
    flog.write(result.stdout)
    flog.write(result.stderr)

print(result.stdout)
print(result.stderr)

# Fail loudly (CalledProcessError) if fastText exited with a non-zero status,
# instead of letting the next cell run against missing or stale model files.
result.check_returncode()





In [5]:
# Run generate_d2gpo_embedding.py to get the .w2vec file for the next step.
# NOTE(review): stocks_emb.bin / stocks_emb.vec are presumably the model files
# written by the fastText run above (-output Data/stocks_emb), and
# stocks_emb.w2vec presumably the file produced here — confirm the argument
# roles against generate_d2gpo_embedding.py.
%run generate_d2gpo_embedding.py ./Data/stocks_emb.bin ./Data/stocks_symbols.vocab ./Data/stocks_emb.vec ./Data/stocks_emb.w2vec


