In [1]:
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, CuDNNLSTM, Dropout
from tensorflow.keras.utils import to_categorical
import EstimatorPreprocessor as ep
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse
from keras.models import load_model
from sklearn.metrics import classification_report

In [2]:
data = ep.load_cleaned_submissions()
y = ep.encode_labels(data)

In [3]:
new_sentences = []
sentences = data["text"].tolist()
max_sentence_len = ep.max_sentence_length(sentences, truncate=True, max_len = 90)
for sentence in sentences:
    new_sentences.append(sentence[:max_sentence_len])
sentences = new_sentences
del new_sentences

In [None]:
# Can maybe be removed
# Todo: Loop over hyperparameters
vec_size = [50, 100, 200]
min_c = [1]
w = [1, 2, 3]

for vec in vec_size:
    for mc in min_c:
        for win in w:
            print(vec, mc, win)
            word_model, pretrained_weights, vocab_size, embedding_size = ep.embedding_word2vec(sentences, vec_size = vec, min_c = mc, w = win)
            for word in ['moon', 'short', 'robinhood', 'andromeda', 'ape', '🦍']:
                most_similar = ', '.join('%s (%.2f)' % (similar, dist) 
                                        for similar, dist in word_model.wv.most_similar(word)[:3])
                print('  %s -> %s' % (word, most_similar))
            print("-----------------"*5)



In [4]:
# Replicate model with best hyperparameters
word_model, pretrained_weights, vocab_size, embedding_size = ep.embedding_word2vec(sentences, vec_size = 50, min_c = 1, w = 1)

In [12]:
for word in ['robinhood', 'andromeda', 'ape', 'hedgefund', '🦍']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) 
                           for similar, dist in word_model.wv.most_similar(word)[:3])
  print('  %s -> %s' % (word, most_similar))

  robinhood -> rh (0.98), etrade (0.87), webull (0.86)
  andromeda -> jupiter (0.95), mars (0.94), uranus (0.93)
  ape -> autist (0.94), monkey (0.91), retard (0.90)
  hedgefund -> hfs (0.93), hf (0.89), shorter (0.88)
  🦍 -> 🍌 (0.91), 🦧 (0.91), 🐒 (0.87)


In [6]:
X_train_lstm = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
y_train_lstm = np.zeros([len(sentences)], dtype=np.int32)

In [7]:
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence):
    X_train_lstm[i, t] = ep.word2idx(word_model, word)
  #y_train_lstm[i] = word2idx(sentence[-1])

In [8]:
y_train = to_categorical(y)

In [9]:
X_train_lstm, X_test_lstm, y_train, y_test = train_test_split(X_train_lstm, y_train, stratify=y_train)

In [10]:
def build_model(d = 0.25, opt = "adam"):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = embedding_size, weights = [pretrained_weights]))
    model.add(CuDNNLSTM(units = embedding_size))
    model.add(Dropout(d))
    model.add(Dense(3, activation = "softmax"))
    model.compile(opt, "categorical_crossentropy", metrics = ["acc"])
    return model

In [12]:
acc = []
val_acc = []
loss = []
val_loss = []
param_list = []

dropout = [0, 0.25, 0.5]
optimi = ["rmsprop", "SGD", "Adam"]

# Best model -> also change epochs
dropout = [0]
optimi = ["Adam"]

for d in dropout:
    for opt in optimi:
        model = build_model(d)
        history = model.fit(X_train_lstm, y_train, epochs = 3, validation_split = 0.2, batch_size = 64, verbose = 2)
        # acc.append(history.history['acc'])
        # val_acc.append(history.history['val_acc'])
        # loss.append(history.history['loss'])
        # val_loss.append(history.history['val_loss'])
        param_list.append("Dropout: " + str(d) + 
                            " & Optimizer:" + opt +
                            " ;Train Acc: " + str(history.history['acc']) + 
                            " ;Train Loss: "+ str(history.history['loss']) + 
                            " ;Val Acc:" + str(history.history['val_acc']) +
                            " ;Val Loss:" + str(history.history['val_loss'])
                            )
        history.model.save("./model/lstm/lstm" + str(d) + "_" + opt + ".h5")

Epoch 1/3
1684/1684 - 17s - loss: 0.7477 - acc: 0.7090 - val_loss: 0.4413 - val_acc: 0.8338 - 17s/epoch - 10ms/step
Epoch 2/3
1684/1684 - 16s - loss: 0.3582 - acc: 0.8680 - val_loss: 0.3202 - val_acc: 0.8878 - 16s/epoch - 9ms/step
Epoch 3/3
1684/1684 - 15s - loss: 0.2404 - acc: 0.9173 - val_loss: 0.3094 - val_acc: 0.8930 - 15s/epoch - 9ms/step


In [40]:
textfile = open("./eval_model/lstm_final.txt", "w")
e = " ;Epoch: " + str(list(np.arange(1, len(history.history["acc"])+1, 1)))
for item in param_list:
    textfile.write(item + e + "\n")
textfile.close()

In [13]:
# Save all parameters to text file
ep.save_param_list_keras(path = "./eval_model/lstm_final.txt", p_list = param_list)

In [14]:
test_pred = model.predict(X_test_lstm)

In [12]:
# Retrain model with best metrics and given hyperparameters then
# print classification report
loaded_model = load_model('./model/lstm/lstm0.5.h5') # First retrain model and then load the appropriate one
test_pred = loaded_model.predict(X_test_lstm)

OSError: No file or directory found at ./model/lstm/lstm0.5.h5

In [15]:
converted_test_pred = []
for i in range(len(test_pred)):
    maxi = np.argmax(test_pred[i])
    arr = np.zeros((3))
    arr[maxi] = 1
    converted_test_pred.append(arr)

In [16]:
print(classification_report(y_test, converted_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      6022
           1       0.93      0.94      0.93     30278
           2       0.82      0.80      0.81      8586

   micro avg       0.89      0.89      0.89     44886
   macro avg       0.86      0.85      0.85     44886
weighted avg       0.89      0.89      0.89     44886
 samples avg       0.89      0.89      0.89     44886

