In [1]:
# VADER: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-rule-based-vader-and-nltk-72067970fb71

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse

In [2]:
def load_cleaned_submissions():
    data = pd.read_pickle("./data/cleaned_submissions.pkl")
    data = data.loc[data["sentiment"] != "", ["text", "sentiment"]] # Only get labeled instances
    return data

In [3]:
def encode_vectorize_data(data):
    # Encode labels
    lenc = LabelEncoder()
    y = lenc.fit_transform(data["sentiment"])

    # Vectorize text using tfidf
    tfidf = TfidfVectorizer(preprocessor=' '.join, lowercase=False, min_df=5) # min_df = Minimum occurance of words
    X = tfidf.fit_transform(data["text"])
    return X, y

In [4]:
def load_sparse_matrices():
    X = scipy.sparse.load_npz('./data/X_sparse.npz')
    y = np.load("./data/y_sparse.npy")
    return X, y

In [5]:
def split_train_test(X, y):
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
    return X_train, X_test, y_train, y_test

# SVM

In [14]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

pipeline = Pipeline([
        ("svc", SVC())
    ])

param_grid = {"svc__kernel": ["poly"], "svc__C": [3]} #["poly", "rbf", "sigmoid", "linear"], [3, 4, 5, 6, 7]
CV = GridSearchCV(pipeline, param_grid, cv = 5)
# pipeline.get_params().keys() See all available parameters
CV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('svc', SVC())]),
             param_grid={'svc__C': [3], 'svc__kernel': ['poly']})

In [17]:
import pickle
pickle.dump(CV, open("./data/svm_first_model_poly_3.sav", "wb"))

In [18]:
loaded_model = pickle.load(open("./data/svm_first_model_poly_3.sav", "rb"))

In [19]:
print("Accuracy of {} can be achieved with the following parameters: {}".format(loaded_model.score(X_test, y_test), CV.best_params_))

Accuracy of 0.8837895792141246 can be achieved with the following parameters: {'svc__C': 3, 'svc__kernel': 'poly'}


# Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

pipeline = Pipeline([
        ("mnb", MultinomialNB())
    ])

param_grid = {"mnb__alpha": [1]}
CV = GridSearchCV(pipeline, param_grid, cv = 5)
# pipeline.get_params().keys() See all available parameters
CV.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('mnb', MultinomialNB())]),
             param_grid={'mnb__alpha': [1]})

In [23]:
import pickle
pickle.dump(CV, open("./data/nb_first_model_mnb.sav", "wb"))

In [24]:
loaded_model = pickle.load(open("./data/nb_first_model_mnb.sav", "rb"))

In [25]:
print("Accuracy of {} can be achieved with the following parameters: {}".format(loaded_model.score(X_test, y_test), CV.best_params_))

Accuracy of 0.7632626918042831 can be achieved with the following parameters: {'mnb__alpha': 1}


# LSTM

In [6]:
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

In [247]:
# Begin DEV #
# This DEV actually works!!! Use this!!!
# https://stackoverflow.com/questions/42064690/using-pre-trained-word2vec-with-lstm-for-word-generation

In [294]:
data = load_cleaned_submissions()

In [295]:
from sklearn.preprocessing import LabelEncoder
lenc = LabelEncoder()
y_train = lenc.fit_transform(data["sentiment"].tolist())

In [296]:
sentences = data["text"].tolist()

In [297]:
max_sentence_len = 0
for sentence in sentences:
    if len(sentence) > max_sentence_len:
        max_sentence_len = len(sentence)
print(max_sentence_len)

2934


https://www.quora.com/What-are-the-strategies-to-deal-with-different-length-of-sentences-for-RNN-or-LSTM
1) If the sentences are too long, try to create an embedding that maps the words to a smaller feature space. Take a look at GloVe embeddings, Word2Vec, etc.

2) Increase the depth of the RNN. As the sequence length gets longer, it becomes harder and harder for a single layered LSTM to process the dependencies in the data. Adding more hidden layers greatly helps.

In [298]:
# Manually set it lower, because 2934 words in a sentence is quite a lot!!!
max_sentence_len = 90 # set it to 100 to have less dimensions for dev purposes

In [299]:
new_sentences = []
for sentence in sentences:
    new_sentences.append(sentence[:max_sentence_len])
sentences = new_sentences
del new_sentences

In [300]:
import gensim
word_model = gensim.models.Word2Vec(sentences, vector_size=200, min_count=1, window=5) # Vector_size = number of words??? Check params!!!

In [301]:
pretrained_weights = word_model.wv.vectors

In [302]:
vocab_size, emdedding_size =pretrained_weights.shape

In [303]:
for word in ['moon', 'short', 'robinhood', 'andromeda', 'ape', '🦍']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) 
                           for similar, dist in word_model.wv.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))

  moon -> rocketship (0.98), brrrrr (0.97), rocket (0.97), andromeda (0.97), sticker (0.97), gooooo (0.96), baby (0.96), thinkcheersps (0.96)
  short -> 138 (0.94), interest (0.93), cover (0.92), seller (0.92), float (0.92), fridays (0.92), 22642 (0.92), unwind (0.91)
  robinhood -> rh (0.98), app (0.98), zealand (0.98), webull (0.98), broker (0.98), allow (0.98), purchase (0.98), uk (0.97)
  andromeda -> rocket (0.99), pluto (0.99), mars (0.99), rocketship (0.99), baby (0.98), galaxy (0.98), uranus (0.98), mooooon (0.98)
  ape -> fellow (0.98), autist (0.95), retard (0.95), fighting (0.94), grandkid (0.94), stay (0.94), together (0.93), banana (0.93)
  🦍 -> 🍌 (0.99), 🦧 (0.98), 💪 (0.98), 🙏 (0.98), ✋ (0.98), 🤝 (0.98), 🤲 (0.98), 🏻 (0.98)


In [304]:
def word2idx(word):
  return word_model.wv.key_to_index[word]
def idx2word(idx):
  return word_model.wv.index_to_key[idx]

In [305]:
x_train_lstm = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
y_train_lstm = np.zeros([len(sentences)], dtype=np.int32)

In [306]:
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence):
    x_train_lstm[i, t] = word2idx(word)
  #y_train_lstm[i] = word2idx(sentence[-1])

In [307]:
print(y_train[0], data["sentiment"][0], to_categorical(y_train)[0])
print(y_train[-2], list(data["sentiment"])[-2], to_categorical(y_train)[-2])
print(y_train[6], data["sentiment"][60], to_categorical(y_train)[6])

1 bullish [0. 1. 0.]
2 neutral [0. 0. 1.]
0 bearish [1. 0. 0.]


In [308]:
y_train = to_categorical(y_train)

In [311]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, CuDNNLSTM, Dropout

model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = emdedding_size, weights = [pretrained_weights]))
model.add(CuDNNLSTM(units = emdedding_size))
model.add(Dropout(0.25))
model.add(Dense(3, activation = "softmax"))
model.compile("adam", "categorical_crossentropy", metrics = ["acc"])
model.fit(x_train_lstm, y_train, epochs=20, validation_split=0.2, batch_size=64, verbose=2)

Epoch 1/20
225/225 - 5s - loss: 1.0239 - acc: 0.5100 - val_loss: 1.0158 - val_acc: 0.5358 - 5s/epoch - 21ms/step
Epoch 2/20
225/225 - 3s - loss: 1.0146 - acc: 0.5177 - val_loss: 1.0285 - val_acc: 0.5355 - 3s/epoch - 15ms/step
Epoch 3/20
225/225 - 3s - loss: 1.0115 - acc: 0.5198 - val_loss: 1.0485 - val_acc: 0.5336 - 3s/epoch - 15ms/step
Epoch 4/20
225/225 - 3s - loss: 0.9934 - acc: 0.5333 - val_loss: 1.0403 - val_acc: 0.5341 - 3s/epoch - 15ms/step
Epoch 5/20
225/225 - 3s - loss: 0.9152 - acc: 0.5654 - val_loss: 0.9727 - val_acc: 0.5338 - 3s/epoch - 15ms/step
Epoch 6/20
225/225 - 3s - loss: 0.7072 - acc: 0.7018 - val_loss: 0.8926 - val_acc: 0.6352 - 3s/epoch - 15ms/step
Epoch 7/20
225/225 - 3s - loss: 0.4820 - acc: 0.8058 - val_loss: 0.9591 - val_acc: 0.6332 - 3s/epoch - 15ms/step
Epoch 8/20
225/225 - 3s - loss: 0.3706 - acc: 0.8570 - val_loss: 1.0357 - val_acc: 0.6140 - 3s/epoch - 15ms/step
Epoch 9/20
225/225 - 3s - loss: 0.3168 - acc: 0.8798 - val_loss: 1.1400 - val_acc: 0.6280 - 3s/e

<keras.callbacks.History at 0x24d874c22e0>

In [246]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, None, 200)         4382600   
                                                                 
 cu_dnnlstm_10 (CuDNNLSTM)   (None, 200)               321600    
                                                                 
 dense_10 (Dense)            (None, 3)                 603       
                                                                 
Total params: 4,704,803
Trainable params: 4,704,803
Non-trainable params: 0
_________________________________________________________________


In [8]:
# End DEV #

In [7]:
X, y = load_sparse_matrices()
X_train, X_test, y_train, y_test = split_train_test(X, y)
del X, y

In [9]:
import tensorflow as tf
print(tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


/device:GPU:0
Num GPUs Available:  1
Num GPUs Available:  1


In [8]:
# Flatten array
# Limit Size for DEV
X_train_lstm = X_train.toarray()[:10000, :, None]
# y_train_lstm = to_categorical(y_train) # To make it 2d
# y_train_lstm = y_train_lstm[:10000, :]
y_train_lstm = y_train[:10000]
# del X_train
# del y_train

In [165]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))
print(tf.config.list_physical_devices())

2.7.0
Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [166]:
physical_devices = tf.config.list_physical_devices('GPU')
try:
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass

In [9]:
from sklearn.utils import compute_class_weight
classWeight = compute_class_weight('balanced', np.unique(y_train_lstm), y_train_lstm) 
classWeight = dict(enumerate(classWeight))



In [10]:
classWeight

{0: 2.440214738897023, 1: 0.5340168749332479, 2: 1.3935340022296545}

In [43]:
# DEV
#from keras.optimizers import adam
#opt = SGD(lr=0.01)

from keras.layers import Embedding, Dense, CuDNNLSTM, Dropout
import tensorflow as tf
opti = tf.keras.optimizers.Adam(0.1)

m = Sequential()
# m.add(Embedding(X_train_lstm.shape[1], 512)) # Input dim is X_train_lstm.shape[1], dim is the output dimensionality
m.add(CuDNNLSTM(32))
m.add(Dropout(0.1))
m.add(Dense(1, activation = "softmax"))
m.compile(optimizer = opti, loss =  "categorical_crossentropy", metrics = ["accuracy"])
history = m.fit(X_train, y_train, epochs=5, batch_size=64, verbose=2, class_weight = classWeight)

Epoch 1/5


ValueError: in user code:

    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\engine\training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\DEV\Master Thesis\App\.thesis\lib\site-packages\keras\engine\input_spec.py", line 213, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" '

    ValueError: Exception encountered when calling layer "sequential_10" (type Sequential).
    
    Input 0 of layer "cu_dnnlstm_10" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 2050)
    
    Call arguments received:
      • inputs=<tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000206DECB8040>
      • training=True
      • mask=None


In [13]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, CuDNNLSTM, Dropout

def build_model(input_dim, output_dim, hidden_states, opt):
    model = Sequential()
    model.add(Embedding(input_dim, output_dim)) # Input dim is X_train_lstm.shape[1], dim is the output dimensionality
    model.add(CuDNNLSTM(hidden_states))
    model.add(Dropout(0.1))
    model.add(Dense(3, activation = "softmax"))
    model.compile(opt, "categorical_crossentropy", metrics = ["acc"])
    return model

In [16]:
dim = [512] #[256, 512, 1024]
hidden_states = [16] #[16, 32, 64]
optimi = ["Adam"] #["rmsprop", "SGD", "Adam"]

acc = []
val_acc = []
loss = []
val_loss = []

param_list = []

for d in dim:
    for state in hidden_states:
        for opt in optimi:
            # optimizer = tf.keras.optimizers.Adam(0.001)
            # optimizer.learning_rate.assign(0.01)
            model = build_model(X_train_lstm.shape[1], d, state, opt)
            history = model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=64, validation_split=0.2, verbose=2)
            acc.append(history.history['acc'])
            val_acc.append(history.history['val_acc'])
            loss.append(history.history['loss'])
            val_loss.append(history.history['val_loss'])
            param_list.append("Optimizer: " + opt + " - States: " + str(state) + " - Dimensions:" + str(d))


Epoch 1/5
125/125 - 12s - loss: 0.9262 - acc: 0.6177 - val_loss: 0.8930 - val_acc: 0.6345 - 12s/epoch - 98ms/step
Epoch 2/5
125/125 - 11s - loss: 0.9168 - acc: 0.6216 - val_loss: 0.8927 - val_acc: 0.6345 - 11s/epoch - 92ms/step
Epoch 3/5
125/125 - 11s - loss: 0.9158 - acc: 0.6216 - val_loss: 0.8937 - val_acc: 0.6345 - 11s/epoch - 91ms/step
Epoch 4/5
125/125 - 12s - loss: 0.9145 - acc: 0.6216 - val_loss: 0.8932 - val_acc: 0.6345 - 12s/epoch - 92ms/step
Epoch 5/5
125/125 - 11s - loss: 0.9156 - acc: 0.6216 - val_loss: 0.8928 - val_acc: 0.6345 - 11s/epoch - 91ms/step


In [44]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 512)         1049600   
                                                                 
 cu_dnnlstm_3 (CuDNNLSTM)    (None, 32)                69888     
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,119,521
Trainable params: 1,119,521
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Save Model
from keras.models import load_model
history.model.save('./data/small_lstm_model.h5')

In [55]:
loaded_model = load_model('./data/small_lstm_model.h5')

In [57]:
print(loaded_model.history)

None


In [62]:
print(loaded_model.history)

None


In [42]:
print(history.history) # Loss and Accuracy

{'loss': [0.0, 0.0], 'acc': [0.6518188714981079, 0.6518188714981079]}
{'verbose': 0, 'epochs': 2, 'steps': 4489}


In [64]:
import pickle
with open('./data/train_history', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [65]:
with open('./data/train_history', 'rb') as file_pi:
    h = pickle.load(file_pi)

In [66]:
print(h)

{'loss': [0.0, 0.0], 'acc': [0.6518188714981079, 0.6518188714981079]}


In [32]:
import matplotlib.pyplot as plt

def plot_history(data_list, label_list, title, xlabel='Epochs', ylabel=None):
    ''' Plots a list of vectors.

    Parameters:
        data_list  : list of vectors containing the values to plot
        label_list : list of labels describing the data, one per vector
        title      : title of the plot
        ylabel     : label for the y axis
    '''
    epochs = range(1, len(data_list[0]) + 1)

    for data, label in zip(data_list, label_list):
        plt.plot(epochs, data, label=label)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()

    plt.show()


In [33]:
plot_history(data_list=val_loss,
             label_list=[[""]*len(param_list)], #param_list,
             title='Comparison of different recurrent layer types',
             ylabel='Loss')
plot_history(data_list=val_acc,
             label_list=[[""]*len(param_list)], #param_list,
             title='Comparison of different recurrent layer types',
             ylabel='Validation accuracy')

NameError: name 'val_loss' is not defined

# BERT