In [1]:
import gensim
import os
import time
import random
import re
import pandas as pd
import numpy as np
import multiprocessing
import spacy
import keras
import nltk
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

In [2]:
# nltk.download()

In [2]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
# Set seed to get the same output everytime
def seed_everything(SEED=13):
    np.random.seed(SEED)
    random.seed(SEED)
    tf.random.set_seed(SEED)
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
    os.environ['PYTHONHASHSEED'] = str(SEED)

seed_everything()

In [7]:
cores = multiprocessing.cpu_count()
cores

12

In [8]:
# fix random seed for reproducibility
np.random.seed(7)

In [9]:
os.listdir(os.path.join(os.getcwd(), 'data-labeled'))

['CryptoRobinhooders_chat_data_clean__.xlsx',
 'Satoshi_club_chat_data_clean__.xlsx',
 'combined-super-clean-data.xlsx',
 'telegram_data_8th_NOV.csv',
 'telegram-clean-data.xlsx',
 '.ipynb_checkpoints',
 'desktop.ini']

In [178]:
df = pd.read_excel(r'data-labeled/telegram-clean-data.xlsx')
df['split'] = df['clean'].str.split()

In [86]:
df.tail(2)

Unnamed: 0,raw_data,clean,label,split
9314,Most of users are still not aware with Blockch...,most of users are still not aware with blockch...,other,"[most, of, users, are, still, not, aware, with..."
9315,What are the attractive features in your proje...,what are the attractive features in your proje...,other,"[what, are, the, attractive, features, in, you..."


In [87]:
os.listdir(os.path.join(os.getcwd(), 'model-assets'))

['feature.pkl',
 'model.pkl',
 'word-to-vec-model-1000-epochs.bin',
 'word-to-vec-model-5000-epochs.bin',
 'word-to-vec-model-2000-epochs.bin',
 'desktop.ini']

In [88]:
# Loading the saved gensim.models.word2vec.Word2Vec model

global word2vec_model
word2vec_model = Word2Vec.load(f'model-assets/word-to-vec-model-5000-epochs.bin')

In [89]:
def get_vector_representation_of_a_word(word: str):
    try:
        vector_representation_of_a_word = word2vec_model.wv.get_vector(word).reshape(100, -1)
    except Exception as E:
        print(f'Vector representation not found for "{word}"')
        vector_representation_of_a_word = np.zeros(100).reshape(100,-1)
    finally:
        return vector_representation_of_a_word
        

def get_vector_representation_of_a_sentence(sentence):
    if isinstance(sentence, list):
        sentence = sentence
    else:
        sentence = sentence.split()
        
    vector_representation_of_a_sentence = []
    for word in sentence:
        vector_representation_of_a_sentence.append(get_vector_representation_of_a_word(word))
    
    return vector_representation_of_a_sentence

In [179]:
df['sentence_in_vector_rep'] = df['split'].apply(lambda x: get_vector_representation_of_a_sentence(x))

In [91]:
df.tail(2)

Unnamed: 0,raw_data,clean,label,split,sentence_in_vector_rep
9314,Most of users are still not aware with Blockch...,most of users are still not aware with blockch...,other,"[most, of, users, are, still, not, aware, with...","[[[-5.164155], [2.520065], [-2.1611626], [1.95..."
9315,What are the attractive features in your proje...,what are the attractive features in your proje...,other,"[what, are, the, attractive, features, in, you...","[[[0.14095268], [1.760383], [-2.4575827], [3.4..."


In [180]:
X, y = df['sentence_in_vector_rep'], df['label']
y = pd.get_dummies(y).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [153]:
print([len(x) for x in df['split'] if len(x) > 100])

[224, 111, 165, 162, 101, 233, 171, 162, 171, 129, 121, 120, 115, 123, 115, 105, 162, 161, 160, 115, 192, 128, 115, 132, 197, 108, 124, 101, 187, 103, 116, 440, 220, 198, 183, 171, 183, 119, 127, 131, 171, 171, 143, 165, 152, 491, 126, 104, 150, 150, 126, 123, 169, 194, 194, 193, 108, 115, 119, 131, 108, 124, 124, 157, 154, 154, 157, 115, 135, 108, 192, 134, 123]


In [154]:
pd.get_dummies(df['label']).columns

Index(['negative', 'neutral', 'other', 'positive'], dtype='object')

In [155]:
print([len(x) for x in list(X_train)[:20]])

[15, 40, 4, 21, 7, 25, 7, 6, 6, 3, 24, 4, 10, 7, 4, 13, 5, 44, 5, 36]


In [156]:
len(word2vec_model.wv.index_to_key)

10237

In [181]:
# truncate and pad input sequences

max_vector_length = 150
X_train = sequence.pad_sequences(X_train, maxlen=max_vector_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_vector_length)

In [182]:
X_train = X_train[:, :, :, 0]
X_test = X_test[:, :, :, 0]

In [183]:
X_train.shape, X_test.shape

((6241, 150, 100), (3075, 150, 100))

In [184]:
from keras.layers import Input

In [188]:
from keras import optimizers

In [192]:
from tensorflow.keras import optimizers

In [196]:
seq_lenght = 150
vector_dim = 100  # Embedding Dimension
lstm_hidden_size_1 = 32
lstm_hidden_size_2 = 16

inputs_seq = Input(shape=(seq_lenght, vector_dim))
lstm_out_1 = Bidirectional(LSTM(lstm_hidden_size_1, return_sequences=True))(inputs_seq)
# dense_out_1 = Dense(num_hidden_rep_size_1, activation='softmax')(lstm_out_1)
lstm_out_2 = Bidirectional(LSTM(lstm_hidden_size_2, return_sequences=False))(lstm_out_1)
dense_1 = Dense(4,activation='softmax')(lstm_out_2)

model = Model(inputs_seq, dense_1)
# sgd = optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 150, 100)]        0         
                                                                 
 bidirectional_28 (Bidirecti  (None, 150, 64)          34048     
 onal)                                                           
                                                                 
 bidirectional_29 (Bidirecti  (None, 32)               10368     
 onal)                                                           
                                                                 
 dense_33 (Dense)            (None, 4)                 132       
                                                                 
Total params: 44,548
Trainable params: 44,548
Non-trainable params: 0
_________________________________________________________________
None


In [197]:
batch_size = 32
model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=2, validation_data=[X_test, y_test], workers=cores-1, use_multiprocessing=True)

Epoch 1/10
196/196 - 22s - loss: 1.0523 - accuracy: 0.5328 - val_loss: 0.9166 - val_accuracy: 0.6270 - 22s/epoch - 113ms/step
Epoch 2/10
196/196 - 16s - loss: 0.8158 - accuracy: 0.6722 - val_loss: 0.8610 - val_accuracy: 0.6472 - 16s/epoch - 81ms/step
Epoch 3/10
196/196 - 16s - loss: 0.6815 - accuracy: 0.7436 - val_loss: 0.8668 - val_accuracy: 0.6449 - 16s/epoch - 84ms/step
Epoch 4/10
196/196 - 17s - loss: 0.5671 - accuracy: 0.7920 - val_loss: 0.8868 - val_accuracy: 0.6481 - 17s/epoch - 87ms/step
Epoch 5/10
196/196 - 16s - loss: 0.4689 - accuracy: 0.8351 - val_loss: 0.9792 - val_accuracy: 0.6449 - 16s/epoch - 83ms/step
Epoch 6/10
196/196 - 16s - loss: 0.4112 - accuracy: 0.8521 - val_loss: 0.9977 - val_accuracy: 0.6436 - 16s/epoch - 80ms/step
Epoch 7/10
196/196 - 15s - loss: 0.3561 - accuracy: 0.8744 - val_loss: 1.0461 - val_accuracy: 0.6387 - 15s/epoch - 79ms/step
Epoch 8/10
196/196 - 15s - loss: 0.3229 - accuracy: 0.8862 - val_loss: 1.1320 - val_accuracy: 0.6455 - 15s/epoch - 79ms/step

<keras.callbacks.History at 0x7fec999b7700>

In [165]:
X_test[0].reshape(-1,120,100).shape

(1, 120, 100)

In [169]:
model.predict([X_test[0].reshape(-1,120,100)])

array([[0.0105889 , 0.94503766, 0.0160985 , 0.02827492]], dtype=float32)

In [167]:
model.save('model-assets/model-1-10-epochs.bin')



INFO:tensorflow:Assets written to: model-assets/model-1-10-epochs.bin/assets


INFO:tensorflow:Assets written to: model-assets/model-1-10-epochs.bin/assets


In [168]:
new = keras.models.load_model('model-assets/model-1-10-epochs.bin')