In [1]:
import gensim
import os
import time
import random
import re
import pandas as pd
import numpy as np
import multiprocessing
import spacy
import keras
import nltk
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

In [2]:
# nltk.download()

In [2]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
# Set seed to get the same output everytime
def seed_everything(SEED=13):
    np.random.seed(SEED)
    random.seed(SEED)
    tf.random.set_seed(SEED)
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
    os.environ['PYTHONHASHSEED'] = str(SEED)

seed_everything()

In [7]:
cores = multiprocessing.cpu_count()
cores

12

In [8]:
# fix random seed for reproducibility
np.random.seed(7)

In [9]:
os.listdir(os.path.join(os.getcwd(), 'data-labeled'))

['CryptoRobinhooders_chat_data_clean__.xlsx',
 'Satoshi_club_chat_data_clean__.xlsx',
 'combined-super-clean-data.xlsx',
 'telegram_data_8th_NOV.csv',
 'telegram-clean-data.xlsx',
 '.ipynb_checkpoints',
 'desktop.ini']

In [10]:
df = pd.read_excel(r'data-labeled/telegram-clean-data.xlsx')
df['split'] = df['clean'].str.split()

In [11]:
df.tail(2)

Unnamed: 0,raw_data,clean,label,split
9314,Most of users are still not aware with Blockch...,most of users are still not aware with blockch...,other,"[most, of, users, are, still, not, aware, with..."
9315,What are the attractive features in your proje...,what are the attractive features in your proje...,other,"[what, are, the, attractive, features, in, you..."


In [12]:
os.listdir(os.path.join(os.getcwd(), 'model-assets'))

['feature.pkl',
 'model.pkl',
 'word-to-vec-model-1000-epochs.bin',
 'word-to-vec-model-5000-epochs.bin',
 'word-to-vec-model-2000-epochs.bin',
 'desktop.ini']

In [13]:
# Loading the saved gensim.models.word2vec.Word2Vec model

global word2vec_model
word2vec_model = Word2Vec.load(f'model-assets/word-to-vec-model-5000-epochs.bin')

In [14]:
def get_vector_representation_of_a_word(word: str):
    try:
        vector_representation_of_a_word = word2vec_model.wv.get_vector(word).reshape(100, -1)
    except Exception as E:
        print(f'Vector representation not found for "{word}"')
        vector_representation_of_a_word = np.zeros(100).reshape(100,-1)
    finally:
        return vector_representation_of_a_word
        

def get_vector_representation_of_a_sentence(sentence):
    if isinstance(sentence, list):
        sentence = sentence
    else:
        sentence = sentence.split()
        
    vector_representation_of_a_sentence = []
    for word in sentence:
        vector_representation_of_a_sentence.append(get_vector_representation_of_a_word(word))
    
    return vector_representation_of_a_sentence

In [15]:
df['sentence_in_vector_rep'] = df['split'].apply(lambda x: get_vector_representation_of_a_sentence(x))

In [16]:
df.tail(2)

Unnamed: 0,raw_data,clean,label,split,sentence_in_vector_rep
9314,Most of users are still not aware with Blockch...,most of users are still not aware with blockch...,other,"[most, of, users, are, still, not, aware, with...","[[[-5.164155], [2.520065], [-2.1611626], [1.95..."
9315,What are the attractive features in your proje...,what are the attractive features in your proje...,other,"[what, are, the, attractive, features, in, you...","[[[0.14095268], [1.760383], [-2.4575827], [3.4..."


In [17]:
X, y = df['sentence_in_vector_rep'], df['label']
y = pd.get_dummies(y).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
print([len(x) for x in df['split'] if len(x) > 100])

[224, 111, 165, 162, 101, 233, 171, 162, 171, 129, 121, 120, 115, 123, 115, 105, 162, 161, 160, 115, 192, 128, 115, 132, 197, 108, 124, 101, 187, 103, 116, 440, 220, 198, 183, 171, 183, 119, 127, 131, 171, 171, 143, 165, 152, 491, 126, 104, 150, 150, 126, 123, 169, 194, 194, 193, 108, 115, 119, 131, 108, 124, 124, 157, 154, 154, 157, 115, 135, 108, 192, 134, 123]


In [19]:
pd.get_dummies(df['label']).columns

Index(['negative', 'neutral', 'other', 'positive'], dtype='object')

In [23]:
print([len(x) for x in list(X_train)[:20]])

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]


In [21]:
len(word2vec_model.wv.index_to_key)

10237

In [22]:
# truncate and pad input sequences

max_vector_length = 100
X_train = sequence.pad_sequences(X_train, maxlen=max_vector_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_vector_length)

In [24]:
embed_dim = 128
lstm_out = 196
max_features = 1300

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1]))
model.add(LSTM(lstm_out))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

2021-11-17 10:36:51.409885: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          166400    
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 421,594
Trainable params: 421,594
Non-trainable params: 0
_________________________________________________________________
None


In [51]:
batch_size = 32
model.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose = 2, validation_split=0.2, workers=cores-1, use_multiprocessing=True)

Epoch 1/5


ValueError: in user code:

    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/engine/training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/miniconda3/envs/tele/lib/python3.9/site-packages/keras/engine/input_spec.py", line 213, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" '

    ValueError: Exception encountered when calling layer "sequential_1" (type Sequential).
    
    Input 0 of layer "simple_rnn" is incompatible with the layer: expected ndim=3, found ndim=5. Full shape received: (32, 100, 100, 1, 50)
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(32, 100, 100, 1), dtype=int32)
      • training=True
      • mask=None
