In [1]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import emoji

In [2]:
def clean_data(df):
    """Normalise the texts in ``df['raw_data']`` into a lower-cased ``clean`` column.

    The steps run in this order:

    1. cast every value to ``str`` and turn newlines into spaces;
    2. drop ``@mentions`` and collapse runs of spaces;
    3. strip a set of punctuation characters and collapse ``..``/``...`` to ``.``;
    4. replace URLs, dollar amounts and percentages with the placeholders
       ``<<<URL>>>``, ``<<<AMOUNT>>>`` and ``<<<PERCENTAGE>>>``;
    5. replace emoji with ``" ___name___ "`` tokens via ``emoji.demojize``;
    6. strip remaining punctuation and all digits;
    7. count whitespace-separated tokens, then lower-case the text
       (placeholders and emoji names are lower-cased too).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``raw_data`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``clean`` and ``word_count`` columns set.
    """
    # (pattern, replacement) pairs applied in order before emoji handling.
    # All patterns are raw strings so that sequences such as ``\w`` reach the
    # regex engine untouched and do not trigger invalid-escape warnings.
    substitutions = [
        (r'(@\w+)', ''),
        (' +', ' '),
        (r"""[#*+\\\(\)_!:\-\[\]<>']""", ''),
        (r"""(\.{2,})""", '.'),
        (
            r"""[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?""",
            '<<<URL>>>',
        ),
        (r"""\$(\d+\.?,?)+""", '<<<AMOUNT>>>'),
        (r"""(\d+%)""", '<<<PERCENTAGE>>>'),
    ]

    # str(x) guarantees string values from here on, so no further casts are needed.
    cleaned = df['raw_data'].apply(lambda x: str(x).replace('\n', ' '))
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pat=pattern, repl=replacement, regex=True)

    cleaned = cleaned.apply(lambda x: emoji.demojize(x, delimiters=(" ___", "___ ")))

    # Inside this character class ``\+-\.`` is a range covering ``+ , - .``;
    # ``<``, ``>`` and ``_`` are not in the class, so placeholders and emoji
    # tokens survive this step.
    cleaned = cleaned.str.replace(r'[\',\?\(\)!@#$\+-\.\/\^*~|\{\}=\d/]+', '', regex=True)

    df['clean'] = cleaned.str.lower()
    df['word_count'] = cleaned.apply(lambda x: len(str(x).split()))

    return df

In [3]:
def get_vector_representation_of_a_word(word: str):
    """Return the word2vec embedding of ``word`` as a column vector.

    Reads the notebook-level ``word2vec_model``, which must be loaded before
    this function is called.

    Parameters
    ----------
    word : str
        Token to look up in ``word2vec_model.wv``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vector_size, 1)``. A word missing from the
        vocabulary yields an all-zero vector and a message is printed.

    Raises
    ------
    NameError
        If ``word2vec_model`` has not been defined yet. This is no longer
        hidden behind a zero vector.
    """
    keyed_vectors = word2vec_model.wv
    try:
        vector = keyed_vectors.get_vector(word)
    except KeyError:
        # Only an out-of-vocabulary lookup is a best-effort case; any other
        # failure (missing model, wrong type, interrupt) should surface.
        print(f'Vector representation not found for "{word}"')
        vector = np.zeros(keyed_vectors.vector_size)

    # A column vector keeps the trailing singleton axis that downstream
    # padding and slicing expect.
    return vector.reshape(-1, 1)
        

def get_vector_representation_of_a_sentence(sentence):
    """Convert a sentence into a list of per-word vectors.

    Parameters
    ----------
    sentence : list or str
        A list of tokens is used as-is; anything else is split on whitespace
        with ``.split()``.

    Returns
    -------
    list
        One ``get_vector_representation_of_a_word`` result per token, in order.
    """
    tokens = sentence if isinstance(sentence, list) else sentence.split()
    return [get_vector_representation_of_a_word(token) for token in tokens]

In [4]:
def get_prediction_labels(predictions):
    """Translate rows of class scores into label names.

    Parameters
    ----------
    predictions : iterable
        Each item is a sequence of scores, one per class, in the order
        ``negative, neutral, other, positive``.

    Returns
    -------
    list of str
        For every row, the label at the position of its largest score.
    """
    class_names = ['negative', 'neutral', 'other', 'positive']
    return [class_names[np.argmax(scores)] for scores in predictions]

In [5]:
def predict_with_lstm(raw_data):
    """Classify a list of raw texts with the loaded LSTM model.

    Relies on the notebook-level ``model`` and ``word2vec_model`` being loaded
    before the call.

    Parameters
    ----------
    raw_data : list
        Texts to classify.

    Returns
    -------
    pandas.DataFrame
        One row per input text with the columns ``raw_data``, ``clean``,
        ``word_count``, ``split``, ``vec_rep`` and ``prediction``.

    Raises
    ------
    TypeError
        If ``raw_data`` is not a list.
    """
    if not isinstance(raw_data, list):
        # TypeError rather than BaseException, so ordinary
        # ``except Exception`` handlers can see the failure.
        raise TypeError("Wrong Input: please provide list of texts.")

    df = clean_data(pd.DataFrame(data=raw_data, columns=['raw_data']))
    df['split'] = df['clean'].str.split()
    df['vec_rep'] = df['split'].apply(get_vector_representation_of_a_sentence)

    # NOTE(review): pad_sequences defaults to dtype='int32', which truncates
    # the float embeddings to integers. Left unchanged because the saved model
    # may have been trained on inputs prepared the same way. Confirm against
    # the training code before switching to dtype='float32'.
    sentence_padded = sequence.pad_sequences(df['vec_rep'], maxlen=150)
    # Each word vector is a column of shape (dim, 1); drop that last axis.
    sentence_padded = sentence_padded[:, :, :, 0]

    predictions = model.predict(sentence_padded)
    df['prediction'] = get_prediction_labels(predictions=predictions)

    return df

### Usage

In [6]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence
from gensim.models import Word2Vec

In [7]:
# Trained Keras model; predict_with_lstm reads it as a notebook-level name.
# (A `global` statement at the top level of a cell has no effect, so it is omitted.)
model = load_model('model-assets/model-3-10-epochs.bin')

2021-11-18 17:25:45.394309: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Word2Vec embeddings; get_vector_representation_of_a_word reads this
# notebook-level name. (A top-level `global` statement has no effect.)
word2vec_model = Word2Vec.load('model-assets/word-to-vec-model-5000-epochs.bin')

In [9]:
# Example inputs: despite the singular name, this is a list of texts,
# which is the only input type predict_with_lstm accepts.
sentence = [
    "this coin is good", 
    "What is this coin?", 
    "shiba performed poor recently"
]

# Run the full clean -> vectorise -> predict pipeline; requires `model` and
# `word2vec_model` to have been loaded by the cells above.
df = predict_with_lstm(sentence)
# Bare last expression so the notebook renders the resulting frame.
df

Unnamed: 0,raw_data,clean,word_count,split,vec_rep,prediction
0,this coin is good,this coin is good,4,"[this, coin, is, good]","[[[-1.7857865], [5.0768943], [4.134419], [3.38...",positive
1,What is this coin?,what is this coin,4,"[what, is, this, coin]","[[[0.14095268], [1.760383], [-2.4575827], [3.4...",neutral
2,shiba performed poor recently,shiba performed poor recently,4,"[shiba, performed, poor, recently]","[[[-2.9768016], [-5.9770007], [5.1817427], [-0...",negative
