# Procedure
a. Upload the three files from the mail (model.h5, model.w2v, tokenizer.pkl) via the Data tab (right panel) and name the upload "input-models" when adding it to Kaggle; the code below reads the files from ../input/input-models/.

b. After that, run all code blocks.

c. The predict() function evaluates the sentiment of each text you pass to it.


In [None]:
import time
import pickle
import numpy as np
import keras
import tensorflow as tf

# Show which Keras and TensorFlow versions this session is running.
print(f'using keras version... {keras.__version__} & tf version... {tf.__version__}')

In [None]:
%%time
# 1: loading tokenizer
from keras.preprocessing.text import Tokenizer
TOKENIZER_MODEL = "../input/input-models/tokenizer.pkl"
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# a tokenizer.pkl that comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(TOKENIZER_MODEL, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)  # todo: might want to fix protocol

# One more than the number of known words; presumably because Keras tokenizers
# number words from 1, leaving index 0 unused -- TODO confirm.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
# 2: loading w2v model
from gensim.models import KeyedVectors
from keras.layers import Embedding
# Memory-map the saved vectors read-only instead of reading them fully into RAM.
w2v_model = KeyedVectors.load("../input/input-models/model.w2v", mmap='r')
# Width of each row of the embedding matrix; every row receives one word
# vector, so this has to equal the length of the loaded vectors.
W2V_SIZE = 300

# Row i holds the vector of the word whose tokenizer index is i; words that
# are missing from the word2vec vocabulary keep an all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[i] = w2v_model.wv[word]

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    trainable=False,
)

In [None]:
# 3: loading keras model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras import utils

# Rebuild the network layer by layer so the stored weights can be loaded into
# it: embedding -> dropout -> LSTM -> single sigmoid output unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Restore the saved weights first, then compile with the binary-classification
# loss and accuracy metric.
model.load_weights('../input/input-models/model.h5')
model.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

In [None]:
# Print Keras' text summary of the model.
model.summary()

In [None]:
import re
# Matches one markup tag: '<', at least one character that is not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag span deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags


def process_text(sen):
    """Clean a raw sentence before it is handed to the tokenizer.

    The steps run in this order:
      1. strip ``<...>`` tags,
      2. turn every character that is not an ASCII letter into a space,
      3. collapse a single letter surrounded by whitespace into one space,
      4. squeeze every run of whitespace into one space.

    Leading/trailing spaces are not trimmed, and a single letter at the very
    start or end of the text is left in place because it has whitespace on
    one side only.
    """
    # (pattern, replacement) pairs applied one after another, in order.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )

    sentence = remove_tags(sen)
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence

In [None]:
# SENTIMENT
from keras.preprocessing.sequence import pad_sequences

# Labels returned by decode_sentiment.
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (upper bound of the NEGATIVE range, lower bound of the POSITIVE range).
SENTIMENT_THRESHOLDS = (0.4, 0.7)
# Length every token sequence is padded/truncated to before prediction.
SEQUENCE_LENGTH = 300


def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    With ``include_neutral`` true, a score at or below the first threshold is
    NEGATIVE, a score at or above the second threshold is POSITIVE, and
    anything in between is NEUTRAL. Otherwise the score is split at 0.5:
    below is NEGATIVE, everything else is POSITIVE.
    """
    if not include_neutral:
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

def predict(text, include_neutral=True):
    """Classify the sentiment of a single piece of text.

    Args:
        text: Raw input string; it is cleaned with ``process_text`` before
            being tokenized.
        include_neutral: Forwarded to ``decode_sentiment``; when true the
            NEUTRAL label is possible.

    Returns:
        A dict with the keys ``"label"`` (the decoded sentiment), ``"score"``
        (the model output as a plain Python float) and ``"elapsed_time"``
        (seconds spent inside this call).
    """
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments.
    start_at = time.perf_counter()
    cleaned = process_text(text)
    # Tokenize the text and pad it to the fixed sequence length.
    x_test = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=SEQUENCE_LENGTH)
    # The model ends in a single output unit, so predict() yields one row
    # holding one value. Index down to that scalar: calling float() on a
    # 1-element array is deprecated since NumPy 1.25.
    score = float(model.predict(x_test)[0][0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {
        "label": label,
        "score": score,
        "elapsed_time": time.perf_counter() - start_at,
    }

In [None]:
# Demo call: score a short sentence, with the NEUTRAL label allowed.
predict(text="I love the music", include_neutral=True)

In [None]:
# Demo call: score a second short sentence, with the NEUTRAL label allowed.
predict(text="I hate the rain", include_neutral=True)

In [None]:
# Demo call: score a news-style headline, with the NEUTRAL label allowed.
predict(text="Asco 2020 sees multiple myeloma responses deepen", include_neutral=True)