In [1]:
import tensorflow as tf
import tensorflow_text
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import re
import numpy as np
import tensorflow_hub as hub

In [2]:
# Locations of the saved artifacts loaded by the cells below
# (relative paths, resolved against the current working directory).
tokenizer_path = "tokenizer.json"      # parsed with json.load to rebuild the Keras tokenizer
model_path = "model.h5"                # passed to tf.keras.models.load_model
bert_model_path = "sentiment140_bert"  # passed to tf.saved_model.load

In [3]:
# Rebuild the Keras tokenizer from its JSON export.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
with open(tokenizer_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
# The file handle is no longer needed once the payload has been parsed, so the
# tokenizer is reconstructed outside the `with` block.
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)


In [4]:
# Load the Keras model stored at `model_path`; predict() calls model.predict on it.
model = tf.keras.models.load_model(model_path)

In [5]:
# Load the SavedModel stored at `bert_model_path`; a later cell calls it
# directly on a tensor of preprocessed strings.
bert_model = tf.saved_model.load(bert_model_path)

In [6]:
def predict(texts, max_length=80, trunc_type='post', pad_type='post'):
    """Run the Keras model on raw text.

    The texts are converted to integer sequences with the notebook-level
    ``tokenizer``, padded/truncated to ``max_length`` and passed to
    ``model.predict``.

    Args:
        texts: An iterable of strings, or a single string (treated as a
            one-element batch).
        max_length: Length every sequence is padded/truncated to.
            NOTE(review): presumably this has to match the sequence length
            the model was trained with -- confirm before changing it.
        trunc_type: ``truncating`` argument forwarded to ``pad_sequences``.
        pad_type: ``padding`` argument forwarded to ``pad_sequences``.

    Returns:
        Whatever ``model.predict`` returns for the padded batch (in the
        recorded run: a float32 array with one row per input text).
    """
    # A bare string would otherwise be iterated character by character by
    # texts_to_sequences, yielding one "text" per character.
    if isinstance(texts, str):
        texts = [texts]
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        truncating=trunc_type,
        padding=pad_type,
    )
    return model.predict(padded)

In [7]:
def bert_preprocess(text):
    """Clean a single raw tweet before it is handed to the BERT model.

    Processing steps, in order:
      1. ``bytes`` input is decoded as UTF-8 (a leading BOM is dropped and
         undecodable bytes are replaced rather than raising).
      2. ``@mentions`` and ``http(s)://`` URLs are removed.
      3. Every character that is not an ASCII letter becomes a space.
      4. The result is lower-cased and leading/trailing whitespace is trimmed.

    Runs of spaces *inside* the text are left as they are; only the ends are
    stripped.

    Args:
        text: The raw tweet as ``str`` or ``bytes``.

    Returns:
        The cleaned, lower-cased string.
    """
    # A Python 3 str has no .decode(), so decoding can only ever apply to
    # bytes. Do it up front (before the str-pattern regexes, which reject
    # bytes) instead of hiding an AttributeError behind a bare `except`.
    if isinstance(text, bytes):
        text = text.decode("utf-8-sig", errors="replace")

    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    stripped = re.sub(combined_pat, '', text)

    # Replacing non-letters with spaces can leave runs of spaces inside the
    # text; only the outer whitespace is removed below.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    return letters_only.lower().strip()


# Element-wise version of bert_preprocess for arrays of strings.
v_preprocess_bert = np.vectorize(bert_preprocess)

In [14]:
# Sample sentences used to exercise both models in the cells below.
tests = [
    "Just went back to school today, so happy!",
    "Ugh, just went back to school",
    "Why is work so dull",
    "This thing costs too much",
]

In [15]:
# Score the sample sentences with the Keras model; the bare expression makes
# the notebook display the returned array.
predict(tests)

array([[0.80126375],
       [0.02017036],
       [0.00281411],
       [0.10058132]], dtype=float32)

In [16]:
# Clean every sample sentence element-wise before feeding it to the BERT SavedModel.
x = v_preprocess_bert(np.array(tests))
# tf.sigmoid is applied to the model output -- presumably raw logits;
# TODO confirm against the exported model's signature.
tf.sigmoid(bert_model(tf.constant(x)))

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[0.9529567 ],
       [0.02763721],
       [0.01168621],
       [0.10532796]], dtype=float32)>