## 1. Loading and Initializing

### 1.1 Define pre-processing functions

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

BERT_DIR = "/home/aufish/Downloads/bert"

# try with TF2 SavedModel
# The online downloading method does not work, use pre-downloaded module
# bert_module = hub.Module("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1")

bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
[ 101,  102, 1106, 1103,  119,  117,  112,  170, 1122,  146]# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    return tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

def convert_sentence_to_features(sentence, tokenizer, max_seq_len=50):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=50):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids

import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask, vocab_size, segment_id_vals=None):
    ''' 
    input_ids: the ids of words in the sentences
    input_mask: initial mask (1 if there is a word; 0 for padding)
    returns
    input_mask: replace one bit of 1 with 0, meaning that the word will be masked
    mask_word_ids: the id of words that are masked
    pure_ids: ids in number instead of one-hot (to generate weights per masked word)
    segment_id_vals: mark the masked word with segment id 1
    '''
    batch_size = len(input_ids)
    
    new_input_mask = copy.deepcopy(input_mask)
    mask_word_ids = np.zeros((batch_size, vocab_size))
    pure_ids = []
    segment_encodings = []
    for i in range(batch_size):
        total_word = sum(input_mask[i])
        mask_word = random.randint(0, total_word-1)
        
        pure_ids.append(input_ids[i][mask_word])
        assert new_input_mask[i][mask_word] == 1
        new_input_mask[i][mask_word] = 0
        mask_word_ids[i][input_ids[i][mask_word]] = 1.0
        
        # Make the masked word segment id 1
        assert segment_id_vals[i][mask_word] == 0
        segment_id_vals[i][mask_word] = 1
                
    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.float32), pure_ids, segment_id_vals

### 1.2 Classifier model

In [3]:
type(bert_module)

tensorflow_hub.keras_layer.KerasLayer

In [10]:
class SentimentBert(tf.keras.Model):
    def __init__(self, class_num, bert=bert_module, dropout=0.1):
        super(SentimentBert, self).__init__()
        self.bert = bert
        self.drop1 = tf.keras.layers.Dropout(rate=dropout, trainable=True)
        self.dense1 = tf.keras.layers.Dense(
            256,
            activation=tf.keras.activations.relu,
            kernel_initializer='glorot_uniform',
            name='sentiment_classification_hidden',
            trainable=True)
        
        self.drop2 = tf.keras.layers.Dropout(rate=dropout, trainable=True)
        self.dense2 = tf.keras.layers.Dense(
            1,
            activation=None,
            kernel_initializer='glorot_uniform',
            name='sentiment_classification',
            trainable=True)
        
    def call(self, inputs):
        # When passed in, all tensors are stacked in one, split it into a list
        inputs = tf.unstack(tf.cast(inputs, tf.dtypes.int32), axis=1)
        print(inputs)
        pooled, sequential = self.bert(inputs)
        x = self.drop1(pooled)
        x = self.dense1(x)
        x = self.drop2(x)
        return self.dense2(x)

### 1.2.1 Sanity test on creating and compiling the model

In [11]:
model = SentimentBert(1)

In [None]:
for weight in model.trainable_weights:
    print(weight.name)

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(opt, loss=tf.keras.losses.BinaryCrossentropy())

## 2. Prepare data

In [6]:
# Get the sentiment score of each phrase and write out to a file
DATASET_DIR = "/home/aufish/Downloads/stanfordSentimentTreebank"

score_file = open(DATASET_DIR + "/sentiment_labels.txt", "r")
score_dict = dict()

for line in score_file.readlines():
    parts = line.split("|")
    if parts[0] == "phrase ids":
        # skip first header line
        continue
        
    phrase_id, score = parts[0], float(parts[1])
    score_dict[phrase_id] = score
    
score_file.close()

phrase_file = open(DATASET_DIR + "/dictionary.txt", "r")
phrase_score_file = open("./phrase_score.txt", "w")

for line in phrase_file.readlines():
    parts = line.split("|")
    
    score = score_dict[parts[1].strip()]
    
    phrase_score_file.write("{}|{}\n".format(parts[0], str(score)) )
    
phrase_file.close()
phrase_score_file.close()

print("Dataset size: {}".format(len(score_dict)))

Dataset size: 239232


## 3. Load into Dataset

In [7]:
import numpy as np
def parse_line(line):
    parts = line.split(b"|")
    phrase, score = parts[0].decode(), float(parts[1].decode())
    
    input_ids, input_mask, segment_ids = convert_sentence_to_features(phrase, tokenizer, max_seq_len=20)
        
    return ([input_ids, input_mask, segment_ids], score)

def create_dataset(filename = "./phrase_score.txt", data_size=239232, batch_size=10):
    dataset = tf.data.TextLineDataset([filename]) 

    dataset = dataset.map(lambda x: tf.numpy_function(parse_line, [x], [tf.int64, tf.double]))

    # dataset = dataset.shuffle(data_size, reshuffle_each_iteration=False)
    
    dataset = dataset.batch(batch_size)
    
    return dataset

### 3.2 Test what is loaded

In [8]:
dataset = create_dataset(batch_size=20)

In [12]:
for (bert_input, target) in dataset.take(1):
#     print(bert_input)
#     print(target)
    bert_input = tf.cast(bert_input, tf.dtypes.int32)
#     print(bert_input)
    model(bert_input)

[<tf.Tensor: shape=(20, 20), dtype=int32, numpy=
array([[  101,   106,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,   106,   112,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,   106,   112,   112,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,   106,  2586,  2225,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,   106,   139, 11071, 13789,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,   106,   139, 11071, 13789,   106,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0

## 4. Training

In [1]:
# Fit function has bug
model = SentimentBert(1)
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(opt, loss=tf.keras.losses.BinaryCrossentropy())
model.fit(x=dataset, epochs=10)

NameError: name 'SentimentBert' is not defined

In [9]:
# After these trainings, the results are pretty usable
model = SentimentBert(1)
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
bce_loss= tf.keras.losses.MeanSquaredError()
loss_metric = tf.keras.metrics.Mean()
epochs = 10

dataset = create_dataset(batch_size=20)

In [10]:
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    total_loss = 0
    for step, (bert_input, target) in enumerate(dataset):
        with tf.GradientTape() as tape:
          output = model(bert_input)
          # Compute reconstruction loss
          loss = bce_loss(target, output)
          loss += sum(model.losses)  # Add KLD regularization loss

        grads = tape.gradient(loss, model.trainable_weights)
        opt.apply_gradients(zip(grads, model.trainable_weights))

        loss_metric(loss)

        if step % 1000 == 0:
          print('step %s: mean loss = %s' % (step, loss_metric.result()))

Start of epoch 0
step 0: mean loss = tf.Tensor(0.15566286, shape=(), dtype=float32)
step 1000: mean loss = tf.Tensor(0.022228226, shape=(), dtype=float32)
step 2000: mean loss = tf.Tensor(0.020681065, shape=(), dtype=float32)
step 3000: mean loss = tf.Tensor(0.020217834, shape=(), dtype=float32)
step 4000: mean loss = tf.Tensor(0.018260634, shape=(), dtype=float32)
step 5000: mean loss = tf.Tensor(0.01684368, shape=(), dtype=float32)
step 6000: mean loss = tf.Tensor(0.015902212, shape=(), dtype=float32)
step 7000: mean loss = tf.Tensor(0.015287982, shape=(), dtype=float32)
step 8000: mean loss = tf.Tensor(0.01478511, shape=(), dtype=float32)
step 9000: mean loss = tf.Tensor(0.014164187, shape=(), dtype=float32)
step 10000: mean loss = tf.Tensor(0.01379407, shape=(), dtype=float32)
step 11000: mean loss = tf.Tensor(0.013345432, shape=(), dtype=float32)
Start of epoch 1
step 0: mean loss = tf.Tensor(0.013089064, shape=(), dtype=float32)
step 1000: mean loss = tf.Tensor(0.0129173165, shap

step 5000: mean loss = tf.Tensor(0.0060133226, shape=(), dtype=float32)
step 6000: mean loss = tf.Tensor(0.0059797843, shape=(), dtype=float32)
step 7000: mean loss = tf.Tensor(0.0059487573, shape=(), dtype=float32)
step 8000: mean loss = tf.Tensor(0.0059175547, shape=(), dtype=float32)
step 9000: mean loss = tf.Tensor(0.0058818385, shape=(), dtype=float32)
step 10000: mean loss = tf.Tensor(0.0058523146, shape=(), dtype=float32)
step 11000: mean loss = tf.Tensor(0.0058174212, shape=(), dtype=float32)


In [31]:
model.save_weights("./bert_sentiment_analysis_v1")

## 5. Prediction

In [37]:
def single_sentence_predict(model, sentence):
    # Give a sentence and return the sentiment score of the sentence
    input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len=20)
    tensor_input = tf.stack([tf.constant(input_ids), tf.constant(input_mask), tf.constant(segment_ids)])
    tensor_input = tf.reshape(tensor_input, [1, 3, 20])
    
    return tf.keras.backend.get_value(
        model(tensor_input) )[0, 0]

In [46]:
sentence = "it is a good movie, i have to admit"
score = single_sentence_predict(model, sentence)
print("Input: " + sentence)
print("Positivity score: {}".format(score))

Input: it is a good movie, i have to admit
Positivity score: 0.7689685821533203


In [47]:
sentence = "it's pure trash"
score = single_sentence_predict(model, sentence)
print("Input: " + sentence)
print("Positivity score: {}".format(score))

Input: it's pure trash
Positivity score: 0.05330285429954529


In [48]:
sentence = "although boring at first, it is still a genius work"
score = single_sentence_predict(model, sentence)
print("Input: " + sentence)
print("Positivity score: {}".format(score))

Input: although boring at first, it is still a genius work
Positivity score: 0.6611613035202026


In [49]:
sentence = "no one can deny how brilliant it is"
score = single_sentence_predict(model, sentence)
print("Input: " + sentence)
print("Positivity score: {}".format(score))

Input: no one can deny how brilliant it is
Positivity score: 0.9334973096847534
