## 1. Loading and Initializing

### 1.1 Define pre-processing functions

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

BERT_DIR = "/home/aufish/Downloads/bert"

# try with TF2 SavedModel
# The online downloading method does not work, use pre-downloaded module
# bert_module = hub.Module("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1")

bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    return tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

def convert_sentence_to_features(sentence, tokenizer, max_seq_len=50):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=50):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids

import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask, vocab_size, segment_id_vals=None):
    ''' 
    input_ids: the ids of words in the sentences
    input_mask: initial mask (1 if there is a word; 0 for padding)
    returns
    input_mask: replace one bit of 1 with 0, meaning that the word will be masked
    mask_word_ids: the id of words that are masked
    pure_ids: ids in number instead of one-hot (to generate weights per masked word)
    segment_id_vals: mark the masked word with segment id 1
    '''
    batch_size = len(input_ids)
    
    new_input_mask = copy.deepcopy(input_mask)
    mask_word_ids = np.zeros((batch_size, vocab_size))
    pure_ids = []
    segment_encodings = []
    for i in range(batch_size):
        total_word = sum(input_mask[i])
        mask_word = random.randint(0, total_word-1)
        
        pure_ids.append(input_ids[i][mask_word])
        assert new_input_mask[i][mask_word] == 1
        new_input_mask[i][mask_word] = 0
        mask_word_ids[i][input_ids[i][mask_word]] = 1.0
        
        # Make the masked word segment id 1
        assert segment_id_vals[i][mask_word] == 0
        segment_id_vals[i][mask_word] = 1
                
    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.float32), pure_ids, segment_id_vals

### 1.2 Classifier model

In [3]:
type(bert_module)

tensorflow_hub.keras_layer.KerasLayer

In [53]:
class SentimentBert(tf.keras.Model):
    def __init__(self, class_num, bert=bert_module, dropout=0.1):
        super(SentimentBert, self).__init__()
        self.bert = bert
        self.drop = tf.keras.layers.Dropout(rate=dropout)
        self.dense= tf.keras.layers.Dense(
            class_num,
            activation=tf.keras.activations.sigmoid,
            kernel_initializer='glorot_uniform',
            name='sentiment_classification')
        
    def call(self, inputs):
        # When passed in, all tensors are stacked in one, split it into a list
        inputs = tf.unstack(tf.cast(inputs, tf.dtypes.int32), axis=1)
        pooled, sequential = self.bert(inputs)
        x = self.drop(pooled)
        return self.dense(x)

### 1.2.1 Sanity test on creating and compiling the model

In [41]:
model = SentimentBert(1)

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(opt, loss=tf.keras.losses.BinaryCrossentropy())

## 2. Prepare data

In [5]:
# Get the sentiment score of each phrase and write out to a file
DATASET_DIR = "/home/aufish/Downloads/stanfordSentimentTreebank"

score_file = open(DATASET_DIR + "/sentiment_labels.txt", "r")
score_dict = dict()

for line in score_file.readlines():
    parts = line.split("|")
    if parts[0] == "phrase ids":
        # skip first header line
        continue
        
    phrase_id, score = parts[0], float(parts[1])
    score_dict[phrase_id] = score
    
score_file.close()

phrase_file = open(DATASET_DIR + "/dictionary.txt", "r")
phrase_score_file = open("./phrase_score.txt", "w")

for line in phrase_file.readlines():
    parts = line.split("|")
    
    score = score_dict[parts[1].strip()]
    
    phrase_score_file.write("{}|{}\n".format(parts[0], str(score)) )
    
phrase_file.close()
phrase_score_file.close()

print("Dataset size: {}".format(len(score_dict)))

Dataset size: 239232


## 3. Load into Dataset

In [28]:
import numpy as np
def parse_line(line):
    parts = line.split(b"|")
    phrase, score = parts[0].decode(), float(parts[1].decode())
    
    input_ids, input_mask, segment_ids = convert_sentence_to_features(phrase, tokenizer, max_seq_len=20)
        
    return ([input_ids, input_mask, segment_ids], score)

def create_dataset(filename = "./phrase_score.txt", data_size=239232, batch_size=10):
    dataset = tf.data.TextLineDataset([filename]) 

    dataset = dataset.map(lambda x: tf.numpy_function(parse_line, [x], [tf.int64, tf.double]))

    # dataset = dataset.shuffle(data_size, reshuffle_each_iteration=False)
    
    dataset = dataset.batch(batch_size)
    
    return dataset

### 3.2 Test what is loaded

In [29]:
dataset = create_dataset()

In [42]:
for (bert_input, target) in dataset.take(5):
    print(bert_input)
    print(target)
    bert_input = tf.cast(bert_input, tf.dtypes.int32)
    print(bert_input)
    model(bert_input)

tf.Tensor(
[[[  101   106   102     0     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]
  [    1     1     1     0     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]
  [    0     0     0     0     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]]

 [[  101   106   112   102     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]
  [    1     1     1     1     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]
  [    0     0     0     0     0     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]]

 [[  101   106   112   112   102     0     0     0     0     0     0
       0     0     0     0     0     0     0     0     0]
  [    1     1     1     1     1     0     0     0     0     0     0
       0     0     0     0 

## 4. Training

In [54]:
# Fit function has bug
model = SentimentBert(1)
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(opt, loss=tf.keras.losses.BinaryCrossentropy())
model.fit(x=dataset, epochs=1)

ValueError: in converted code:

    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/engine/training_v2.py:677 map_fn
        batch_size=None)
    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/engine/training.py:2410 _standardize_tensors
        exception_prefix='input')
    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/engine/training_utils.py:529 standardize_input_data
        data = [standardize_single_array(x) for x in data]
    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/engine/training_utils.py:529 <listcomp>
        data = [standardize_single_array(x) for x in data]
    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/keras/engine/training_utils.py:451 standardize_single_array
        if (x.shape is not None and len(x.shape) == 1 and
    /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/framework/tensor_shape.py:822 __len__
        raise ValueError("Cannot take the length of shape with unknown rank.")

    ValueError: Cannot take the length of shape with unknown rank.


In [63]:
model = SentimentBert(1)
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
bce_loss= tf.keras.losses.MeanSquaredError()
loss_metric = tf.keras.metrics.Mean()
epochs = 1

for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    total_loss = 0
    for step, (bert_input, target) in enumerate(dataset):
        with tf.GradientTape() as tape:
          output = model(bert_input)
          # Compute reconstruction loss
          loss = bce_loss(target, output)
          loss += sum(model.losses)  # Add KLD regularization loss

        grads = tape.gradient(loss, model.trainable_weights)
        opt.apply_gradients(zip(grads, model.trainable_weights))

        loss_metric(loss)

        if step % 100 == 0:
          print('step %s: mean loss = %s' % (step, loss_metric.result()))

Start of epoch 0
step 0: mean loss = tf.Tensor(0.1469778, shape=(), dtype=float32)
step 100: mean loss = tf.Tensor(0.034767933, shape=(), dtype=float32)
step 200: mean loss = tf.Tensor(0.035866126, shape=(), dtype=float32)
step 300: mean loss = tf.Tensor(0.03909982, shape=(), dtype=float32)
step 400: mean loss = tf.Tensor(0.039109845, shape=(), dtype=float32)
step 500: mean loss = tf.Tensor(0.038421936, shape=(), dtype=float32)
step 600: mean loss = tf.Tensor(0.036577944, shape=(), dtype=float32)
step 700: mean loss = tf.Tensor(0.03744269, shape=(), dtype=float32)
step 800: mean loss = tf.Tensor(0.03675551, shape=(), dtype=float32)
step 900: mean loss = tf.Tensor(0.037878055, shape=(), dtype=float32)
step 1000: mean loss = tf.Tensor(0.03760385, shape=(), dtype=float32)
step 1100: mean loss = tf.Tensor(0.038826995, shape=(), dtype=float32)
step 1200: mean loss = tf.Tensor(0.038181778, shape=(), dtype=float32)
step 1300: mean loss = tf.Tensor(0.038149845, shape=(), dtype=float32)
step 14

step 11600: mean loss = tf.Tensor(0.03262478, shape=(), dtype=float32)
step 11700: mean loss = tf.Tensor(0.03244583, shape=(), dtype=float32)
step 11800: mean loss = tf.Tensor(0.032266498, shape=(), dtype=float32)
step 11900: mean loss = tf.Tensor(0.03209871, shape=(), dtype=float32)
step 12000: mean loss = tf.Tensor(0.031954087, shape=(), dtype=float32)
step 12100: mean loss = tf.Tensor(0.031828947, shape=(), dtype=float32)
step 12200: mean loss = tf.Tensor(0.031693105, shape=(), dtype=float32)
step 12300: mean loss = tf.Tensor(0.031556983, shape=(), dtype=float32)
step 12400: mean loss = tf.Tensor(0.03148336, shape=(), dtype=float32)
step 12500: mean loss = tf.Tensor(0.031433456, shape=(), dtype=float32)
step 12600: mean loss = tf.Tensor(0.03135305, shape=(), dtype=float32)
step 12700: mean loss = tf.Tensor(0.031209001, shape=(), dtype=float32)
step 12800: mean loss = tf.Tensor(0.031047657, shape=(), dtype=float32)
step 12900: mean loss = tf.Tensor(0.030930385, shape=(), dtype=float3

step 23100: mean loss = tf.Tensor(0.026289888, shape=(), dtype=float32)
step 23200: mean loss = tf.Tensor(0.026270686, shape=(), dtype=float32)
step 23300: mean loss = tf.Tensor(0.026252037, shape=(), dtype=float32)
step 23400: mean loss = tf.Tensor(0.026294768, shape=(), dtype=float32)
step 23500: mean loss = tf.Tensor(0.02629201, shape=(), dtype=float32)
step 23600: mean loss = tf.Tensor(0.026303582, shape=(), dtype=float32)
step 23700: mean loss = tf.Tensor(0.026354393, shape=(), dtype=float32)
step 23800: mean loss = tf.Tensor(0.026372401, shape=(), dtype=float32)
step 23900: mean loss = tf.Tensor(0.026368355, shape=(), dtype=float32)


In [None]:
model.save_weights("./bert_sentiment_analysis_v1")

## 5. Prediction

In [56]:
def predict(model, sentence):
    # Give a sentence and return the sentiment score of the sentence
    input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len=20)
    tensor_input = tf.stack([tf.constant(input_ids), tf.constant(input_mask), tf.constant(segment_ids)])
    tensor_input = tf.reshape(tensor_input, [1, 3, 20])
    
    return model(tensor_input)

In [69]:
predict(model, "shit")

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.5123083]], dtype=float32)>