## BERT pre-trained model

In [1]:
!pip install keras-bert
# !wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip -o uncased_L-12_H-768_A-12.zip

[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Load the pre-trained model

In [2]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [25]:
import numpy as np
import keras
from keras_bert.backend import keras
from keras_bert import Tokenizer
from keras_bert import get_base_dict, get_model, gen_batch_inputs, get_custom_objects
from keras_bert import load_trained_model_from_checkpoint

seq_len=384 #512-6 #384-12
batch_size = 6

model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=seq_len, 
    output_layer_num=12,
)

# Loading takes time...

In [26]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 384)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 384)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 384, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 384, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

## Generate data

In [28]:
import codecs
import unicodedata

# clean vocab
token_dict = {} #get_base_dict()

with codecs.open(vocab_path, 'rb', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

decoder_dict = {v:k for k, v in token_dict.items()}        
token_list = list(token_dict.keys())  # Used for selecting a random word
tokenizer = Tokenizer(token_dict)

print(len(token_dict))
print(len(decoder_dict))

30522
30522


In [5]:
token_dict['[PAD]']

0

In [14]:
# a = 'Switching to a more memory efficient optimizer can reduce memory usage, but can also affect the results. We have not experimented with other optimizers for fine-tuning'
# print(tokenizer.tokenize(a)[1:-1]) 

['switching', 'to', 'a', 'more', 'memory', 'efficient', 'opt', '##imi', '##zer', 'can', 'reduce', 'memory', 'usage', ',', 'but', 'can', 'also', 'affect', 'the', 'results', '.', 'we', 'have', 'not', 'experimented', 'with', 'other', 'opt', '##imi', '##zers', 'for', 'fine', '-', 'tuning']


In [29]:
# load dataset # Generate batches
import gzip  

dataset = '../../data/reviews_cleaned.json.gz'

def create_review_pairs():
#     aa=0
    with gzip.open(dataset, 'rb') as f:
        batch_i=0
        review_pairs= []
        
        for line in f:
            data = eval(line)
            text = tokenizer.tokenize(data['review'])[1:-1]
            summary = tokenizer.tokenize(data['summary'])[1:-1]
            review_pairs.append([summary, text])
            batch_i+=1
            
            if batch_i < batch_size:
                continue
            else:                            
                yield {
                  "review_pair": review_pairs
                }

                review_pairs= []
                batch_i=0

#             aa+=1
#             if aa >3:
#                 break


# for pairs in create_review_pairs():
#     print(pairs['review_pair'])
#     print('---'*10)
        

In [30]:
# def batch_inputs(sentence_pairs,
#                     token_dict,
#                     token_list,
#                     seq_len=seq_len,
#                     mask_rate=0.5,
#                     train=True):

#     """Generate a batch of inputs and outputs for training.
#     :param sentence_pairs: A list of pairs containing lists of tokens.
#     :param token_dict: The dictionary containing special tokens.
#     :param token_list: A list containing all tokens.
#     :param seq_len: Length of the sequence.
#     :param mask_rate: The rate of choosing a token for prediction.
#     :return: All the inputs and outputs.
#     """
#     TOKEN_PAD = ''  # Token for padding
#     TOKEN_UNK = '[UNK]'  # Token for unknown words
#     TOKEN_CLS = '[CLS]'  # Token for classification
#     TOKEN_SEP = '[SEP]'  # Token for separation
#     TOKEN_MASK = '[MASK]'  # Token for masking

#     batch_size = len(sentence_pairs)
#     base_dict = get_base_dict()
#     unknown_index = token_dict[TOKEN_UNK]
#     # Generate sentence swapping mapping
#     nsp_outputs = np.zeros((batch_size,))
#     mapping = {}

#     # Generate MLM
#     token_inputs, segment_inputs, masked_inputs = [], [], []
#     mlm_outputs = []
#     for i in range(batch_size):
#         first, second = sentence_pairs[i][0], sentence_pairs[mapping.get(i, i)][1]
#         segment_inputs.append(([0] * (len(first) + 2) + [1] * (seq_len - (len(first) + 2)))[:seq_len])
#         tokens = [TOKEN_CLS] + first + [TOKEN_SEP] + second + [TOKEN_SEP]
#         tokens = tokens[:seq_len]
#         tokens += [TOKEN_PAD] * (seq_len - len(tokens))
        
#         token_input, masked_input, mlm_output = [], [], []
#         has_mask = False
        
#         for token in tokens:
#             mlm_output.append(token_dict.get(token, unknown_index))
            
#             if has_mask: # mask after 'SEP'                
#                 if token == TOKEN_SEP or token == TOKEN_PAD:
#                     masked_input.append(0)    
#                     token_input.append(token_dict.get(token, unknown_index))  
#                     has_mask = False  
#                 elif train:
#                     if np.random.random() < mask_rate:
#                         masked_input.append(1)
#                         token_input.append(token_dict[TOKEN_MASK])
#                     else:
#                         masked_input.append(0)
#                         token_input.append(token_dict[TOKEN_MASK])
#                 else:
#                     masked_input.append(1)
#                     token_input.append(token_dict[TOKEN_MASK])
                    
#             else: 
#                 if token == TOKEN_SEP:
#                     has_mask = True
                    
#                 masked_input.append(0)    
#                 token_input.append(token_dict.get(token, unknown_index))    
                
#         token_inputs.append(token_input)
#         masked_inputs.append(masked_input)
#         mlm_outputs.append(mlm_output)
#     inputs = [np.asarray(x) for x in [token_inputs, segment_inputs, masked_inputs]]
#     outputs = [np.asarray(np.expand_dims(x, axis=-1)) for x in [mlm_outputs, nsp_outputs]]
#     return inputs, outputs

# def _generator(train=True):
#     while True:
#         for data in create_review_pairs():
#             review_pairs = data['review_pair']

#             yield batch_inputs(
#                 review_pairs,
#                 token_dict,
#                 token_list,
#                 seq_len=seq_len,
#                 mask_rate= 0.5, # only used when training
#                 train=train
#             )

False

In [30]:
def batch_inputs(sentence_pairs,
                    token_dict,
                    token_list,
                    seq_len=seq_len,
                    mask_rate=0.5,
                    train=True):

    """Generate a batch of inputs and outputs for training.
    :param sentence_pairs: A list of pairs containing lists of tokens.
    :param token_dict: The dictionary containing special tokens.
    :param token_list: A list containing all tokens.
    :param seq_len: Length of the sequence.
    :param mask_rate: The rate of choosing a token for prediction.
    :return: All the inputs and outputs.
    """
    TOKEN_PAD = ''  # Token for padding
    TOKEN_UNK = '[UNK]'  # Token for unknown words
    TOKEN_CLS = '[CLS]'  # Token for classification
    TOKEN_SEP = '[SEP]'  # Token for separation
    TOKEN_MASK = '[MASK]'  # Token for masking

    batch_size = len(sentence_pairs)
    base_dict = get_base_dict()
    unknown_index = token_dict[TOKEN_UNK]
    # Generate sentence swapping mapping
    nsp_outputs = np.zeros((batch_size,))
    mapping = {}

    # Generate MLM
    token_inputs, segment_inputs, masked_inputs = [], [], []
    mlm_outputs = []
    for i in range(batch_size):
        first, second = sentence_pairs[i][0], sentence_pairs[mapping.get(i, i)][1]
        segment_inputs.append(([0] * (len(first) + 2) + [1] * (seq_len - (len(first) + 2)))[:seq_len])
        tokens = [TOKEN_CLS] + first + [TOKEN_SEP] + second + [TOKEN_SEP]
        tokens = tokens[:seq_len]
        tokens += [TOKEN_PAD] * (seq_len - len(tokens))
        
        token_input, masked_input, mlm_output = [], [], []
        has_mask = True
        
        for token in tokens:
            mlm_output.append(token_dict.get(token, unknown_index))
            
            if has_mask and token != TOKEN_SEP: # mask before 'SEP'                
                if token in get_base_dict() or token == '[PAD]':
                    masked_input.append(0)    
                    token_input.append(token_dict.get(token, unknown_index))  
 
                elif train:
                    if np.random.random() < mask_rate:
                        masked_input.append(1)
                        token_input.append(token_dict[TOKEN_MASK])
                    else:
                        masked_input.append(0)
                        token_input.append(token_dict[TOKEN_MASK])
                else:
                    masked_input.append(1)
                    token_input.append(token_dict[TOKEN_MASK])
                    
            else: 
                has_mask = False
                    
                masked_input.append(0)    
                token_input.append(token_dict.get(token, unknown_index))    
                
        token_inputs.append(token_input)
        masked_inputs.append(masked_input)
        mlm_outputs.append(mlm_output)
    inputs = [np.asarray(x) for x in [token_inputs, segment_inputs, masked_inputs]]
    outputs = [np.asarray(np.expand_dims(x, axis=-1)) for x in [mlm_outputs, nsp_outputs]]
    return inputs, outputs

def _generator(train=True):
    while True:
        for data in create_review_pairs():
            review_pairs = data['review_pair']

            yield batch_inputs(
                review_pairs,
                token_dict,
                token_list,
                seq_len=seq_len,
                mask_rate= 0.3, # only used when training
                train=train
            )

## Train

In [31]:
model.fit_generator(
    generator=_generator(train=True),
    steps_per_epoch=10,
    epochs=2,
    validation_data=_generator(),
    validation_steps=1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    ], #
)

Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[72,384,384] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node Encoder-10-MultiHeadSelfAttention_4/Encoder-10-MultiHeadSelfAttention-Attention/mul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node loss_4/MLM_loss/Mean_3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [16]:
# Save model
import os
import numpy as np

model_path = 'fine_tuned_model/keras_bert_%f.h5' % np.random.random()
model.save(model_path)

In [None]:
# # load
# latest = tf.train.latest_checkpoint(checkpoint_retrain_dir)
# print(latest)
# model_new = create_model()
# model_new.load_weights(latest)

## Summary

In [20]:
# Predict
for inputs, outputs in _generator(train=False):
    predicts = model.predict(inputs)
    outputs = list(map(lambda x: np.squeeze(x, axis=-1), outputs))
    predicts = list(map(lambda x: np.argmax(x, axis=-1), predicts))
    batch_size, seq_len = inputs[-1].shape
    
    for i in range(batch_size):

        in_pp = np.where(inputs[0][i] == 102)[0][1] # index of end 'SEP'
        out_pp = np.where(inputs[-1][i] == 1)[0] # index of masked

        review_text = ' '.join([decoder_dict[inputs[0][i][j]] for j in range(in_pp)])
        gold_summary = ' '.join([decoder_dict[outputs[0][i][j]] for j in out_pp]) # print masked
        pred_summary = ' '.join([decoder_dict[predicts[0][i][j]] for j in out_pp]) 
        
        print('-REVIEW_TEXT-:\n', review_text)
        print('---'*10)
        print('GOLD SUMMARY: ', gold_summary)
        print('PRED SUMMARY: ', pred_summary, '\n')
        print('---'*20)
        
    break

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 3 arrays: [array([[101, 103, 103, ..., 100, 100, 100],
       [101, 103, 103, ..., 100, 100, 100],
       [101, 103, 103, ..., 100, 100, 100],
       [101, 103, 103, ..., 100, 100, 100],
       [101, 103, 103, ...

In [28]:
# Predict
for inputs, outputs in _generator(train=False):
    predicts = model.predict(inputs)
    outputs = list(map(lambda x: np.squeeze(x, axis=-1), outputs))
    predicts = list(map(lambda x: np.argmax(x, axis=-1), predicts))
    batch_size, seq_len = inputs[-1].shape
    
    for i in range(batch_size):

        in_pp = np.where(inputs[0][i] == 102)[0][1] # index of end 'SEP'
        out_pp = np.where(inputs[-1][i] == 1)[0] # index of masked

        review_text = ' '.join([decoder_dict[inputs[0][i][j]] for j in range(in_pp)])
        gold_summary = ' '.join([decoder_dict[outputs[0][i][j]] for j in out_pp]) # print masked
        pred_summary = ' '.join([decoder_dict[predicts[0][i][j]] for j in out_pp]) 
        
        print('-REVIEW_TEXT-:\n', review_text)
        print('---'*10)
        print('GOLD SUMMARY: ', gold_summary)
        print('PRED SUMMARY: ', pred_summary, '\n')
        print('---'*20)
        
    break

-REVIEW_TEXT-:
 [CLS] we have many of the old , old issue but the number had depleted there were not enough books to allow us to use them regularly with the additional supply the books will be used more often they ar ##re a good old stand ##by for gospel singing [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]
------------------------------
GOLD SUMMARY:  i was disappointed that you would only allow me to purchase 4 when your inventory showed that you had 14 available
PRED SUMMARY:  [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

------------------------------------------------------------
-REVIEW_TEXT-:
 [CLS] i or ##igo ##nally did not get the item i ordered when contact ##ing the company they got back with me quickly it turned out the item was out of stock they sent it once it came in stock this was no

In [23]:
# Predict
for inputs, outputs in _generator(train=False):
    predicts = model.predict(inputs)
    outputs = list(map(lambda x: np.squeeze(x, axis=-1), outputs))
    predicts = list(map(lambda x: np.argmax(x, axis=-1), predicts))
    batch_size, seq_len = inputs[-1].shape
    
    for i in range(batch_size):

        in_pp = np.where(inputs[0][i] == 102)[0][1] # index of end 'SEP'
        out_pp = np.where(inputs[-1][i] == 1)[0] # index of masked

        review_text = ' '.join([decoder_dict[inputs[0][i][j]] for j in range(in_pp)])
        gold_summary = ' '.join([decoder_dict[outputs[0][i][j]] for j in out_pp]) # print masked
        pred_summary = ' '.join([decoder_dict[predicts[0][i][j]] for j in out_pp]) 
        
        print('-REVIEW_TEXT-:\n', review_text)
        print('---'*10)
        print('GOLD SUMMARY: ', gold_summary)
        print('PRED SUMMARY: ', pred_summary, '\n')
        print('---'*20)
        
    break

-REVIEW_TEXT-:
 [CLS] we have many of the old , old issue but the number had depleted there were not enough books to allow us to use them regularly with the additional supply the books will be used more often they ar ##re a good old stand ##by for gospel singing [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]
------------------------------
GOLD SUMMARY:  i was disappointed that you would only allow me to purchase 4 when your inventory showed that you had 14 available
PRED SUMMARY:  the , the the the , , , , , , , , , , , , the the the book 

------------------------------------------------------------
-REVIEW_TEXT-:
 [CLS] i or ##igo ##nally did not get the item i ordered when contact ##ing the company they got back with me quickly it turned out the item was out of stock they sent it once it came in stock this was not a problem for me since i ordered the item way in advance for my d

In [388]:
# for pairs in create_review_pairs(6):
#     print(pairs['review_pair'])
#     print('---'*10)

[[['we', 'have', 'many', 'of', 'the', 'old', ',', 'old', 'issue', 'but', 'the', 'number', 'had', 'depleted', 'there', 'were', 'not', 'enough', 'books', 'to', 'allow', 'us', 'to', 'use', 'them', 'regularly', 'with', 'the', 'additional', 'supply', 'the', 'books', 'will', 'be', 'used', 'more', 'often', 'they', 'ar', '##re', 'a', 'good', 'old', 'stand', '##by', 'for', 'gospel', 'singing'], ['i', 'was', 'disappointed', 'that', 'you', 'would', 'only', 'allow', 'me', 'to', 'purchase', '4', 'when', 'your', 'inventory', 'showed', 'that', 'you', 'had', '14', 'available']], [['i', 'or', '##igo', '##nally', 'did', 'not', 'get', 'the', 'item', 'i', 'ordered', 'when', 'contact', '##ing', 'the', 'company', 'they', 'got', 'back', 'with', 'me', 'quickly', 'it', 'turned', 'out', 'the', 'item', 'was', 'out', 'of', 'stock', 'they', 'sent', 'it', 'once', 'it', 'came', 'in', 'stock', 'this', 'was', 'not', 'a', 'problem', 'for', 'me', 'since', 'i', 'ordered', 'the', 'item', 'way', 'in', 'advance', 'for', 'my

In [417]:
# Train model -512

# Create checkpoint callback
checkpoint_retrain = "training_2/cp-{epoch:04d}.ckpt"
checkpoint_retrain_dir = os.path.dirname(checkpoint_retrain)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_retrain, verbose=1, save_weights_only=True,
    # Save weights, every 5-epochs.
    period=5)
# pass callback to training


model.fit_generator(
    generator=_generator(train=True),
    steps_per_epoch=10,
    epochs=2,
    validation_data=_generator(),
    validation_steps=1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    ], #
)
#cp_callback

Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[6,512,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training_4/AdamWarmup/gradients/zeros_34}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node loss_5/add}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [274]:

# sentence_pairs = [
#     [['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
#     [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
#     [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']],
# ]

# def _generator():
#     while True:
#         yield gen_batch_inputs(
#             sentence_pairs,
#             token_dict,
#             token_list,
#             seq_len=384)
# #                 mask_rate= #0.3,#default 0.15
# #                 swap_sentence_rate=0,
# #             ) 

In [164]:
# # Predict
# for inputs, outputs in _generator():
#     predicts = model.predict(inputs)
#     outputs = list(map(lambda x: np.squeeze(x, axis=-1), outputs))
#     predicts = list(map(lambda x: np.argmax(x, axis=-1), predicts))
#     batch_size, seq_len = inputs[-1].shape
#     for i in range(batch_size):
#         for j in range(seq_len):
#             if inputs[-1][i][j]:
#                 print(outputs[0][i][j], predicts[0][i][j])
#                 print(decoder_dict[int(outputs[0][i][j])], decoder_dict[int(predicts[0][i][j])])
#     print(np.allclose(outputs[1], predicts[1]))
# #     break

In [None]:
# Load trained model
model = keras.models.load_model(
    model_path,
    custom_objects=get_custom_objects(),
)

In [3]:
# # Build the model
# model = get_model(
#     token_num=len(token_dict),
#     head_num=5,
#     transformer_num=12,
#     embed_dim=25,
#     feed_forward_dim=100,
#     seq_len=20,
#     pos_num=20,
#     dropout_rate=0.05,
# #     training=False,
# #     trainable=False,
#     output_layer_num=4,
# )
# model.summary()

Using TensorFlow backend.


In [None]:
def evaluate(model, data, model_name='trs', ty='valid', verbose=True):
    hyp_g, ref, r1, r2, rl, r_avg = [],[],[],[],[],[]
    t = Translator(model)
    rouge = Rouge()

    l, loss = [], None
    pbar = tqdm(enumerate(data),total=len(data))
    for j, batch in pbar:
        if ty!="test":
            loss = model.train_one_batch(batch, train=False)
            l.append(loss.item())
            
        if((j<=1 and ty != "test") or ty =="test"): 
            if ty!='test':
                sent_g = model.decoder_greedy(batch) # 1-decoder generation. for testing
            else:
                sent_g = model.eval_one_batch(batch) # 2-decoder generation.
            # sent_b, _ = t.translate_batch(batch) # beam search

            for i, sent in enumerate(sent_g):
                hyp_g.append(sent) 
                ref.append(batch["target_txt"][i])
                rouges = rouge.get_scores(sent,batch["target_txt"][i])[0] # (hyp, ref)

                r1_val,r2_val,rl_val = rouges['rouge-1']["f"], rouges['rouge-2']["f"], rouges['rouge-l']["f"]
                r1.append(r1_val)
                r2.append(r2_val)
                rl.append(rl_val)
                r_avg.append(np.mean([r1_val,r2_val,rl_val]))
        pbar.set_description("EVAL loss:{:.4f} r_avg:{:.2f}".format(np.mean(l),np.mean(r_avg)))
        if(j>1 and ty=="train"): break
    if l: loss = np.mean(l)
    r_avg = np.mean(r_avg)
    r1 = np.mean(r1)
    r2 = np.mean(r2)
    rl = np.mean(rl)

    if(verbose):
        print("\nEVAL loss: {:.4f} r_avg: [{:.2f}] r1: {:.2f} r2: {:.2f} rl: {:.2f}".format(loss, r_avg, r1, r2, rl))
        for hyp, gold in zip(hyp_g, ref):
            print("HYP: ")
            print(hyp)
            print("GOLD: ")
            print(gold)

In [None]:
inputs

In [None]:
# A toy input example
sentence_pairs = [
    [['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
    [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
    [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']],
]