In [20]:
from tensorflow.python.util import deprecation
# Flip the private module-level switch on tensorflow.python.util.deprecation.
# NOTE(review): presumably this silences TensorFlow's deprecation warnings;
# it is a private attribute, so confirm it still exists after a TF upgrade.
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [25]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from tensorflow.keras.utils import Progbar

import modeling, optimization, tokenization
# from run_pretraining import input_fn_builder, model_fn_builder

# configure logging: take the logger registered under the name 'tensorflow'
# and set its threshold to INFO. `log` is reused by later cells.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

In [2]:
# Tokenizer that extracts runs of word characters. The pattern is a raw
# string: "\w" in a normal string literal is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return a lowercased, punctuation-free version of `text`.

  The input is coerced with str(), lowercased, round-tripped through UTF-8
  (characters that cannot be encoded are dropped), split with the
  module-level `regex_tokenizer`, and the resulting tokens are re-joined
  with single spaces.
  """
  lowered = str(text).lower()
  # drop anything that cannot be represented in UTF-8
  utf8_safe = lowered.encode("utf-8", "ignore").decode()
  # keep only the tokens matched by the tokenizer (removes punctuation symbols)
  words = regex_tokenizer.tokenize(utf8_safe)
  return " ".join(words)

def count_lines(filename, encoding="utf-8"):
  """Return the number of lines in the text file `filename`.

  Args:
    filename: path of the file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so
      the count is taken the same way the processing loop reads the data,
      instead of depending on the platform's default encoding.

  Returns:
    The line count as an int (0 for an empty file).
  """
  with open(filename, encoding=encoding) as fi:
    # Stream the file; only the running count is kept in memory.
    return sum(1 for _ in fi)

RAW_DATA_FPATH = "../data/imdb/name.basics.tsv"
PRC_DATA_FPATH = "../data/name_basics_processed.txt"

# Count the input lines first so the progress bar knows its target.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

# Stream the raw file line by line, writing one normalized line per input
# line and ticking the progress bar after each write.
with open(RAW_DATA_FPATH, encoding="utf-8") as raw_file, \
     open(PRC_DATA_FPATH, "w", encoding="utf-8") as processed_file:
  for raw_line in raw_file:
    processed_file.write(normalize_text(raw_line) + "\n")
    bar.add(1)



In [3]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 32000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

# The trainer is asked for NUM_PLACEHOLDERS fewer pieces than VOC_SIZE.
SPM_VOCAB_SIZE = VOC_SIZE - NUM_PLACEHOLDERS

# Flag string handed to the SentencePiece trainer; the four "{}" slots are
# input file, model prefix, vocab size and input sentence size, in that order.
SPM_COMMAND_TEMPLATE = ('--input={} --model_prefix={} '
                        '--vocab_size={} --input_sentence_size={} '
                        '--shuffle_input_sentence=true '
                        '--bos_id=-1 --eos_id=-1')

SPM_COMMAND = SPM_COMMAND_TEMPLATE.format(
    PRC_DATA_FPATH, MODEL_PREFIX, SPM_VOCAB_SIZE, SUBSAMPLE_SIZE)

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [4]:
# Example record fed to the BERT tokenizer in a later cell as a sanity check.
# NOTE(review): looks like one line of the normalized name.basics data — confirm.
row = "nm0000002 lauren bacall n n actress soundtrack tt0038355 tt0071877 tt0117057 tt0037382"

In [27]:
def read_sentencepiece_vocab(filepath):
  """Read the pieces from a tab-separated vocabulary file.

  Each line contributes the text before its first tab. The first entry of
  the file (the <unk> token) is left out of the result.

  Args:
    filepath: path of the UTF-8 encoded vocabulary file.

  Returns:
    A list of piece strings in file order, without the first entry.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [entry.split("\t")[0] for entry in vocab_file]
  # skip the first <unk> token
  return pieces[1:]

# Load the "<MODEL_PREFIX>.vocab" file and show its size plus a random sample.
spm_vocab_path = "{}.vocab".format(MODEL_PREFIX)
snt_vocab = read_sentencepiece_vocab(spm_vocab_path)
print("Learnt vocab size: {}".format(len(snt_vocab)))
sample_tokens = random.sample(snt_vocab, 10)
print("Sample tokens: {}".format(sample_tokens))

def parse_sentencepiece_token(token):
    """Rewrite one SentencePiece piece in the "##" continuation convention.

    A piece that starts with the "▁" marker is returned without that first
    character; any other piece is returned with "##" prepended.
    """
    return token[1:] if token.startswith("▁") else "##" + token
    
# Convert every SentencePiece piece to its "##"-style spelling.
bert_vocab = [parse_sentencepiece_token(piece) for piece in snt_vocab]

# Conventional BERT control symbols
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab

# Pad with placeholder tokens until the vocabulary holds VOC_SIZE entries.
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

VOC_FNAME = "name_basics_vocab.txt" #@param {type:"string"}

# Write one token per line. The encoding is pinned to UTF-8 because tokens
# can contain non-ASCII characters and the platform default encoding is not
# guaranteed to be UTF-8.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")
    

Learnt vocab size: 31743
Sample tokens: ['▁stephan', '0635', '▁rowena', '▁giulio', '▁kitty', '1353056', '4002', '▁sheena', '0286486', '▁fass']
32000


In [28]:
# Build a tokenizer from the vocabulary file written above and tokenize the
# sample `row`; the token list is shown as the cell output.
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
bert_tokenizer.tokenize(row)

['nm',
 '##0000',
 '##002',
 'lauren',
 'baca',
 '##ll',
 'n',
 'n',
 'act',
 '##ress',
 'soundtrack',
 'tt',
 '##0038',
 '##355',
 'tt',
 '##0071',
 '##877',
 'tt',
 '##0117',
 '##057',
 'tt',
 '##0037',
 '##382']

In [8]:
# Since the dataset is fairly large, we shard them into smaller files
!mkdir ./shards
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

shard_0000  shard_0007	shard_0014  shard_0021	shard_0028  shard_0035
shard_0001  shard_0008	shard_0015  shard_0022	shard_0029  shard_0036
shard_0002  shard_0009	shard_0016  shard_0023	shard_0030  shard_0037
shard_0003  shard_0010	shard_0017  shard_0024	shard_0031  shard_0038
shard_0004  shard_0011	shard_0018  shard_0025	shard_0032
shard_0005  shard_0012	shard_0019  shard_0026	shard_0033
shard_0006  shard_0013	shard_0020  shard_0027	shard_0034


In [35]:
# Generate pre-training data
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
PROCESSES = 2 #@param {type:"integer"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}

# str.format() fills every "{}" in the template, so the literal "{}" that
# xargs needs as its replace-string is passed back in as an argument.
XARGS_PLACEHOLDER = '{}'

xargs_template = ("ls ./shards/ | "
                  "xargs -n 1 -P {} -I{} "
                  "python3 create_pretraining_data.py "
                  "--input_file=./shards/{} "
                  "--output_file={}/{}.tfrecord "
                  "--vocab_file={} "
                  "--do_lower_case={} "
                  "--max_predictions_per_seq={} "
                  "--max_seq_length={} "
                  "--masked_lm_prob={} "
                  "--random_seed=34 "
                  "--dupe_factor=5")

# Positional arguments, in the order the "{}" slots appear in the template.
xargs_format_args = (
    PROCESSES, XARGS_PLACEHOLDER,        # xargs -P <n> -I{}
    XARGS_PLACEHOLDER,                   # --input_file=./shards/{}
    PRETRAINING_DIR, XARGS_PLACEHOLDER,  # --output_file=<dir>/{}.tfrecord
    VOC_FNAME, DO_LOWER_CASE,
    MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB,
)

XARGS_CMD = xargs_template.format(*xargs_format_args)

XARGS_CMD

'ls ./shards/ | xargs -n 1 -P 2 -I{} python3 create_pretraining_data.py --input_file=./shards/{} --output_file=pretraining_data/{}.tfrecord --vocab_file=name_basics_vocab.txt --do_lower_case=True --max_predictions_per_seq=20 --max_seq_length=128 --masked_lm_prob=0.15 --random_seed=34 --dupe_factor=5'

In [None]:
# Create the directory that the --output_file paths in XARGS_CMD point into.
tf.gfile.MkDir(PRETRAINING_DIR)
# Hand the assembled command line to the shell (IPython expands $XARGS_CMD).
!$XARGS_CMD

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
  from ._conv import register_converters as _register_converters
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards/shard_0001
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards/shard_0000
INFO:tensorflow:*** Writing to output files ***
INFO:tensorflow:  pretraining_

INFO:tensorflow:*** Writing to output files ***
INFO:tensorflow:  pretraining_data/shard_0000.tfrecord
INFO:tensorflow:*** Example ***
INFO:tensorflow:tokens: [CLS] nm ##0200 ##323 zsolt dan ##ko n n assistant [UNK] director miscellaneo ##us [MASK] tt [MASK] ##035 tt ##0803 [MASK] tt ##0117 ##809 tt ##0251 ##636 nm ##0200 ##324 amy dan ##les n n act ##ress producer [MASK] ##0215 ##760 tt ##0108 ##949 [MASK] ##0128 ##419 tt ##2103188 nm ##0200325 j mark dan ##ley n n actor tt ##0100 ##003 [SEP] nm [MASK] ##326 [MASK] dan ##ley n n act [MASK] tt ##0095 ##069 nm ##0200 ##327 kevin dan ##lo ##e n n [MASK] tt ##0217 [MASK] tt ##0359 ##287 [MASK] ##0484 ##459 nm [MASK] ##328 [MASK] [MASK] ##lon n [MASK] ##1294 tt ##0098 ##328 nm ##0200 ##329 mary j dan [MASK] [MASK] n editor tt ##5958 ##184 tt ##0371 ##436 tt ##0334 ##875 tt ##024 [SEP]
INFO:tensorflow:input_ids: 2 7 3798 773 16332 202 200 5 5 35 1 20 15 14 4 6 4 139 6 11379 4 6 1255 1430 6 9750 580 7 3798 372 1314 202 1905 5 5 13 12 16 

INFO:tensorflow:Wrote 250704 total instances
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards/shard_0002
INFO:tensorflow:Wrote 311850 total instances
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
INFO:tensorflow:*** Reading from input files ***
INFO:tensorflow:  ./shards

INFO:tensorflow:*** Writing to output files ***
INFO:tensorflow:  pretraining_data/shard_0002.tfrecord
INFO:tensorflow:*** Example ***
INFO:tensorflow:tokens: [CLS] nm ##0795 ##831 jamie shum [MASK] n n actor tt ##0094 ##138 nm ##0795 ##832 mccall shum ##an n n producer [MASK] director tt ##0814 ##022 tt ##1091 [MASK] tt ##1251 ##757 tt ##0282 ##209 [SEP] nm ##0795 ##834 larry shum ##an n n producer manager tt ##2706 ##482 tt ##0113 ##553 [MASK] ##3322 ##364 tt ##2226 ##342 nm ##0795 [MASK] mark orr [MASK] shum ##an n n music [UNK] [MASK] ##part ##ment tt ##0097 ##216 tt ##0094416 tt ##0103 ##640 tt ##0097 ##405 nm ##0795 ##836 michael [MASK] [MASK] n n actor soundtrack tt ##0120 ##888 tt ##2089 ##617 tt [MASK] tt ##4287320 nm ##0795 ##837 ##5336 shum ##an n n soundtrack composer actor tt ##0116 [MASK] tt ##0372 ##532 ##enka ##3110958 tt ##1931 [MASK] [SEP]
INFO:tensorflow:input_ids: 2 7 3036 739 1525 18514 4 5 5 9 6 774 227 7 3036 741 11903 18514 115 5 5 16 4 20 6 2189 94 6 912 4 

In [33]:
# Training config
MODEL_DIR = "bert_model" #@param {type:"string"}
tf.gfile.MkDir(MODEL_DIR)

# Model hyper-parameters; vocab_size follows the VOC_SIZE used to build the
# vocabulary file.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": VOC_SIZE
}

# Save the config next to the model files.
with open(os.path.join(MODEL_DIR, "bert_config.json"), "w") as fo:
  json.dump(bert_base_config, fo, indent=2)

# Keep a copy of the vocabulary in the model directory. The encoding is
# pinned to UTF-8 because tokens can contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open(os.path.join(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [None]:
# Actual Training steps
BUCKET_NAME = "bert_resourses" #@param {type:"string"}
MODEL_DIR = "bert_model" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}

# Input data pipeline config
TRAIN_BATCH_SIZE = 128 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}

# Files that live inside the model directory
VOCAB_FILE = os.path.join(MODEL_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(MODEL_DIR, "bert_config.json")

# Most recent checkpoint reported for MODEL_DIR
INIT_CHECKPOINT = tf.train.latest_checkpoint(MODEL_DIR)

# Load the model config from the path computed above
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

# Collect the generated record files from the pre-training data directory
tfrecord_pattern = os.path.join(PRETRAINING_DIR,'*tfrecord')
input_files = tf.gfile.Glob(tfrecord_pattern)

checkpoint_message = "Using checkpoint: {}".format(INIT_CHECKPOINT)
log.info(checkpoint_message)
shards_message = "Using {} data shards".format(len(input_files))
log.info(shards_message)

In [None]:
!mkdir pretraining_outputs
!python run_pretraining.py \
  --input_file=./pretraining_data/shard_0000.tfrecord \
  --output_dir=./pretraining_outputs/pretraining_output_0000 \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=./bert_model/bert_config.json \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=10000 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5

mkdir: cannot create directory ‘pretraining_outputs’: File exists
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
INFO:tensorflow:*** Input Files ***
INFO:tensorflow:  ./pretraining_data/shard_0000.tfrecord
INFO:tensorflow:Using config: {'_model_dir': './pretraining_outputs/pretraining_output_0000', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_de

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
2020-04-10 20:20:17.472160: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
INFO:tensorflow:Restoring parameters from ./pretraining_outputs/pretraining_output_0000/model.ckpt-1020
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1020 into ./pretraining_outputs/pretraining_output_0000/model.ckpt.
INFO:tensorflow:global_step/sec: 0.179644
INFO:tensorflow:examples/sec: 5.74861
INFO:tensorflow:global_step/sec: 0.18064
INFO:tensorflow:examples/sec: 5.78047
INFO:tensorflow:global_step/sec: 0.180996
INFO:tensorflow:examples/sec: 5.79189
INFO:tensorflow:global_step/sec: 0.179858
INFO:tensorflow:examples/sec: 5.75547
INFO:tensorflow:global_step/sec: 0.178847
INFO:tensorflow:examples/sec: 5.72311
INFO:tensorfl