In [1]:
from tensorflow.python.util import deprecation
# Clear TensorFlow's private "print deprecation warnings" flag for this kernel.
# NOTE(review): this only affects the notebook process; the scripts launched
# with `!` further down still print their own deprecation notices (see their logs).
deprecation._PRINT_DEPRECATION_WARNINGS = False


In [2]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from tensorflow.keras.utils import Progbar

import modeling, optimization, tokenization
# from run_pretraining import input_fn_builder, model_fn_builder

# Grab the logger named 'tensorflow' and let INFO-level messages through,
# so the log.info(...) calls further down are shown.
log = logging.getLogger(name='tensorflow')
log.setLevel(level=logging.INFO)




In [6]:
# Tokenizer that keeps runs of word characters; normalize_text uses it to drop
# punctuation. The pattern is a raw string because "\w" is not a valid Python
# escape sequence and triggers an invalid-escape warning in a plain literal.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return `text` lowercased and reduced to space-separated word tokens.

  The input is coerced with str(), lowercased, round-tripped through UTF-8
  (dropping anything that cannot be encoded) and then tokenized with the
  module-level `regex_tokenizer`; the tokens are joined with single spaces.
  """
  lowered = str(text).lower()
  # drop characters that cannot be encoded as UTF-8
  encodable = lowered.encode("utf-8", "ignore").decode()
  # keep only what regex_tokenizer matches, which removes punctuation symbols
  tokens = regex_tokenizer.tokenize(encodable)
  return " ".join(tokens)

def count_lines(filename):
  """Return the number of lines in the text file `filename`.

  The file is decoded as UTF-8 explicitly rather than with the platform's
  default encoding, so the count does not fail on (or disagree with) the
  UTF-8 read used when the same file is processed. An empty file yields 0.
  """
  with open(filename, encoding="utf-8") as fi:
    # Iterating the handle yields one item per line; count them lazily.
    return sum(1 for _ in fi)

# Input walks file and the location of its normalized copy.
RAW_DATA_FPATH = "../node2vec/walks/sample.edgelist.txt"
PRC_DATA_FPATH = "../node2vec/walks/sample.edgelist_processed.txt"

# Size the progress bar from the number of input lines.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

# Normalize the raw file line by line into the processed file.
with open(RAW_DATA_FPATH, encoding="utf-8") as raw_file, \
     open(PRC_DATA_FPATH, "w", encoding="utf-8") as processed_file:
  for raw_line in raw_file:
    processed_file.write(normalize_text(raw_line) + "\n")
    bar.add(1)



In [9]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 3200000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

# Flag string handed to the SentencePiece trainer. NUM_PLACEHOLDERS slots are
# held back from the requested vocabulary size.
SPM_COMMAND = " ".join([
    "--input={}".format(PRC_DATA_FPATH),
    "--model_prefix={}".format(MODEL_PREFIX),
    "--vocab_size={}".format(VOC_SIZE - NUM_PLACEHOLDERS),
    "--input_sentence_size={}".format(SUBSAMPLE_SIZE),
    "--shuffle_input_sentence=true",
    "--bos_id=-1",
    "--eos_id=-1",
    "--hard_vocab_limit=false",
])

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [11]:
def read_sentencepiece_vocab(filepath):
  """Read the pieces listed in a SentencePiece `.vocab` file.

  Each line contributes the text before its first tab. The first entry
  (the <unk> token) is dropped from the returned list.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [entry.split("\t")[0] for entry in vocab_file]
  # skip the first <unk> token
  return pieces[1:]

# Load the pieces SentencePiece learnt, then report their count and a random sample.
vocab_path = "{}.vocab".format(MODEL_PREFIX)
snt_vocab = read_sentencepiece_vocab(vocab_path)
print("Learnt vocab size: {}".format(len(snt_vocab)))
sample_tokens = random.sample(snt_vocab, 10)
print("Sample tokens: {}".format(sample_tokens))

def parse_sentencepiece_token(token):
    """Convert one SentencePiece piece to the notation written to the BERT vocab.

    A piece beginning with the "▁" marker is returned without that marker;
    any other piece is returned with a "##" prefix.
    """
    starts_with_marker = token.startswith("▁")
    return token[1:] if starts_with_marker else "##" + token
    
# Translate every SentencePiece piece into BERT vocab notation.
bert_vocab = list(map(parse_sentencepiece_token, snt_vocab))

# Conventional BERT control symbols
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab

# Pad with [UNUSED_i] placeholders up to VOC_SIZE entries
# (nothing is added if the list is already that long).
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

VOC_FNAME = "name_basics_vocab.txt" #@param {type:"string"}

# Write one token per line. UTF-8 is requested explicitly: the pieces were read
# as UTF-8 and may contain characters the platform default encoding cannot
# represent, which would otherwise fail or produce a mis-encoded vocab file.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")
    

Learnt vocab size: 1245
Sample tokens: ['▁762', '942', '▁862', '45', '499', '260', '▁83', '▁637', '964', '486']
3200000


In [13]:
# Sample input line used to try out the tokenizer in the next cell.
row = "student662 col:student student760 col:name"

In [14]:
# Build the tokenizer from the generated vocab file and show how it splits `row`.
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
row_tokens = bert_tokenizer.tokenize(row)
row_tokens

['student',
 '##66',
 '##2',
 '[UNK]',
 '[UNK]',
 'student',
 'student',
 '##760',
 '[UNK]',
 '[UNK]',
 'name']

In [15]:
# Since the dataset is fairly large, we shard it into smaller files.
# -p keeps this cell re-runnable: plain mkdir errors out once ./shards exists.
!mkdir -p ./shards
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

shard_0000


In [16]:
# Generate pre-training data
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
PROCESSES = 2 #@param {type:"integer"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}


# Shell pipeline: list the shards and run create_pretraining_data.py once per
# shard, PROCESSES at a time. The doubled braces survive .format() as the
# literal "{}" that xargs -I replaces with the shard file name.
XARGS_CMD = " ".join([
    "ls ./shards/ |",
    "xargs -n 1 -P {processes} -I{{}}",
    "python3 create_pretraining_data.py",
    "--input_file=./shards/{{}}",
    "--output_file={out_dir}/{{}}.tfrecord",
    "--vocab_file={vocab}",
    "--do_lower_case={lower}",
    "--max_predictions_per_seq={max_preds}",
    "--max_seq_length={max_len}",
    "--masked_lm_prob={mlm_prob}",
    "--random_seed=34",
    "--dupe_factor=5",
]).format(
    processes=PROCESSES,
    out_dir=PRETRAINING_DIR,
    vocab=VOC_FNAME,
    lower=DO_LOWER_CASE,
    max_preds=MAX_PREDICTIONS,
    max_len=MAX_SEQ_LENGTH,
    mlm_prob=MASKED_LM_PROB,
)

XARGS_CMD

'ls ./shards/ | xargs -n 1 -P 2 -I{} python3 create_pretraining_data.py --input_file=./shards/{} --output_file=pretraining_data/{}.tfrecord --vocab_file=name_basics_vocab.txt --do_lower_case=True --max_predictions_per_seq=20 --max_seq_length=128 --masked_lm_prob=0.15 --random_seed=34 --dupe_factor=5'

In [17]:
# Create the output directory, then run the per-shard data-generation pipeline
# held in XARGS_CMD through the shell.
tf.gfile.MkDir(PRETRAINING_DIR)
!{XARGS_CMD}



W1215 07:30:24.844217 139799166678784 module_wrapper.py:139] From create_pretraining_data.py:437: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1215 07:30:24.844340 139799166678784 module_wrapper.py:139] From create_pretraining_data.py:437: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1215 07:30:24.844450 139799166678784 module_wrapper.py:139] From /home/cc/Embedding/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W1215 07:30:33.753650 139799166678784 module_wrapper.py:139] From create_pretraining_data.py:444: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.


W1215 07:30:33.755300 139799166678784 module_wrapper.py:139] From create_pretraining_data.py:446: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

INFO:tensorflow:*** Reading from input files ***
I1215 07:

I1215 07:30:46.870143 139799166678784 create_pretraining_data.py:149] *** Example ***
INFO:tensorflow:tokens: [CLS] 2 [UNK] [MASK] ##8 iphone [UNK] [MASK] ##7 [MASK] [UNK] [MASK] ##8 [MASK] ##808 [UNK] student student ##24 [MASK] [MASK] student ##200 [UNK] student student ##702 [UNK] 702 student ##702 [UNK] 702 student ##702 [UNUSED_1144200] 702 8 [UNK] 73 ##3 8 [UNK] 72 ##2 student ##72 ##2 [MASK] student [MASK] [MASK] [UNK] student student ##374 [UNK] student student ##134 [UNK] student student ##85 [SEP] [MASK] 84 ##6 macbook [UNK] [MASK] student [MASK] [UNK] student student ##22 ##9 [UNK] 22 ##9 student ##22 ##9 [UNK] name student ##94 [UNK] student [MASK] ##51 ##2 [UNK] name student ##81 [UNK] name [MASK] ##132 [UNK] [MASK] student ##609 [UNUSED_2466620] name [MASK] ##136 [UNK] 136 iphone [UNK] 895 student ##895 [UNK] 895 iphone [UNK] 250 iphone [UNK] 136 student ##136 [UNK] [SEP]
I1215 07:30:46.870229 139799166678784 create_pretraining_data.py:151] tokens: [CLS] 2 [UNK] [MASK] 

INFO:tensorflow:Wrote 10090 total instances
I1215 07:30:48.987468 139799166678784 create_pretraining_data.py:166] Wrote 10090 total instances


In [18]:
# Training config
MODEL_DIR = "bert_model" #@param {type:"string"}
tf.gfile.MkDir(MODEL_DIR)

# Model hyper-parameters, serialized to bert_config.json below.
# "vocab_size" is tied to VOC_SIZE, the length the vocab list was padded to.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": VOC_SIZE
}

with open("{}/bert_config.json".format(MODEL_DIR), "w") as fo:
  json.dump(bert_base_config, fo, indent=2)

# Keep a copy of the vocab next to the config. UTF-8 is requested explicitly so
# tokens with non-ASCII characters are written the same way on every platform
# instead of depending on the default encoding.
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [19]:
# Actual training steps
BUCKET_NAME = "bert_resourses" #@param {type:"string"}
MODEL_DIR = "bert_model" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}

# Input data pipeline config
TRAIN_BATCH_SIZE = 128 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}

# Locations of the vocab and model config inside MODEL_DIR.
VOCAB_FILE = os.path.join(MODEL_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(MODEL_DIR, "bert_config.json")

# Latest checkpoint found in MODEL_DIR; the logged run below reports None.
INIT_CHECKPOINT = tf.train.latest_checkpoint(MODEL_DIR)

# CONFIG_FILE is the same path the config was previously re-joined from.
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
tfrecord_pattern = os.path.join(PRETRAINING_DIR, '*tfrecord')
input_files = tf.gfile.Glob(tfrecord_pattern)

log.info("Using checkpoint: %s", INIT_CHECKPOINT)
log.info("Using %s data shards", len(input_files))

INFO:tensorflow:Using checkpoint: None
INFO:tensorflow:Using 1 data shards


In [None]:
# Launch pre-training on the first shard.
# -p keeps this cell re-runnable: plain mkdir errors out once the directory exists.
# NOTE(review): the flags below are hard-coded and do not use the constants from
# the previous cell (train_batch_size 32 vs TRAIN_BATCH_SIZE 128, num_train_steps
# 10000 vs TRAIN_STEPS 1000000) — confirm which values are intended.
!mkdir -p pretraining_outputs
!python run_pretraining.py \
  --input_file=./pretraining_data/shard_0000.tfrecord \
  --output_dir=./pretraining_outputs/pretraining_output_0000 \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=./bert_model/bert_config.json \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=10000 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5




W1215 07:30:51.821204 139862023464704 module_wrapper.py:139] From run_pretraining.py:407: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1215 07:30:51.821326 139862023464704 module_wrapper.py:139] From run_pretraining.py:407: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1215 07:30:51.821435 139862023464704 module_wrapper.py:139] From /home/cc/Embedding/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W1215 07:30:51.821898 139862023464704 module_wrapper.py:139] From run_pretraining.py:414: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.


W1215 07:30:51.822062 139862023464704 module_wrapper.py:139] From run_pretraining.py:418: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.


W1215 07:30:51.822780 139862023464704 module_wrapper.py:139] From run_pretraining.py:420: The na

Instructions for updating:
Use `tf.data.experimental.parallel_interleave(...)`.
W1215 07:30:52.415695 139862023464704 deprecation.py:323] From run_pretraining.py:368: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.experimental.parallel_interleave(...)`.
Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
W1215 07:30:52.415787 139862023464704 deprecation.py:323] From /home/cc/miniconda3/envs/new-py3-env/lib/python3.7/site-packages/tensorflow_core/contrib/data/python/ops/interleave_ops.py:77: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.interlea


W1215 07:30:53.887334 139862023464704 module_wrapper.py:139] From run_pretraining.py:150: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.

INFO:tensorflow:**** Trainable Variables ****
I1215 07:30:53.888013 139862023464704 run_pretraining.py:167] **** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (3200000, 768)
I1215 07:30:53.888088 139862023464704 run_pretraining.py:173]   name = bert/embeddings/word_embeddings:0, shape = (3200000, 768)
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768)
I1215 07:30:53.888177 139862023464704 run_pretraining.py:173]   name = bert/embeddings/token_type_embeddings:0, shape = (2, 768)
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768)
I1215 07:30:53.888259 139862023464704 run_pretraining.py:173]   name = bert/embeddings/position_embeddings:0, shape = (512, 768)
INFO:tensorflow:  name 

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W1215 07:30:54.023868 139862023464704 deprecation.py:323] From /home/cc/miniconda3/envs/new-py3-env/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1375: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
  num_elements)
INFO:tensorflow:Done calling model_fn.
I1215 07:30:58.395998 139862023464704 estimator.py:1150] Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
I1215 07:30:58.397856 139862023464704 basic_session_run_hooks.py:541] Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
I1215 07:31:00.724864 139862023464704 monitored_session.py:240] Graph was finalized.
2020-12-15 07:31:00.725304: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary