# Configure settings

Note: If using a TPU from Google Cloud (not the Colab TPU), make sure to run this notebook on a VM with access to all GCP APIs

Note: Run multiple copies of this notebook in multiple VMs to train multiple models in parallel

In [1]:
#@markdown ## General Config
# Storage/run identifiers and general switches shared by every later cell.
USE_GCP_TPU = True #@param {type:"boolean"}
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
DO_LOWER_CASE = False #@param {type:"boolean"}
PROCESSES = 2 #@param {type:"integer"}
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
MODEL_DIR = "bert_model_modified_large" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data_1024_modified_large" #@param {type:"string"}
LOGGING_DIR = "bert_model_modified_large_loss_spam" #@param {type:"string"}
#@markdown ######for miscellaneous temporary storage
TEMP_DIR = "modified_large_temp" #@param {type:"string"}
RUN_NAME = "human_pretraining" #@param {type:"string"}
#import time
#RUN_NAME = RUN_NAME+"-"+time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
VOC_FNAME = "vocab.txt" #@param {type:"string"}

#@markdown ## Input data pipeline config
# MAX_PREDICTIONS and MASKED_LM_PROB are defined once, here (they used to be
# repeated in the General Config section with the same values).
DATA_COPIES = 20 #@param {type:"integer"}
TRAIN_BATCH_SIZE =  32 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

#@markdown ### Training procedure config
EVAL_BATCH_SIZE = 64
INIT_LEARNING_RATE =  2e-5#@param {type:"number"}
END_LEARNING_RATE = 1e-9
SAVE_CHECKPOINTS_STEPS =  1000#@param {type:"integer"}
NUM_TPU_CORES = 8
PLANNED_TOTAL_SEQUENCES_SEEN =  1e9 #@param {type:"number"}
#@markdown ###### (PLANNED_TOTAL_STEPS will override PLANNED_TOTAL_SEQUENCES_SEEN; if you wish to use PLANNED_TOTAL_SEQUENCES_SEEN, set PLANNED_TOTAL_STEPS to -1)
PLANNED_TOTAL_STEPS =  2e6#@param {type:"number"}
# -1 is the sentinel for "derive the step count from the planned number of sequences".
PLANNED_TOTAL_STEPS = PLANNED_TOTAL_SEQUENCES_SEEN/TRAIN_BATCH_SIZE if PLANNED_TOTAL_STEPS==-1 else PLANNED_TOTAL_STEPS
# Per-step learning-rate change that takes INIT_LEARNING_RATE to END_LEARNING_RATE
# over PLANNED_TOTAL_STEPS (negative whenever the end rate is below the initial rate).
DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/PLANNED_TOTAL_STEPS
#@markdown ## Model Config:
#@markdown ######Possible values for MODEL_TO_USE: orig, withConv:
MODEL_TO_USE = "withConv" #@param {type:"string"}
HIDDEN_SIZE =   768#@param {type:"integer"}
HIDDEN_LAYERS =  12#@param {type:"integer"}
# The head count has its own setting so that changing HIDDEN_LAYERS no longer
# silently changes the number of attention heads. The default (12) matches the
# value the config previously received via HIDDEN_LAYERS.
NUM_ATTENTION_HEADS =  12#@param {type:"integer"}

CUSTOM_MODEL = None ##change this to a model_fn style function if you wish to use a custom model

# Model hyperparameters; later cells add "vocab_size" and write this dict to config.json.
bert_config = {
  "hidden_size": HIDDEN_SIZE, 
  "hidden_act": "gelu", 
  "initializer_range": 0.02, 
  "hidden_dropout_prob": 0.1, 
  "num_attention_heads": NUM_ATTENTION_HEADS, 
  "type_vocab_size": 2, 
  "max_position_embeddings": MAX_SEQ_LENGTH, 
  "num_hidden_layers": HIDDEN_LAYERS, 
  "intermediate_size": 3072, 
  "attention_probs_dropout_prob": 0.1
}

import os
import json
# Vocabulary, one token per line: five special tokens followed by 22 single-letter tokens.
vocab = \
'''[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
L
S
B
J
E
A
P
T
G
V
K
R
D
Q
I
N
F
H
Y
C
M
W'''


# If using a local runtime for a GCP TPU, the following also needs to be set up

In [2]:
GCE_PROJECT_NAME = "genome-project-319100" #@param {type:"string"}
TPU_ZONE = "us-central1-f" #@param {type:"string"}
TPU_NAME = "mutformer-tpu" #@param {type:"string"}

!gcloud alpha compute tpus create $TPU_NAME --accelerator-type=tpu-v2 --version=1.15.5 --zone=$TPU_ZONE ##create new TPU

!gsutil iam ch serviceAccount:`gcloud alpha compute tpus describe $TPU_NAME | grep serviceAccount | cut -d' ' -f2`:admin gs://theodore_jiang && echo 'Successfully set permissions!' ##give TPU access to GCS

!sudo -H pip3 install tensorflow==1.14 ##pip install some stuff
!sudo -H pip3 install --upgrade google-api-python-client
!sudo -H pip3 install --upgrade oauth2client

[1;31mERROR:[0m (gcloud.alpha.compute.tpus.create) ALREADY_EXISTS: Resource 'projects/genome-project-319100/locations/us-central1-f/nodes/mutformer-tpu' already exists
- '@type': type.googleapis.com/google.rpc.ResourceInfo
  resourceName: projects/genome-project-319100/locations/us-central1-f/nodes/mutformer-tpu
No changes made to gs://theodore_jiang/
Successfully set permissions!
Requirement already up-to-date: google-api-python-client in /usr/local/lib/python3.7/dist-packages (2.19.1)
Requirement already up-to-date: oauth2client in /usr/local/lib/python3.7/dist-packages (4.1.3)


#Clone the repository

In [3]:
if USE_GCP_TPU:
  !sudo apt-get -y install git-all
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://tianqitheodorejiang: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@github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-all is already the newest version (1:2.20.1-2+deb10u3).
The following packages were automatically installed and are no longer required:
  libargon2-1 libcryptsetup12 systemd
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 3 not upgraded.
Cloning into 'code/mutformer'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 79 (delta 31), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (79/79), done.


#Imports/Authenticate for GCP

In [4]:
# On a GCP VM (local runtime) install the pinned TensorFlow 1.14 and the Google API
# client libraries before anything imports them.
if USE_GCP_TPU:
  !sudo -H pip3 install tensorflow==1.14
  !sudo -H pip3 install --upgrade google-api-python-client
  !sudo -H pip3 install --upgrade oauth2client
import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil

# Print the local wall-clock time at which this cell ran.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

from glob import glob
# When not on a GCP VM, the notebook runs in Colab and the user is authenticated there.
if not USE_GCP_TPU:
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

# Replace any existing local "mutformer" package with a fresh copy of the model
# code from the cloned repository.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Move the "mutformer" entry to the end of sys.path (drop one earlier occurrence, if any).
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_pretraining import input_fn_builder, model_fn_builder

# Map the MODEL_TO_USE setting onto the model class handed to model_fn_builder later.
AVAILABLE_MODELS = {"orig": BertModel, "withConv": BertModelModified}
if MODEL_TO_USE not in AVAILABLE_MODELS:
  raise Exception("The model specified was not one of the available models: [\"orig\", \"withConv\"].")
MODEL = AVAILABLE_MODELS[MODEL_TO_USE]
print("Using model: "+MODEL_TO_USE)

  
# configure logging
# All TensorFlow log output at INFO and above goes to the console and, optionally, a file.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Detach handlers left over from earlier runs of this cell so messages are not
# duplicated; close them first so a previously opened log file is not leaked.
for old_handler in list(log.handlers):
  old_handler.close()
log.handlers = []

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = True #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # A bare file name has no directory part; only create the directory when there is
  # one (os.makedirs("") raises FileNotFoundError).
  log_directory = os.path.dirname(FILE_LOGGING_PATH)
  if log_directory:
    os.makedirs(log_directory, exist_ok=True)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

# Console handler: mirrors the same records to the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)

# Work out which TPU (if any) to use, record its address in TPU_ADDRESS, set USE_TPU,
# and configure GCS access for the TPU session.
if USE_GCP_TPU:
  # Cloud TPU: resolve the node by name, zone and project.
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      TPU_NAME,
      zone=TPU_ZONE,
      project=GCE_PROJECT_NAME)
  TPU_ADDRESS = tpu_cluster_resolver.get_master()
  USE_TPU = True
  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    tf.contrib.cloud.configure_gcs(session)
elif 'COLAB_TPU_ADDR' in os.environ:
  # Colab TPU runtime: the address is provided through the environment.
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
else:
  # No TPU available; TPU_ADDRESS is left undefined in this case.
  log.warning('Not connected to TPU runtime')
  USE_TPU = False

# Vocabulary as a newline-separated string: the five special tokens followed by
# the 22 single-character tokens, in this exact order.
vocab = "\n".join(
    ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    + list("LSBJEAPTGVKRDQINFHYCMW"))

if not os.path.exists(MODEL_DIR):
  os.mkdir(MODEL_DIR)

# Write the vocabulary file into the model directory, one token per line
# (every token, including the last, is followed by a newline).
with open("{}/{}".format(MODEL_DIR, VOC_FNAME), "w") as fo:
  fo.writelines(token+"\n" for token in vocab.split("\n"))


Requirement already up-to-date: google-api-python-client in /usr/local/lib/python3.7/dist-packages (2.19.1)
Requirement already up-to-date: oauth2client in /usr/local/lib/python3.7/dist-packages (4.1.3)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2021-09-04 04:15:59





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Using model: withConv


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


2021-09-04 04:16:00,369 - tensorflow - INFO - TPU address is grpc://10.117.27.218:8470


# Auto-detect the number of training steps per epoch in the source data

In [5]:
if USE_GCP_TPU: ##if using a GCP TPU, drive no longer works because a local runtime is used, so we have to access the GCS for steps detection
  from tqdm import tqdm
  def steps_getter(input_files,batch_size):
    """Count the total number of records stored in the given TFRecord files.

    Args:
      input_files: iterable of TFRecord file paths; each is read in full.
      batch_size: not used by this function; kept so existing calls keep working.

    Returns:
      The summed record count over all files (0 when `input_files` is empty).
    """
    def count_records(path):
      # Reduce over the dataset, adding one per record, in a fresh session.
      print("reading:",path)
      records = tf.data.TFRecordDataset(path)
      with tf.Session() as sess:
        return sess.run(records.reduce(0, lambda count,_: count+1))

    total = 0
    for path in input_files:
      total += count_records(path)
    return total

  BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
  got_data = False
  # Poll the numbered data folders in the bucket until one of them contains
  # tfrecord files, then size an epoch from that folder.
  while not got_data: ##will keep trying to access the data until available
    for f in range(DATA_COPIES):
      DATA_GCS_DIR_train = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR+"/"+str(f))
      train_input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR_train,'*tfrecord'))
      print("Using:",train_input_files)
      if len(train_input_files)>0:
        got_data = True
        SEQUENCES_PER_EPOCH = steps_getter(train_input_files,TRAIN_BATCH_SIZE)
        STEPS_PER_EPOCH = int(SEQUENCES_PER_EPOCH/TRAIN_BATCH_SIZE)
        print("sequences per epoch:",SEQUENCES_PER_EPOCH, "steps per epoch:",STEPS_PER_EPOCH)
        break
    else:
      # Reached only when no folder had data: wait, then scan all folders again.
      print("Could not find data, waiting for data generation...trying again in another "+str(1200)+" seconds.")
      time.sleep(1200)

else:
  from google.colab import drive
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH = "/content/drive/My Drive"
  #@markdown ###### To minimize interaction with GCS, the steps for each portion of the dataset are determined using the original data from the google drive (unless GCP_TPU is used)
  data_folder = "BERT pretraining/mutformer_pretraining_data" #@param {type: "string"}

  data_path_train = "/content/drive/My Drive/"+data_folder+"/sequences_"+str(MAX_SEQ_LENGTH)+".txt" 
  DATA_FPATH_train = "dataset_train.txt"

  if os.path.exists(DATA_FPATH_train):
    os.remove(DATA_FPATH_train)
  shutil.copy(data_path_train,DATA_FPATH_train)

  lines = open(DATA_FPATH_train).read().split("\n")
  SEQUENCES_PER_EPOCH = len(lines)
  STEPS_PER_EPOCH = int(SEQUENCES_PER_EPOCH/TRAIN_BATCH_SIZE)

  print("sequences per epoch:",SEQUENCES_PER_EPOCH, "steps per epoch:",STEPS_PER_EPOCH)


2021-09-04 04:16:00.503670: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2021-09-04 04:16:00.507995: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2299995000 Hz
2021-09-04 04:16:00.508794: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x4f5b830 executing computations on platform Host. Devices:
2021-09-04 04:16:00.508915: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


Using: ['gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0000.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0001.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0002.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0003.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0004.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0005.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0006.tfrecord']
reading: gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0000.tfrecord
reading: gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0001.tfrecord
reading: gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0002.tfrecord
reading: gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0003.tfrecord
reading: gs://theodore_jiang/pretraining_data_1024_modified_large/

#Upload config to GCS

In [None]:
bert_config["vocab_size"] = len(vocab.split("\n"))

with open("{}/config.json".format(MODEL_DIR), "w") as fo:
  json.dump(bert_config, fo, indent=2)

if not os.path.exists(LOGGING_DIR+"/"+RUN_NAME):
  os.makedirs(LOGGING_DIR+"/"+RUN_NAME)
cmd="touch \"" + LOGGING_DIR+"/"+RUN_NAME+"/running.txt\""
!{cmd}

!gsutil -m cp -r $MODEL_DIR gs://$BUCKET_NAME
!gsutil -m cp -r $LOGGING_DIR gs://$BUCKET_NAME

# Run Training

In [None]:
import time

# Base names (without the ".txt" extension) of the small state files that live in
# TEMP_DIR and are copied to/from Drive or GCS by the helper functions below.
operating_files = ["available_indexes","epoch"]

def download_tmp_files(operating_files): ##for downloading tmp files from drive or GCS
  for op_file in operating_files:
    if USE_GCP_TPU: ##If using GCP TPU, drive isn't available, so we need to store temporary files in GCS
      cmd = "gsutil -m cp -r gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt "+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt",TEMP_DIR+"/"+op_file+".txt")

def upload_tmp_files(operating_files): ##for uploading tmp files to drive or GCS
  for op_file in operating_files:
    if USE_GCP_TPU: ##doing the same thing as above^^
      cmd = "gsutil -m cp -r "+TEMP_DIR+"/"+op_file+".txt gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(TEMP_DIR+"/"+op_file+".txt",DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt")

# Pull the persisted state files so an interrupted run resumes where it stopped.
download_tmp_files(operating_files)

# The epoch counter is persisted in epoch.txt; if the file is absent, start at epoch 0.
epoch_file_path = TEMP_DIR+"/epoch.txt"
if os.path.exists(epoch_file_path): ##detect the current epoch
  # Read through a context manager so the handle is closed right away.
  with open(epoch_file_path) as epoch_file:
    current_epoch = int(epoch_file.read())
else:
  current_epoch=0

# GCS locations of the model directory, this run's logs, and the model config.
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME)

CONFIG_FILE = os.path.join(BERT_GCS_DIR, "config.json")

while True: ##training loop
  print("\n\n\n\n\nEPOCH:"+str(current_epoch)+"\n\n\n\n\n\n")
  
  got_data = False
  while not got_data:
    for f in range(0,DATA_COPIES): ##try to access any of the data bins
      print("trying to access training data from saved sector number "+str(f))
      DATA_GCS_DIR_train = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR+"/"+str(f))
      train_input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR_train,'*tfrecord'))
      print("train_input_files:",train_input_files)
      if len(train_input_files)>0:
        got_data = True
        break
      else:
        current_available_indexes = open(TEMP_DIR+"/available_indexes.txt").read().split("\n")[:-1]
        print("current:",current_available_indexes)

        new_inds = ""
        for ind in current_available_indexes:
          if int(ind) != f:
            new_inds += ind +"\n"
        print("new_inds",new_inds)
        open(TEMP_DIR+"/available_indexes.txt","w+").write(new_inds)
    upload_tmp_files(["available_indexes"])
    if not got_data:
      time.sleep(300)
        

  INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
  try:
    INIT_CHECKPOINT_STEP = INIT_CHECKPOINT.split("-")[-1]
    print("CURRENT STEP:",INIT_CHECKPOINT_STEP)
    if int(INIT_CHECKPOINT_STEP)>=PLANNED_TOTAL_STEPS: ##if reached planed total steps, stop
      break
  except:
    pass

  config = modeling.BertConfig.from_json_file(CONFIG_FILE)

  log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
  log.info("Using {} data shards for training".format(len(train_input_files)))
  model_fn = model_fn_builder(
      bert_config=config,
      logging_dir=GCS_LOGGING_DIR,
      init_checkpoint=INIT_CHECKPOINT,
      init_learning_rate=INIT_LEARNING_RATE,
      decay_per_step=DECAY_PER_STEP,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=True,
      bert=MODEL)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=BERT_GCS_DIR,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=USE_TPU,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=TRAIN_BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE)
    
  train_input_fn = input_fn_builder(
          input_files=train_input_files,
          max_seq_length=MAX_SEQ_LENGTH,
          max_predictions_per_seq=MAX_PREDICTIONS,
          is_training=True)

  estimator.train(input_fn=train_input_fn, steps=STEPS_PER_EPOCH)
  current_epoch+=1

  # For dynamic masking, a parallel data generation is used. This portion deletes the current data and 
  # updates the list of available data via a txt (to minimize interaction with GCS) so that the data 
  # generation algortihm can generate the data with different masking positions 
  cmd = "gsutil -m rm -r "+DATA_GCS_DIR_train
  !{cmd}
  current_available_indexes = open(TEMP_DIR+"/available_indexes.txt").read().split("\n")[:-1]
  print("current:",current_available_indexes)

  new_inds = ""
  for ind in current_available_indexes:
    if int(ind) != f:
      new_inds += ind +"\n"
  print("new_inds",new_inds)
  open(TEMP_DIR+"/available_indexes.txt","w+").write(new_inds)
  open(TEMP_DIR+"/epoch.txt","w+").write(str(current_epoch))
  upload_tmp_files(operating_files)

Copying gs://theodore_jiang/modified_large_temp/available_indexes.txt...
/ [1/1 files][    2.0 B/    2.0 B] 100% Done                                    
Operation completed over 1 objects/2.0 B.                                        
CommandException: No URLs matched: gs://theodore_jiang/modified_large_temp/epoch.txt
CommandException: 1 file/object could not be transferred.





EPOCH:0






trying to access training data from saved sector number 0
train_input_files: ['gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0000.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0001.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0002.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0003.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0004.tfrecord', 'gs://theodore_jiang/pretraining_data_1024_modified_large/0/shard_0005.tfrecord', 'gs://theodore_jiang/pretraining




CURRENT STEP: 1551312


2021-09-04 04:16:24,674 - tensorflow - INFO - Using checkpoint: gs://theodore_jiang/bert_model_modified_large/model.ckpt-1551312
2021-09-04 04:16:24,675 - tensorflow - INFO - Using 7 data shards for training
2021-09-04 04:16:24,682 - tensorflow - INFO - Using config: {'_model_dir': 'gs://theodore_jiang/bert_model_modified_large', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.117.27.218:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object

embedding shape: (4, 1024, 768)
embedding shape: (4, 1024, 768)










































































2021-09-04 04:16:29,642 - tensorflow - INFO - **** Trainable Variables ****
2021-09-04 04:16:29,643 - tensorflow - INFO -   name = bert/embeddings/word_embeddings:0, shape = (27, 768), *INIT_FROM_CKPT*
2021-09-04 04:16:29,644 - tensorflow - INFO -   name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
2021-09-04 04:16:29,645 - tensorflow - INFO -   name = bert/embeddings/position_embeddings:0, shape = (1024, 768), *INIT_FROM_CKPT*
2021-09-04 04:16:29,646 - tensorflow - INFO -   name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
2021-09-04 04:16:29,647 - tensorflow - INFO -   name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
2021-09-04 04:16:29,647 - tensorflow - INFO -   name = bert/embeddings/conv1d/kernel:0, shape = (3, 768, 768), *INIT_FROM_CKPT*
2021-09-04 04:16:29,648 - tensorflow - INFO -   name = bert/embeddings/conv1d/bias:0, shape = (768,), *INIT_FROM_CKPT*
2021-09-04 04:16:29,649 - tensorflow - INFO

logits Tensor("Softmax:0", shape=(80, 27), dtype=float32)
preds Tensor("ArgMax:0", shape=(80,), dtype=int32)
ids Tensor("Reshape_6:0", shape=(80,), dtype=int32)
ids1hot Tensor("Reshape_5:0", shape=(80, 27), dtype=float32)
weights Tensor("Reshape_4:0", shape=(80,), dtype=float32)
(80, 27) (80, 27) (80, 1)





acctot: Tensor("Sum_163:0", shape=(), dtype=float32)



2021-09-04 04:16:45,252 - tensorflow - INFO - Create CheckpointSaverHook.
2021-09-04 04:16:45,602 - tensorflow - INFO - Done calling model_fn.
2021-09-04 04:16:48,464 - tensorflow - INFO - TPU job name worker
2021-09-04 04:16:49,952 - tensorflow - INFO - Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
2021-09-04 04:16:50,123 - tensorflow - INFO - Restoring parameters from gs://theodore_jiang/bert_model_modified_large/model.ckpt-1551312
Instructions for updating:
Use standard file utilities to get mtimes.
2021-09-04 04:17:07,099 - tensorflow - INFO - Running local_init_op.
2021-09-04 04:17:08,067 - tensorflow - INFO - Done running local_init_op.
2021-09-04 04:17:18,215 - tensorflow - INFO - Saving checkpoints for 1551312 into gs://theodore_jiang/bert_model_modified_large/model.ckpt.
Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.
2021-09-04 04:17:44,315 - tensorflow - INFO - Initialized 

In [None]:
#randomspamfo