Note: If using a TPU from Google Cloud (not the Colab TPU), make sure to run this notebook on a VM with access to all GCP APIs, and make sure TPUs are enabled for the GCP project

This notebook can evaluate multiple models in parallel. However, if more frequent evaluations or more models are desired, run multiple copies of this notebook on multiple VMs.

# Configure settings

In [None]:
#@markdown ### General Config
USE_GCP_TPU = False #@param {type:"boolean"}
MAX_SEQ_LENGTH =  1024 #@param {type:"integer"}
PROCESSES = 2 #@param {type:"integer"}
NUM_TPU_CORES = 8 #@param {type:"integer"}
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
#@markdown ###### Name format of the models to be evaluated (must correspond to the names saved by the pretraining script). "xxx" is a placeholder that is replaced by each model identifier being evaluated (if only one model is evaluated, it is replaced by that single identifier)
MODEL_NAME_FORMAT = "bert_model_xxx" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data_1024" #@param {type:"string"}
EVAL_DIR = "eval_data_1024" #@param {type:"string"}
TESTING_DIR = "testing_data_1024" #@param {type:"string"}
#@markdown ###### Folder name format (inside the evaluations folder chosen in the "Run Eval" cell) that each model's evaluation results are written to; "xxx" is replaced by the model identifier
RUN_NAME_format = "bert_model_xxx" #@param {type:"string"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}

#@markdown ### Evaluation procedure config
EVAL_TEST_BATCH_SIZE = 64 #@param {type:"integer"}
#@markdown ###### When checking for newly trained models during evaluation, how long to wait between each check (to minimize interaction with GCS, this should be around the time it takes the training script to train and save 1 checkpoint)
CHECK_MODEL_EVERY_N_SECS = 600 #@param {type:"integer"}


# If running on a GCP TPU, use these commands prior to running this notebook

To ssh into the VM:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Make sure the port above matches the port below (in this case it's 8888)

```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser

(one command):sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
Then copy and paste the outputted link containing "localhost: ..." into Colab's "Connect to a local runtime" option


### Also run this code segment, which creates a TPU

In [None]:
GCE_PROJECT_NAME = "genome-project-319100" #@param {type:"string"}
TPU_ZONE = "us-central1-f" #@param {type:"string"}
TPU_NAME = "mutformer-tpu" #@param {type:"string"}

!gcloud alpha compute tpus create $TPU_NAME --accelerator-type=tpu-v2 --version=1.15.5 --zone=$TPU_ZONE ##create new TPU

!gsutil iam ch serviceAccount:`gcloud alpha compute tpus describe $TPU_NAME | grep serviceAccount | cut -d' ' -f2`:admin gs://theodore_jiang && echo 'Successfully set permissions!' ##give TPU access to GCS

# Clone the repo

In [None]:
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

# Imports/Authenticate for GCP

In [None]:
# On the Colab TPU runtime (USE_GCP_TPU is False) the user authenticates
# interactively through google.colab; this step is skipped when USE_GCP_TPU is set.
if not USE_GCP_TPU:
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil

# Refresh the local "mutformer" package from the cloned repo: remove any stale
# copy, then copy the model code in again.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Move the "mutformer" entry to the end of sys.path (drop one existing entry, then append).
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_pretraining import input_fn_builder, model_fn_builder

  
# configure logging: route the 'tensorflow' logger through our own handlers
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

def _attach_handler(handler):
  """Gives `handler` the shared INFO level and formatter, adds it to `log`, and returns it."""
  handler.setLevel(logging.INFO)
  handler.setFormatter(formatter)
  log.addHandler(handler)
  return handler

# start from a clean handler list so re-running the cell does not duplicate output
log.handlers = []
#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = False #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = '/content/drive/My Drive/spam.log' #@param {type:"string"}
  fh = _attach_handler(logging.FileHandler(FILE_LOGGING_PATH))

ch = _attach_handler(logging.StreamHandler())

# Step 1: work out the TPU's address, either through the GCP cluster resolver
# or from the environment variable that the Colab TPU runtime exports.
if USE_GCP_TPU:
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_NAME, zone=TPU_ZONE, project=GCE_PROJECT_NAME)
  TPU_ADDRESS = tpu_cluster_resolver.get_master()
elif 'COLAB_TPU_ADDR' in os.environ:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
else:
  raise Exception('Not connected to TPU runtime, TPU required to run mutformer')

# Step 2: open a session on the TPU and configure its access to GCS.
with tf.Session(TPU_ADDRESS) as session:
  log.info('TPU address is ' + TPU_ADDRESS)
  # Upload credentials to TPU.
  if USE_GCP_TPU:
    tf.contrib.cloud.configure_gcs(session)
  else:
    # On Colab the credentials are read from the adc.json file.
    with tf.gfile.Open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

# Auto-detect the number of train steps per epoch in the source data / mount Drive if needed

In [None]:
#@markdown ###### Note: for all of these, if using USE_GCP_TPU, all of these parameters must use GCS, because a GCP TPU can't access google drive
#@markdown \#@markdown ###### if not USE_GCP_TPU and data was stored in drive, folder where the original data was stored (if data was stored in GCS or USE_GCP_TPU is true, leave this item blank)
data_folder = "/content/drive/My Drive/BERT pretraining/mutformer_pretraining_data" #@param {type: "string"}
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)

#@markdown whether to use GCS for writing eval results, if not, defaults to drive
GCS_EVAL = False #@param {type:"boolean"}
EVALS_PATH = BUCKET_PATH if GCS_EVAL else DRIVE_PATH

if not USE_GCP_TPU and "/content/drive" in data_folder:
  from google.colab import drive
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH = "/content/drive/My Drive"

  data_path_train = drive_data_folder+"/train.txt" 

  lines = tf.gfile.Open(data_path_train).read().split("\n")
  SEQUENCES_PER_EPOCH = len(lines)
  STEPS_PER_EPOCH = int(SEQUENCES_PER_EPOCH/TRAIN_BATCH_SIZE)

  print("sequences per epoch:",SEQUENCES_PER_EPOCH, "steps per epoch:",STEPS_PER_EPOCH)
else:
  from tqdm import tqdm
  def steps_getter(input_files):
    tot_sequences = 0
    for input_file in input_files:
      print("reading:",input_file)

      d = tf.data.TFRecordDataset(input_file)

      with tf.Session() as sess:
        tot_sequences+=sess.run(d.reduce(0, lambda x,_: x+1))

    return tot_sequences

  got_data = False
  while not got_data: ##will keep trying to access the data until available
    for f in range(0,DATA_COPIES):
        DATA_GCS_DIR_train = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR+"/"+str(f))
        train_input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR_train,'*tfrecord'))
        print("Using:",train_input_files)
        if len(train_input_files)>0:
          got_data = True
          try:
            SEQUENCES_PER_EPOCH = steps_getter(train_input_files)
            STEPS_PER_EPOCH = int(SEQUENCES_PER_EPOCH/TRAIN_BATCH_SIZE)
            print("sequences per epoch:",SEQUENCES_PER_EPOCH, "steps per epoch:",STEPS_PER_EPOCH)
            break
          except:
            got_data=False
    if got_data:
      break
    print("Could not find data, waiting for data generation...trying again in another "+str(CHECK_DATA_EVERY_N_SECS)+" seconds.")
    time.sleep(CHECK_MODEL_EVERY_N_SECS)

# Evaluation

### Evaluation operation definition

In [None]:
def reload_ckpt(model_dir,logging_dir,current_ckpt,model,data_dir):
  """Builds an evaluation estimator if a model has a checkpoint newer than `current_ckpt`.

  Args:
    model_dir: folder of the model inside the bucket (joined onto BUCKET_PATH).
    logging_dir: accepted for interface compatibility; not used by this function.
    current_ckpt: checkpoint path that was evaluated last for this model.
    model: model class handed to `model_fn_builder` as `bert`.
    data_dir: folder that is globbed for `*tfrecord` evaluation shards.

  Returns:
    A 4-tuple `(checkpoint, estimator, input_fn, is_new)`. When a new checkpoint
    exists, `checkpoint` is its path and `is_new` is True. Otherwise the tuple is
    `(current_ckpt, None, None, False)`: the checkpoint element echoes the
    caller's value so that storing it back does not forget what was already
    evaluated.
  """
  BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, model_dir)

  CONFIG_FILE = os.path.join(BERT_GCS_DIR, "config.json")

  INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
  print("init chkpt:",INIT_CHECKPOINT)
  print("current chkpt:",current_ckpt)

  # Nothing to do when no checkpoint has been written yet (latest_checkpoint
  # gives None) or when the latest one is the one already evaluated.
  if INIT_CHECKPOINT is None or INIT_CHECKPOINT == current_ckpt:
    return current_ckpt,None,None,False

  config = modeling.BertConfig.from_json_file(CONFIG_FILE)
  test_input_files = tf.gfile.Glob(os.path.join(data_dir,'*tfrecord'))
  log.info("Using {} data shards for testing".format(len(test_input_files)))
  # The estimator is only used for evaluation here, so the optimizer settings
  # are passed as zeros.
  model_fn = model_fn_builder(
        bert_config=config,
        init_checkpoint=INIT_CHECKPOINT,
        init_learning_rate=0,
        decay_per_step=0,
        num_warmup_steps=10,
        use_tpu=True,
        use_one_hot_embeddings=True,
        bert=model)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=BERT_GCS_DIR,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=1,
      eval_batch_size=EVAL_TEST_BATCH_SIZE)

  input_fn = input_fn_builder(
      input_files=test_input_files,
      max_seq_length=MAX_SEQ_LENGTH,
      max_predictions_per_seq=MAX_PREDICTIONS,
      is_training=False)
  return INIT_CHECKPOINT,estimator,input_fn,True

### Run Eval

In [None]:
#@markdown ###### whether to evaluate on the test set or the dev set (value can be "test" or "dev")
dataset = "test" #@param{type:"string"}
#@markdown ###### whether to continuously evaluate in a while loop
REPEAT_EVAL = True #@param{type:"boolean"}
#@markdown what folder to write evaluation results into
EVALUATIONS_DIR = "bert_model_pretraining_loss_spam" #@param {type:"string"}


# Reject unsupported splits first, then pick the data folder for the chosen one.
if dataset not in ("test","dev"):
  raise Exception("only datasets supported are dev and test")
DATA_DIR = TESTING_DIR if dataset=="test" else EVAL_DIR

#@markdown ###### if running multiple models in parallel, which model identifiers to evaluate (make sure to indicate the model architecture corresponding to each model identifier in the dictionary in the code below)
models_to_evaluate = ["modified_large_v2"] #@param #list of models to evaluate


### vvv CHANGE THIS vvv

# `MutFormer` is not imported by name in the import cell (only BertModel and
# BertModelModified are), so referencing it directly raises NameError on a
# fresh kernel. Use a definition already present in the kernel if there is one,
# otherwise look it up on the imported `modeling` module.
# NOTE(review): assumes the class is exposed as `modeling.MutFormer` -- confirm
# against mutformer_model_code/modeling.py.
_mutformer_cls = globals().get("MutFormer", getattr(modeling, "MutFormer", None))

name2model = {      ##dictionary mapping each model identifier to its model architecture
    "modified":BertModelModified,
    "modified_medium":BertModelModified,
    "modified_large":BertModelModified,
    "orig":BertModel,
    "large":BertModel
}
if _mutformer_cls is not None:
  name2model["modified_largev2"] = _mutformer_cls
  # the default entry in `models_to_evaluate` spells the identifier with an
  # underscore, so register that spelling as well
  name2model["modified_large_v2"] = _mutformer_cls
else:
  print("WARNING: no MutFormer class found; 'modified_largev2' / 'modified_large_v2' are not available in name2model")

### ^^^ CHANGE THIS ^^^


def write_metrics(metrics,dir):
  gs = metrics["global_step"]
  print("global step",gs)

  tf.disable_eager_execution()
  tf.reset_default_graph()  
  for key,value in metrics.items():
    print(key,value)
    x_scalar = tf.constant(value)
    first_summary = tf.summary.scalar(name=key, tensor=x_scalar)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(dir)
        sess.run(init)
        summary = sess.run(first_summary)
        writer.add_summary(summary, gs)
        writer.flush()
        print('Done with writing the scalar summary')
    time.sleep(1)
  if not os.path.exists(EVALS_PATH+"/"+dir):
    os.makedirs(EVALS_PATH+"/"+dir)
  if "gs:" in EVALS_PATH:
    cmd = "gsutil cp -r \""+dir+"/.\" \""+EVALS_PATH+"/"+dir+"\""
  else:
    cmd = "cp -r \""+dir+"/.\" \""+EVALS_PATH+"/"+dir+"\""
  !{cmd}

def _count_eval_records(input_files):
  """Returns the total number of records across the given TFRecord files."""
  total = 0
  for input_file in input_files:
    # a private graph per file keeps these counting ops out of the default graph
    with tf.Graph().as_default():
      counter = tf.data.TFRecordDataset(input_file).reduce(0, lambda count,_: count+1)
      with tf.Session() as sess:
        total += sess.run(counter)
  return total


# Fail early, with a readable message, if a requested model has no architecture registered.
unregistered_models = [m for m in models_to_evaluate if m not in name2model]
if unregistered_models:
  raise KeyError("no architecture registered in name2model for: "+", ".join(unregistered_models))

EVAL_DATA_PATH = BUCKET_PATH+"/"+DATA_DIR

# TEST_STEPS / EVAL_STEPS are not set by any cell shown in this notebook. A
# value already present in the kernel is honoured; otherwise the step count is
# derived from the data so that one evaluation covers every full batch once.
eval_steps = globals().get("TEST_STEPS" if dataset=="test" else "EVAL_STEPS")
if eval_steps is None:
  eval_record_count = _count_eval_records(tf.gfile.Glob(os.path.join(EVAL_DATA_PATH,'*tfrecord')))
  eval_steps = int(eval_record_count // EVAL_TEST_BATCH_SIZE)
  if eval_steps < 1:
    raise ValueError("found {} records in {}, fewer than one batch of {}".format(
        eval_record_count, EVAL_DATA_PATH, EVAL_TEST_BATCH_SIZE))
  print("evaluation records:",eval_record_count,"evaluation steps:",eval_steps)

# last checkpoint evaluated for each model, in the order of models_to_evaluate
current_ckpts = ["N/A" for i in range(len(models_to_evaluate))]

# metrics collected for the single-pass (REPEAT_EVAL False) mode, keyed by output folder
total_metrics = {}

while True:
  for n,model in enumerate(models_to_evaluate):
    MODEL_DIR = MODEL_NAME_FORMAT.replace("xxx",model)
    LOCAL_EVALUATIONS_DIR = "{}/{}".format(EVALUATIONS_DIR,RUN_NAME_format.replace("xxx",model))
    # reload_ckpt does not use its logging_dir argument; the evaluation folder
    # is passed only to fill that slot (GCS_LOGGING_DIR is not defined in this notebook).
    latest_ckpt,estimator,test_input_fn,new = reload_ckpt(MODEL_DIR,LOCAL_EVALUATIONS_DIR,current_ckpts[n],name2model[model],EVAL_DATA_PATH)
    if not new:
      # Leave the remembered checkpoint as it is; overwriting it with the
      # "nothing new" result made the same checkpoint look new on the next poll.
      continue
    current_ckpt = latest_ckpt
    current_ckpts[n] = current_ckpt
    print("\n\nEVALUATING "+model+" MODEL\n\n")
    log.info("Using checkpoint: {}".format(current_ckpt))
    metrics = estimator.evaluate(input_fn=test_input_fn, steps=eval_steps)
    if REPEAT_EVAL:
      write_metrics(metrics,LOCAL_EVALUATIONS_DIR)
    else:
      total_metrics[LOCAL_EVALUATIONS_DIR] = metrics

  print("finished 1 eval loop")
  if not REPEAT_EVAL:
    break
  time.sleep(CHECK_MODEL_EVERY_N_SECS)

# total_metrics is only filled in single-pass mode; print whatever was collected
# for either split (previously a single "dev" run printed nothing).
for logging_dir,metrics in total_metrics.items():
  print("Printing metrics for:",logging_dir,"\n")
  for key,metric in metrics.items():
    print(key+":",metric)
  print("\n")