#Finetuning Evaluation and Prediction Script

This notebook evaluates and performs predictions on test data using finetuned models.

# Configure settings

In [None]:
#@markdown ## General Config
#@markdown In the case that an inference database is large and a long duration of continuous runtime is required, a GCP TPU/runtime to run this notebook may be desirable. If that's the case, specify here:
GCP_RUNTIME = False #@param {type:"boolean"}
#@markdown How many TPU cores the TPU has: if using colab, NUM_TPU_CORES is 8.
NUM_TPU_CORES = 8 #@param {type:"number"}
#@markdown Which mode to use (a different mode means a different finetuning task): options are:
#@markdown * "MRPC" - paired sequence method
#@markdown * "MRPC_w_ex_data" - paired sequence method with external data
#@markdown * "RE" - single sequence method
#@markdown * "NER" - single sequence per residue prediction 
#@markdown 
#@markdown You can add more modes by creating a new processor and/or a new model_fn inside of the "mutformer_model_code" folder downloaded from github, then changing the corresponding code snippets in the code segment named "Authorize for GCS, Imports, and General Setup" (also edit the dropdown below).
MODE = "RE" #@param   ["MRPC_w_ex_data", "MRPC", "RE", "NER"]   {type:"string"} 
                        ####      ^^^^^ dropdown list for all modes ^^^^^
#@markdown Name of the GCS bucket to use:
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
## Full gs:// URL of the bucket; the other cells build every GCS path from it.
BUCKET_PATH = "gs://"+BUCKET_NAME
#@markdown Where the processed data was stored in GCS:
PROCESSED_DATA_DIR = "RE_finetune_update_loaded" #@param {type:"string"}
#@markdown What folder to write predictions into (location of this folder will either be GCS or google drive) (the PREDICTIONS_FOLDER variable can be the same across all finetuning notebooks):
PREDICTIONS_FOLDER = "MutFormer_updated_finetuning_predictions" #@param {type:"string"}
#@markdown What folder to write evaluation results into (location of this folder will either be GCS or google drive) (the EVALUATIONS_FOLDER variable can be the same across all finetuning notebooks):
EVALUATIONS_FOLDER = "MutFormer_updated_finetuning_eval_results" #@param {type:"string"}


#If running on a GCP runtime, follow these instructions to set it up

###1) Create a VM from the GCP website
###2) Open a command prompt on your computer and perform the following steps
To ssh into the VM, run:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Note: Make sure the port above matches the port below (in this case it's 8888)
\
\
In the new command prompt that popped out, either run each of the commands below individually, or copy and paste the one liner below:
```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
One command:
```
sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
###3) In this notebook, click the "connect to local runtime" option under the connect button, and copy and paste the link outputted by command prompt with "localhost: ..."
###4) Finally, run this code segment, which creates a TPU


In [None]:
GCE_PROJECT_NAME = "genome-project-319100" #@param {type:"string"}
TPU_ZONE = "us-central1-f" #@param {type:"string"}
TPU_NAME = "mutformer-tpu" #@param {type:"string"}

!gcloud alpha compute tpus create $TPU_NAME --accelerator-type=tpu-v2 --version=1.15.5 --zone=$TPU_ZONE ##create new TPU

!gsutil iam ch serviceAccount:`gcloud alpha compute tpus describe $TPU_NAME | grep serviceAccount | cut -d' ' -f2`:admin gs://theodore_jiang && echo 'Successfully set permissions!' ##give TPU access to GCS

[1;31mERROR:[0m (gcloud.alpha.compute.tpus.create) Error parsing [tpu].
The [tpu] resource is not properly specified.
Failed to find attribute [project]. The attribute can be set in the following ways: 
- provide the argument `--project` on the command line
- set the property `core/project`
[1;31mERROR:[0m (gcloud.alpha.compute.tpus.describe) Error parsing [tpu].
The [tpu] resource is not properly specified.
Failed to find attribute [project]. The attribute can be set in the following ways: 
- provide the argument `--project` on the command line
- set the property `core/project`
ServiceException: 401 Anonymous caller does not have storage.buckets.getIamPolicy access to the Google Cloud Storage bucket.


#Clone the MutFormer repo

In [None]:
if GCP_RUNTIME:
  !sudo apt-get -y install git
#@markdown Where to clone the repo into:
REPO_DESTINATION_PATH = "mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

Cloning into 'mutformer'...
remote: Enumerating objects: 614, done.[K
remote: Counting objects: 100% (415/415), done.[K
remote: Compressing objects: 100% (335/335), done.[K
remote: Total 614 (delta 299), reused 111 (delta 78), pack-reused 199[K
Receiving objects: 100% (614/614), 2.14 MiB | 12.64 MiB/s, done.
Resolving deltas: 100% (410/410), done.


#Authorize for GCS, Imports, and General Setup

In [None]:
## When not on a GCP runtime (i.e. on colab): select TensorFlow 1.x and ask the
## user to authorize access to GCS before anything touches the bucket.
if not GCP_RUNTIME:
  %tensorflow_version 1.x
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

import sys
import json
import random
import logging
import tensorflow.compat.v1 as tf
import time
import importlib
import os
import shutil

## If the repo was cloned into a folder literally named "mutformer", keep a copy of it
## under "mutformer_code" and use that copy as the repo location from here on, because
## the "mutformer" folder is replaced by the model code right below.
if REPO_DESTINATION_PATH == "mutformer":
  if os.path.exists("mutformer_code"):
    shutil.rmtree("mutformer_code")
  shutil.copytree(REPO_DESTINATION_PATH,"mutformer_code")
  REPO_DESTINATION_PATH = "mutformer_code"

## (Re)create the "mutformer" folder from the repo's model code.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

## Drop an existing "mutformer" entry before appending it again to sys.path.
if "mutformer" in sys.path:
  sys.path.remove("mutformer")
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic  #### <<<<< if you added more modes, change these imports to import the correct processors, 
from mutformer.modeling import BertModel,BertModelModified                                        #### <<<<< correct training scripts (i.e. run_classifier and run_ner_for_pathogenic), and
from mutformer.run_classifier import MrpcProcessor,REProcessor,MrpcWithExDataProcessor            #### <<<<< correct model classes
from mutformer.run_ner_for_pathogenic import NERProcessor  

##reload the mutformer modules so that changes to them are picked up without restarting the runtime
modules2reload = [modeling, optimization, tokenization, run_classifier, run_ner_for_pathogenic]
for module in modules2reload:
    importlib.reload(module)

# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# start from an empty handler list (re-running this cell would otherwise add the handlers below again)
log.handlers = []

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = False #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # Only create a folder when the path actually has one: for a bare file name the
  # folder part is "" and os.makedirs("") raises.
  log_folder = os.path.dirname(FILE_LOGGING_PATH)
  if log_folder and not os.path.exists(log_folder):
    os.makedirs(log_folder)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

# always log to the notebook output as well
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)

## Find the TPU address for the current runtime and give the TPU access to GCS.
if GCP_RUNTIME:
  ## GCP runtime: resolve the TPU by name, zone and project
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_NAME, zone=TPU_ZONE, project=GCE_PROJECT_NAME)
  TPU_ADDRESS = tpu_cluster_resolver.get_master()
  with tf.Session(TPU_ADDRESS) as session:
      log.info('TPU address is ' + TPU_ADDRESS)
      tf.contrib.cloud.configure_gcs(session)
elif 'COLAB_TPU_ADDR' in os.environ:
  ## colab runtime: the TPU address is published through an environment variable
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
else:
  raise Exception('Not connected to TPU runtime, TPU required to run mutformer')


####       vvvvv if you added more modes, change this part to set the processors and training scripts correctly vvvvv
## Each mode maps to (processor class, training script module, whether external data is used).
_MODE_SETTINGS = {
  "MRPC":           (MrpcProcessor,           run_classifier,         False),
  "MRPC_w_ex_data": (MrpcWithExDataProcessor, run_classifier,         True),
  "RE":             (REProcessor,             run_classifier,         False),
  "NER":            (NERProcessor,            run_ner_for_pathogenic, False),
}
if MODE not in _MODE_SETTINGS:
  raise Exception("The mode specified was not one of the available modes: [\"MRPC\",\"MRPC_w_ex_data\",\"RE\",\"NER\"].")
_processor_class, script, USING_EX_DATA = _MODE_SETTINGS[MODE]
processor = _processor_class()  ## only the processor of the selected mode is instantiated
label_list = processor.get_labels()
                      ####       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


TensorFlow 1.x selected.
Authorize for GCS:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Authorize done



2022-01-08 01:16:27,854 - tensorflow - INFO - Using TPU runtime
2022-01-08 01:16:27,857 - tensorflow - INFO - TPU address is grpc://10.94.154.10:8470


#Specify location preferences for google drive vs GCS/Mount Drive if needed



In [None]:
#@markdown ###### Note: For all of these, if using GCP_RUNTIME, all of these parameters must use GCS, because a GCP TPU can't access google drive
#@markdown \
#@markdown If original data was stored in drive (this variable should match up with the "INPUT_DATA_DIR" variable in the data generation script), full drive path to the original data (for detecting the # of steps per epoch) (this variable is used to limit interaction with GCS; it can also be left blank and steps will be automatically detected from tfrecords stored in GCS):
#@markdown * If GCP_RUNTIME, drive paths will not work, so steps detection will automatically default to tfrecords
ORIG_DATA_FOLDER = "" #@param {type: "string"}
DRIVE_PATH = "/content/drive/My Drive"
#@markdown Whether to use GCS for writing predictions, if not, defaults to drive
GCS_PREDICTIONS = True #@param {type:"boolean"}
#@markdown Whether to use GCS for writing eval results, if not, defaults to drive
GCS_EVAL = True #@param {type:"boolean"}

## Root locations under which predictions / evaluation results are written.
PREDS_PATH = BUCKET_PATH if GCS_PREDICTIONS else DRIVE_PATH
EVALS_PATH = BUCKET_PATH if GCS_EVAL else DRIVE_PATH

## NOTE(review): FILES_PATH is only defined on a GCP runtime -- confirm that nothing
## reads it when GCP_RUNTIME is False.
if GCP_RUNTIME:
  FILES_PATH = BUCKET_PATH

## Drive is (re)mounted only when something actually lives on drive: the original data
## (colab only) or the predictions / evaluation output.
## NOTE(review): with GCP_RUNTIME=True and a drive output selected, this still imports
## google.colab -- confirm that combination is never used (see the note at the top).
if ("/content/drive" in ORIG_DATA_FOLDER and not GCP_RUNTIME) or not GCS_PREDICTIONS or not GCS_EVAL:
  from google.colab import drive,auth
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  






# Run Eval/prediction

The following section will perform evaluation and prediction on either the eval dataset or the test dataset.

###General Setup and definitions

In [None]:
#@markdown When performing prediction, whether or not to ensure all datapoints are predicted via a trailing test dataset (if so, make sure this option was also specified as True during data generation):
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown Maximum batch size the runtime can handle during prediction without OOM for all models being evaluated/tested. Note that this value should match up with the variable "MAX_BATCH_SIZE" in the data generation script.
MAX_BATCH_SIZE =  512 #@param {type:"integer"}

def latest_checkpoint(dir):
  """Return the path prefix of the newest checkpoint found by `gsutil ls dir`.

  Every listed entry containing "model.ckpt" followed by a file extension
  (e.g. ".index", ".meta", ".data-00000-of-00001") is considered. The entry
  with the highest step number ("model.ckpt-<step>") wins, so the result no
  longer depends on the lexicographic order of the listing (where
  "model.ckpt-1000" sorts before "model.ckpt-500"). Entries without a step
  number rank below every numbered checkpoint.

  Args:
    dir: folder (typically a gs:// URL) to list.

  Returns:
    The checkpoint path without its file extension, e.g.
    "gs://bucket/model/model.ckpt-4000", or None when the listing contains
    no checkpoint file (also the case when gsutil fails).
  """
  import re
  import subprocess

  ## An argument list (instead of a shell string) also works for paths with spaces.
  listing = subprocess.run(["gsutil", "ls", dir],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           universal_newlines=True).stdout

  ## group 1: path up to and including the step, group 2: the step number (optional)
  ckpt_pattern = re.compile(r"^(.*model\.ckpt(?:-(\d+))?)\.[^./]+$")
  best_step = None
  best_prefix = None
  for line in listing.splitlines():
    match = ckpt_pattern.match(line.strip())
    if match is None:
      continue
    step = int(match.group(2)) if match.group(2) else -1
    if best_step is None or step > best_step:
      best_step = step
      best_prefix = match.group(1)
  return best_prefix

def write_metrics(metrics,dir):
  tf.logging.info("writing metrics to "+dir)
  if os.path.exists(dir):
    shutil.rmtree(dir)
  os.makedirs(dir)
  gs = metrics["global_step"]
  tf.logging.info("global step "+str(gs))

  tf.compat.v1.disable_eager_execution()
  tf.reset_default_graph()
  for key,value in metrics.items():
    tf.logging.info(str(key)+":"+str(value))
    x_scalar = tf.constant(value)
    first_summary = tf.summary.scalar(name=key, tensor=x_scalar)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(dir)
        sess.run(init)
        summary = sess.run(first_summary)
        writer.add_summary(summary, gs)
        writer.flush()
        tf.logging.info("Done with writing the scalar summary")
    time.sleep(1)

  if GCS_EVAL:
    cmd = "gsutil -m cp -r \""+dir+"/.\" \""+EVALS_PATH+"/"+dir+"\""
    !{cmd}  
  else:
    if not os.path.exists(EVALS_PATH+"/"+dir):
      os.makedirs(EVALS_PATH+"/"+dir)
    shutil.copytree(dir,EVALS_PATH+"/"+dir)
  

def write_predictions(PREDICTIONS_DIR,
                      result,
                      result_trailing,
                      shard_id=""):
  """Write predictions to PREDS_PATH/PREDICTIONS_DIR/predictions<shard_id>.txt.

  Each prediction becomes one line of tab-separated "key:value" pairs. The
  predictions of `result` are written first, followed by those of
  `result_trailing` when it is given.

  Args:
    PREDICTIONS_DIR: folder below PREDS_PATH to write into.
    result: iterable of prediction dicts.
    result_trailing: iterable of prediction dicts, or None/empty to skip.
    shard_id: suffix appended to the file name "predictions".
  """
  output_dir = PREDS_PATH+"/"+PREDICTIONS_DIR
  ## PREDS_PATH may be a gs:// URL: os.makedirs would create a local folder named
  ## "gs:" there, while tf.io.gfile handles GCS and local paths alike (and does
  ## nothing when the folder already exists).
  tf.io.gfile.makedirs(output_dir)

  with tf.gfile.Open(output_dir+"/predictions"+shard_id+".txt", "w") as writer:
    tf.logging.info("***** Predict results *****")

    def write_rows(predictions):
      for prediction in predictions:
        writer.write("\t".join([str(k)+":"+str(v) for k,v in prediction.items()]) + "\n")

    write_rows(result)
    if result_trailing:
      write_rows(result_trailing)


def evaluation_loop(RUN_EVAL,
                    RUN_PREDICTION,
                    EVALUATE_WHILE_PREDICT,
                    dataset,
                    MODEL,
                    total_metrics,
                    MAX_SEQ_LENGTH,
                    current_ORIG_DATA_FOLDER,
                    BERT_GCS_DIR,
                    USE_LATEST,
                    CHECKPOINT_STEP,
                    DATA_GCS_DIR,
                    USING_SHARDS,
                    START_SHARD,
                    USING_EX_DATA,
                    PRED_NUM,
                    EVAL_WHILE_PREDICT_PREDICTIONS_DIR,
                    PREDICTIONS_DIR,
                    EVALUATIONS_DIR,
                    CONFIG_FILE):

  try: ##wrap everything in a giant try except so that any 
       ##glitches won't completely stop evaluation in the middle
    current_ckpt = ""

    tf.logging.info("Using data from: "+DATA_GCS_DIR)
    tf.logging.info("Loading model from: "+BERT_GCS_DIR)

    eval_file = os.path.join(DATA_GCS_DIR, evaluating_file)

    def steps_getter(input_files):
      tot_sequences = 0
      for input_file in input_files:
        tf.logging.info("reading:"+input_file+" for steps")

        d = tf.data.TFRecordDataset(input_file)

        with tf.Session() as sess:
          tot_sequences+=sess.run(d.reduce(0, lambda x,_: x+1))

      return tot_sequences

    if USING_SHARDS:
      shards_folder = DATA_GCS_DIR
      input_file = os.path.join(DATA_GCS_DIR, evaluating_file)
      import re
      file_name = input_file.split("/")[-1]
      shards = [shards_folder + "/" + file for file in tf.io.gfile.listdir(shards_folder) if
                re.match(file_name + "_\d+", file)]
      shards = sorted(shards,key=lambda shard:int(shard.split("_")[-1]))[START_SHARD:]
    else:
      shards = [eval_file]

    if USING_SHARDS:
      tf.logging.info("\nUSING SHARDs:")
      for shard in shards:
        tf.logging.info(shard)
      tf.logging.info("\n")

    if RUN_EVAL:   
      try:
        if dataset=="dev":
          data_path_eval = "/content/drive/My Drive/"+current_ORIG_DATA_FOLDER+"/dev.tsv"
        else:
          data_path_eval = "/content/drive/My Drive/"+current_ORIG_DATA_FOLDER+"/test.tsv"
        lines = open(data_path_eval).read().split("\n")
        EVAL_STEPS = int(len(lines)/EVAL_BATCH_SIZE)
      except Exception:
        SEQUENCES_PER_EPOCH = steps_getter(shards)
        EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)
      else: ##dataset=="test"
        SEQUENCES_PER_EPOCH = steps_getter(shards)
        EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)

      tf.logging.info("eval steps:"+str(EVAL_STEPS))

    
    if EVALUATE_WHILE_PREDICT:
      cmd = "gsutil -m rm -r "+EVAL_WHILE_PREDICT_PREDICTIONS_DIR
      !{cmd}
    if USE_LATEST:
      try:
        RESTORE_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
      except Exception:
        RESTORE_CHECKPOINT = latest_checkpoint(BERT_GCS_DIR)
    else:
      try:
        latest_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR).split("/")[-1]
        RESTORE_CHECKPOINT = [".".join(ckpt.split(".")[:-1]) 
                              for ckpt in tf.io.gfile.listdir(BERT_GCS_DIR) 
                              if len(ckpt.split("."))==3 and str(CHECKPOINT_STEP) == ckpt.split(".")[-2].split("-")[-1]][0]
        old_file_lines = tf.gfile.Open(BERT_GCS_DIR+"/checkpoint").read().split("\n")
        new_file_lines = old_file_lines.copy()
        new_file_lines[0] = new_file_lines[0].replace(latest_ckpt,RESTORE_CHECKPOINT)
        RESTORE_CHECKPOINT = BERT_GCS_DIR+"/"+RESTORE_CHECKPOINT

        tf.gfile.Open(BERT_GCS_DIR+"/checkpoint","w+").write("\n".join(new_file_lines))
      except Exception as e:
        tf.logging.info("\n\nCould not find the checkpoint specified. Error:"+str(e)+". Skipping...\n\n")
        return False,total_metrics,current_ckpt
    try:
      current_ckpt=RESTORE_CHECKPOINT
      tf.logging.info("USING CHECKPOINT:"+RESTORE_CHECKPOINT)
    except Exception:
      raise Exception("No checkpoints were found in the given location")
    config = modeling.BertConfig.from_json_file(CONFIG_FILE)

    model_fn = script.model_fn_builder(
        bert_config=config,
        num_labels=len(label_list),
        init_checkpoint=None,
        restore_checkpoint=RESTORE_CHECKPOINT,
        init_learning_rate=0,
        decay_per_step=0,
        num_warmup_steps=10,
        use_tpu=True,
        use_one_hot_embeddings=True,
        bert=MODEL,
        test_results_dir=EVAL_WHILE_PREDICT_PREDICTIONS_DIR,
        yield_predictions=EVALUATE_WHILE_PREDICT,
        using_ex_data=USING_EX_DATA)

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=BERT_GCS_DIR,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=True,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=1,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=MAX_BATCH_SIZE)
      
    tf.logging.info("USING FILE:"+eval_file)
    for n,shard in enumerate(shards):
      input_fn = script.file_based_input_fn_builder(
            input_file=shard,
            seq_length=MAX_SEQ_LENGTH,
            is_training=False,
            drop_remainder=True,
            pred_num=PRED_NUM if USING_EX_DATA else None)


      tf.logging.info("***** Running evaluation/prediction *****")
      tf.logging.info(" Eval Batch size = "+str(EVAL_BATCH_SIZE))
      tf.logging.info(" Predict Batch size = "+str(MAX_BATCH_SIZE))
      
    
      if RUN_EVAL:
        eval_metrics = estimator.evaluate(input_fn=input_fn, steps=EVAL_STEPS)
        tf.logging.info("\n\n\n\n\n\nEVAL METRICS:")
        for k,v in eval_metrics.items():
          tf.logging.info(k+":"+str(v))
        tf.logging.info("\n\n\n\n\n\n\n")
        write_metrics(eval_metrics,EVALUATIONS_DIR)
        if not REPEAT_LOOP:
          total_metrics[EVALUATIONS_DIR] = eval_metrics
      if RUN_PREDICTION:
        result=estimator.predict(input_fn=input_fn)
        if PRECISE_TESTING and n==len(shards)-1:
          run_config_trailing = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=BERT_GCS_DIR,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=1,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

          estimator_trailing = tf.contrib.tpu.TPUEstimator(
              use_tpu=True,
              model_fn=model_fn,
              config=run_config_trailing,
              train_batch_size=1,
              predict_batch_size=1)
          test_file_trailing = os.path.join(DATA_GCS_DIR, "test_trailing.tf_record")
          test_input_fn_trailing = script.file_based_input_fn_builder(
              input_file=test_file_trailing,
              seq_length=MAX_SEQ_LENGTH,
              is_training=False,
              drop_remainder=True,
              pred_num=PRED_NUM if USING_EX_DATA else None)
          result_trailing=estimator_trailing.predict(input_fn=test_input_fn_trailing)
        else:
          result_trailing = None
        write_predictions(PREDICTIONS_DIR,
                          result,
                          result_trailing,
                          shard_id=str(START_SHARD+n)if USING_SHARDS else "")
    if not USE_LATEST:
      tf.gfile.Open(BERT_GCS_DIR+"/checkpoint","w+").write("\n".join(old_file_lines))
    return True,total_metrics,current_ckpt
  except Exception as e:
    tf.logging.info("\n\nFAILED-error:"+str(e)+". Skipping...\n\n")
    return False,total_metrics,current_ckpt
  

The following are three different code segments to run:
1. Use this if you benchmarked model/sequence length during finetuning and wish to evaluate each model \
2. Use this if you benchmarked sequence length/batch size during finetuning and wish to evaluate each model \
3. Use this for evaluating/predicting with only a single model

Choose a desired code segment to run, select the desired options for evaluating/predicting, and run that code segment
\
\
Note: All evaluation results will be written into the previously specified logging directory either under google drive or GCS, depending on the values of GCS_COMS, GCS_PREDICTIONS, and GCS_EVAL specified before. To view the results, use the colab notebook titled "mutformer processing and viewing finetuning results," which can also be used to view prediction results

###Model/Sequence Length

In [None]:
#@markdown ### IO config
#@markdown Folder for where to load the finetuned model from
FINETUNED_MODEL_DIR = "bert_model_re_mn_sl_try8" #@param {type:"string"}
#@markdown Which folder inside of PREDICTIONS_FOLDER and EVALUATIONS_FOLDER to write predictions and evaluations, respectively, into:
RUN_NAME = "RE_updated_mn_sl_try8" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Evaluation/prediction procedure config
#@markdown The evaluation loop will loop through a list of models and a list of sequence lengths, attempting to evaluate a finetuned model for each combination of pretrained model and sequence length (failed combinations will be skipped).
#@markdown * List of pretrained models that were used for finetuning (should indicate the names of the model folders inside INIT_MODEL_DIR from the finetuning training script):
MODELS = ["MutBERT10L","MutBERT8L","MutFormer8L"] #@param
#@markdown * List of model architectures for each model in the "MODELS" list defined in the entry above: each position in this list must correctly indicate the model architecture of its corresponding model folder in the list "MODELS" (BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture).
MODEL_ARCHITECTURES = [BertModel,BertModel,BertModelModified] #@param
#@markdown * List of sequence lengths to test
MAX_SEQ_LENGTHS = [1024,512,256,128,64] #@param
#@markdown Whether to evaluate on the test set or the dev set ("test" or "dev")
dataset = "test" #@param{type:"string"}
#@markdown Whether or not to run evaluation
RUN_EVAL = False #@param {type:"boolean"}
#@markdown Whether or not to run prediction (in a separate loop from evaluation; EVALUATE_WHILE_PREDICT will override this value to False)
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown Whether or not to repeat this operation in a loop (if performing parallel evaluation operation, set to True, False otherwise)
#@markdown * If using REPEAT_LOOP, to prevent the script from evaluating every single model trained on every single combination of pretrained model and sequence length every loop, the script will only evaluate models that are being currently trained (the script will only evaluate on the model folders that have seen a new latest checkpoint since the script started running).
REPEAT_LOOP = False #@param {type:"boolean"}
#@markdown When using REPEAT_LOOP, how long to wait in between each loop before checking again for updated train progress:
CHECK_MODEL_EVERY_N_SECS =  150#@param {type:"integer"}
#@markdown If evaluating, whether or not to evaluate and predict results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is significant (if yes, prediction results will be written in the form of tfevent files into GCS that need to be viewed using the notebook titled "mutformer processing and viewing finetuning results")
#@markdown 
#@markdown Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown What batch size to use during evaluation (larger batch size will increase evaluation speed but may skip more datapoints)
EVAL_BATCH_SIZE = 64 #@param {type:"integer"}
#@markdown Whether or not testing/evaluating data was generated in shards
USING_SHARDS = False #@param {type:"boolean"}
#@markdown * If using shards, which shard index to start at (default 0 for first shard) (script will not delete older predictions, only continue generating predictions starting with this position):
START_SHARD = 0 #@param {type:"integer"}
#@markdown Whether to use the latest checkpoint in the folder (set to false if an intermediate checkpoint should be used)
USE_LATEST = True #@param {type:"boolean"}
#@markdown * If not using latest checkpoint, which step's checkpoint to use
CHECKPOINT_STEP = 4000 #@param {type:"integer"}

total_metrics = {}  ## a dictionary for all metrics to  
                    ## print at the end during testing, 
                    ## not necessary during evaluation   

## name of the tfrecord file that belongs to the selected dataset
if dataset=="test":
  evaluating_file = "test.tf_record"
elif dataset=="dev":
  evaluating_file = "eval.tf_record"
else:
  raise Exception("only datasets supported are dev and test")

DATA_INFOS = [["N/A" for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS]   ##create an empty 2D list to store all
              for MODEL_NAME in MODELS]                  ##the data info dictionaries

## current_ckpts[M][m]: latest checkpoint seen so far for model M at sequence length m
current_ckpts = [["N/A" for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS] for MODEL_NAME in MODELS]
for M,MODEL_NAME in enumerate(MODELS):
  for m,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
    BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
    ## `except Exception` (not a bare except) so that interrupting the kernel is not
    ## swallowed and reported as a missing checkpoint
    try:
      current_ckpts[M][m] = tf.train.latest_checkpoint(BERT_GCS_DIR)
    except Exception:
      try:
        current_ckpts[M][m] = latest_checkpoint(BERT_GCS_DIR)
      except Exception:
        raise Exception("could not find any checkpoints in the model dir specified")

def get_new_ckpts(current_ckpts):
  """Find the model folders whose latest checkpoint differs from the recorded one.

  Args:
    current_ckpts: 2D list indexed as [model index][sequence length index]
      holding the checkpoint previously recorded for each combination of
      MODELS and MAX_SEQ_LENGTHS.

  Returns:
    A list of [model index, sequence length index] pairs (each pair is a list)
    for every combination whose latest checkpoint is not equal to the entry
    in `current_ckpts`.

  Raises:
    Exception: if both checkpoint lookups fail for a folder.
  """
  new_ckpts = []
  for M,MODEL_NAME in enumerate(MODELS):
    for m,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
      BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
      try:
        latest_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR)
      except Exception:   ##not a bare except, so a manual interrupt is not swallowed
        try:
          latest_ckpt = latest_checkpoint(BERT_GCS_DIR)   ##fallback helper defined outside this section
        except Exception:
          raise Exception("could not find any checkpoints in the model dir specified")
      ## Compare outside the try blocks so that an indexing problem is not
      ## misreported as a missing checkpoint.
      if current_ckpts[M][m] != latest_ckpt:
        new_ckpts.append([M,m])
  return new_ckpts

## Main evaluation/prediction loop over every (model, max sequence length)
## combination. Runs once when REPEAT_LOOP is off; otherwise keeps polling
## and only processes combinations whose latest checkpoint has changed.
while True:
  sleeping = True   ##to prevent excessive interaction with GCS,
                    ##if no eval/pred loop succeeds in this pass, the script
                    ##will wait for a while before trying again

  if REPEAT_LOOP:                             ##if using REPEAT_LOOP, only evaluate on new checkpoints
    new_ckpts = get_new_ckpts(current_ckpts)
    if len(new_ckpts) == 0:
      tf.logging.info("No new checkpoints have been written since script start/last evaluation. Trying again in another "+str(CHECK_MODEL_EVERY_N_SECS)+" seconds.")

  for M,MODEL_NAME in enumerate(MODELS):
    for m,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):

      if REPEAT_LOOP and [M,m] not in new_ckpts:
        continue

      tf.logging.info("\n\n\nMODEL NAME:"+MODEL_NAME+
            "\nINPUT MAX SEQ LENGTH:"+str(MAX_SEQ_LENGTH))

      MODEL = MODEL_ARCHITECTURES[M]
      current_ORIG_DATA_FOLDER= ORIG_DATA_FOLDER+"/"+str(MAX_SEQ_LENGTH)

      BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
      DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

      EVAL_WHILE_PREDICT_PREDICTIONS_DIR = BUCKET_PATH+"/"+PREDICTIONS_FOLDER+"/"+RUN_NAME+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
      EVALUATIONS_DIR = EVALUATIONS_FOLDER+"/"+RUN_NAME+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
      PREDICTIONS_DIR = PREDICTIONS_FOLDER+"/"+RUN_NAME+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)
      CONFIG_FILE = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)+"/config.json"

      ## Load the data info dictionary only the first time this combination is
      ## processed; the handle is closed as soon as the JSON has been parsed.
      if DATA_INFOS[M][m] == "N/A":
        with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
          DATA_INFOS[M][m] = json.load(info_file)

      EX_DATA_NUM = DATA_INFOS[M][m]["ex_data_num"] if USING_EX_DATA else 0


      ##run the evaluation/prediction loop (evaluation_loop is defined outside
      ##this section; presumably the third return value is the checkpoint it used)
      sucess,total_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              MAX_SEQ_LENGTH,
                              current_ORIG_DATA_FOLDER,
                              BERT_GCS_DIR,
                              USE_LATEST,
                              CHECKPOINT_STEP,
                              DATA_GCS_DIR,
                              USING_SHARDS,
                              START_SHARD,
                              USING_EX_DATA,
                              EX_DATA_NUM,
                              EVAL_WHILE_PREDICT_PREDICTIONS_DIR,
                              PREDICTIONS_DIR,
                              EVALUATIONS_DIR,
                              CONFIG_FILE)

      current_ckpts[M][m] = current_ckpt
      if sucess:
        sleeping = False    ##something was processed, so poll again right away
  time.sleep(CHECK_MODEL_EVERY_N_SECS if sleeping else 0)
  if not REPEAT_LOOP:
    break
## After a one-off evaluation run, log every metric that was collected,
## grouped by the key under which it was stored in total_metrics.
if RUN_EVAL and not REPEAT_LOOP:
  tf.logging.info("Printing all metrics...\n\n")
  for results_dir in total_metrics:
    tf.logging.info("Printing metrics for:"+results_dir+"\n")
    for metric_name,metric_value in total_metrics[results_dir].items():
      tf.logging.info(metric_name+":"+str(metric_value))
    tf.logging.info("\n")


### Batch Size/Sequence Length

In [None]:
#@markdown ### IO config
#@markdown Folder for where to load the finetuned model from
FINETUNED_MODEL_DIR = "bert_model_mrpc_adding_preds_only_others_xxx" #@param {type:"string"}
#@markdown Name of the finetuned model's folder to load from inside FINETUNED_MODEL_DIR
MODEL_NAME="bert_model_modified_large" #@param {type:"string"}
#@markdown Model architecture to use: BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture
MODEL_ARCHITECTURE = BertModelModified #@param
#@markdown What folder to write predictions into (location of this folder will either be GCS or google drive)
PREDICTIONS_FOLDER = "added_preds_only_others" #@param {type:"string"}
#@markdown What folder to write evaluation results into (location of this folder will either be GCS or google drive)
EVALUATIONS_FOLDER = "added_preds_only_others" #@param {type:"string"}
#@markdown Which folder inside of PREDICTIONS_FOLDER and EVALUATIONS_FOLDER to write predictions and evaluations, respectively, into:
RUN_NAME = "MRPC_adding_preds_w_mutformer12L" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Evaluation/prediction procedure config
#@markdown The evaluation loop will loop through a list of batch sizes and a list of sequence lengths, attempting to evaluate a finetuned model for each combination of batch size and sequence length (failed combinations will be skipped).
#@markdown * List of batch sizes whose models should be tested
BATCH_SIZES = [32,16,64] #@param
#@markdown * List of max sequence lengths whose models should be tested
MAX_SEQ_LENGTHS = [256,512,1024] #@param
#@markdown Whether to evaluate on the test set or the dev set ("test" or "dev")
dataset = "test" #@param{type:"string"}
#@markdown Whether or not to run evaluation
RUN_EVAL = True #@param {type:"boolean"}
#@markdown Whether or not to run prediction (in a separate loop from evaluation; EVALUATE_WHILE_PREDICT will override this value to False)
RUN_PREDICTION = False #@param {type:"boolean"}
#@markdown Whether or not to repeat this operation in a loop (if performing parallel evaluation operation, set to True, False otherwise)
#@markdown * If using REPEAT_LOOP, to prevent the script from evaluating every single model trained on every single combination of batch size and sequence length every loop, the script will only evaluate models that are being currently trained (the script will only evaluate on the model folders that have seen a new latest checkpoint since the script started running).
REPEAT_LOOP = True #@param {type:"boolean"}
#@markdown When using REPEAT_LOOP, how long to wait in between each loop before checking again for updated train progress:
CHECK_MODEL_EVERY_N_SECS =  300#@param {type:"integer"}
#@markdown If evaluating, whether or not to evaluate and predict results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is significant (if yes, prediction results will be written in the form of tfevent files into GCS that need to be viewed using the notebook titled "mutformer processing and viewing finetuning results")
#@markdown 
#@markdown Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown What batch size to use during evaluation (larger batch size will increase evaluation speed but may skip more datapoints)
EVAL_BATCH_SIZE = 64 #@param {type:"integer"}
#@markdown Whether or not testing/evaluating data was generated in shards
USING_SHARDS = False #@param {type:"boolean"}
#@markdown * If using shards, which shard index to start at (default 0 for first shard)
START_SHARD = 0 #@param {type:"integer"}
#@markdown Whether to use the latest checkpoint in the folder (set to false if an intermediate checkpoint should be used)
USE_LATEST = False #@param {type:"boolean"}
#@markdown * If not using latest checkpoint, which step's checkpoint to use
CHECKPOINT_STEP = 7000 #@param {type:"integer"}

total_metrics = {}  ## metrics gathered per evaluated model; handed to evaluation_loop
                    ## and printed once a non-repeating evaluation run has finished

## Only the "test" and "dev" splits have a tf_record file name associated with them.
if dataset not in ("test", "dev"):
  raise Exception("only datasets supported are dev and test")
evaluating_file = "test.tf_record" if dataset == "test" else "eval.tf_record"


## One slot per (batch size, max sequence length) combination; "N/A" marks a
## slot that has not been filled in yet. Throughout this cell B indexes
## BATCH_SIZES and M indexes MAX_SEQ_LENGTHS.
DATA_INFOS = [["N/A" for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS]   ##create an empty 2D list to store all
              for BATCH_SIZE in BATCH_SIZES]                  ##the data info dictionaries

## Record the newest checkpoint each model folder holds right now, so later
## polling can tell whether a different checkpoint has appeared since. The
## folder polled here is the same "bs_<batch size>_sl_<seq length>" folder
## that the evaluation loop below reads the model from.
current_ckpts = [["N/A" for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS] for BATCH_SIZE in BATCH_SIZES]
for B,BATCH_SIZE in enumerate(BATCH_SIZES):
  for M,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
    BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
    try:
      current_ckpts[B][M] = tf.train.latest_checkpoint(BERT_GCS_DIR)
    except Exception:     ##deliberately not a bare except: a manual interrupt must stop
                          ##the cell instead of silently triggering the fallback
      try:
        current_ckpts[B][M] = latest_checkpoint(BERT_GCS_DIR)   ##fallback helper defined outside this section
      except Exception:
        raise Exception("could not find any checkpoints in the model dir specified")

def get_new_ckpts(current_ckpts):
  """Find the model folders whose latest checkpoint differs from the recorded one.

  Args:
    current_ckpts: 2D list indexed as [batch size index][sequence length index]
      holding the checkpoint previously recorded for each combination of
      BATCH_SIZES and MAX_SEQ_LENGTHS.

  Returns:
    A list of [batch size index, sequence length index] pairs (each pair is a
    list) for every combination whose latest checkpoint is not equal to the
    entry in `current_ckpts`.

  Raises:
    Exception: if both checkpoint lookups fail for a folder.
  """
  new_ckpts = []
  for B,BATCH_SIZE in enumerate(BATCH_SIZES):
    for M,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
      BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
      try:
        latest_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR)
      except Exception:
        try:
          latest_ckpt = latest_checkpoint(BERT_GCS_DIR)
        except Exception:
          raise Exception("could not find any checkpoints in the model dir specified")
      ## Compare outside the try blocks so that an indexing problem is not
      ## misreported as a missing checkpoint.
      if current_ckpts[B][M] != latest_ckpt:
        new_ckpts.append([B,M])
  return new_ckpts

## Main evaluation/prediction loop over every (batch size, max sequence length)
## combination. Runs once when REPEAT_LOOP is off; otherwise keeps polling
## and only processes combinations whose latest checkpoint has changed.
while True:
  sleeping = True  ##to prevent excessive interaction with GCS,
                   ##if no eval/pred loop succeeds in this pass, the script
                   ##will wait for a while before trying again

  if REPEAT_LOOP:                             ##if using REPEAT_LOOP, only evaluate on new checkpoints
    new_ckpts = get_new_ckpts(current_ckpts)
    if len(new_ckpts) == 0:
      tf.logging.info("No new checkpoints have been written since script start/last evaluation. Trying again in another "+str(CHECK_MODEL_EVERY_N_SECS)+" seconds.")

  for B,BATCH_SIZE in enumerate(BATCH_SIZES):
    for M,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):

      if REPEAT_LOOP and [B,M] not in new_ckpts:
        continue

      tf.logging.info("\n\n\nBATCH SIZE:"+str(BATCH_SIZE)+
          "\nINPUT MAX SEQ LENGTH:"+str(MAX_SEQ_LENGTH))

      MODEL = MODEL_ARCHITECTURE
      current_ORIG_DATA_FOLDER= ORIG_DATA_FOLDER+"/"+str(MAX_SEQ_LENGTH)

      BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
      DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

      EVAL_WHILE_PREDICT_PREDICTIONS_DIR = BUCKET_PATH+"/"+PREDICTIONS_FOLDER+"/"+RUN_NAME+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
      EVALUATIONS_DIR = EVALUATIONS_FOLDER+"/"+RUN_NAME+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
      PREDICTIONS_DIR = PREDICTIONS_FOLDER+"/"+RUN_NAME+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)
      CONFIG_FILE = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)+"/config.json"

      ## Load the data info dictionary only the first time this combination is
      ## processed; the handle is closed as soon as the JSON has been parsed.
      if DATA_INFOS[B][M] == "N/A":
        with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
          DATA_INFOS[B][M] = json.load(info_file)

      EX_DATA_NUM = DATA_INFOS[B][M]["ex_data_num"] if USING_EX_DATA else 0

      ##run the evaluation/prediction loop (evaluation_loop is defined outside
      ##this section; presumably the third return value is the checkpoint it used)
      sucess,total_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              MAX_SEQ_LENGTH,
                              current_ORIG_DATA_FOLDER,
                              BERT_GCS_DIR,
                              USE_LATEST,
                              CHECKPOINT_STEP,
                              DATA_GCS_DIR,
                              USING_SHARDS,
                              START_SHARD,
                              USING_EX_DATA,
                              EX_DATA_NUM,
                              EVAL_WHILE_PREDICT_PREDICTIONS_DIR,
                              PREDICTIONS_DIR,
                              EVALUATIONS_DIR,
                              CONFIG_FILE)

      current_ckpts[B][M] = current_ckpt
      if sucess:
        sleeping = False    ##something was processed, so poll again right away
  time.sleep(CHECK_MODEL_EVERY_N_SECS if sleeping else 0)
  if not REPEAT_LOOP:
    break
## After a one-off evaluation run, log every metric that was collected,
## grouped by the key under which it was stored in total_metrics.
if RUN_EVAL and not REPEAT_LOOP:
  tf.logging.info("Printing all metrics...\n\n")
  for results_dir in total_metrics:
    tf.logging.info("Printing metrics for:"+results_dir+"\n")
    for metric_name,metric_value in total_metrics[results_dir].items():
      tf.logging.info(metric_name+":"+str(metric_value))
    tf.logging.info("\n")



### Just one model

In [None]:
#@markdown ### IO config
#@markdown Folder for where to load the finetuned model from
FINETUNED_MODEL_DIR = "bert_model_mrpc_all_preds_12L_try7" #@param {type:"string"}
#@markdown Model architecture to use: BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture
MODEL_ARCHITECTURE = BertModelModified #@param
#@markdown Which folder inside of PREDICTIONS_FOLDER and EVALUATIONS_FOLDER to write predictions and evaluations, respectively, into:
RUN_NAME = "MRPC_all_preds_12L_try7" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Evaluation/prediction procedure config
#@markdown Whether to evaluate on the test set or the dev set ("test" or "dev")
dataset = "test" #@param{type:"string"}
#@markdown Whether or not to run evaluation
RUN_EVAL = False #@param {type:"boolean"}
#@markdown Whether or not to run prediction (in a separate loop from evaluation; EVALUATE_WHILE_PREDICT will override this value to False)
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown Whether or not to repeat this operation in a loop (if performing parallel evaluation operation, set to True, False otherwise)
REPEAT_LOOP = False #@param {type:"boolean"}
#@markdown When using REPEAT_LOOP, how long to wait in between each loop before checking again for updated train progress:
CHECK_MODEL_EVERY_N_SECS =  20#@param {type:"integer"}
#@markdown If evaluating, whether or not to evaluate and predict results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is significant (if yes, prediction results will be written in the form of tfevent files into GCS that need to be viewed using the notebook titled "mutformer processing and viewing finetuning results")
#@markdown 
#@markdown Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown What batch size to use during evaluation (larger batch size will increase evaluation speed but may skip more datapoints)
EVAL_BATCH_SIZE = 64 #@param {type:"integer"}
#@markdown What sequence length to use
MAX_SEQ_LENGTH =  512#@param {type:"integer"}
#@markdown Whether or not testing/evaluating data was generated in shards
USING_SHARDS = False #@param {type:"boolean"}
#@markdown * If using shards, which shard index to start at (default 0 for first shard)
START_SHARD = 0 #@param {type:"integer"}
#@markdown Whether to use the latest checkpoint in the folder (set to false if an intermediate checkpoint should be used)
USE_LATEST = True #@param {type:"boolean"}
#@markdown * If not using latest checkpoint, which step's checkpoint to use
CHECKPOINT_STEP = 10000 #@param {type:"integer"}

total_metrics = {}  ## metrics gathered for the evaluated model; handed to evaluation_loop
                    ## and printed once a non-repeating evaluation run has finished

## Only the "test" and "dev" splits have a tf_record file name associated with them.
if dataset not in ("test", "dev"):
  raise Exception("only datasets supported are dev and test")
evaluating_file = "test.tf_record" if dataset == "test" else "eval.tf_record"

## Remember the newest checkpoint present in the model folder at startup, so
## later polling can tell whether a different checkpoint has appeared since.
BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR
try:
  current_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR)
except Exception:     ##deliberately not a bare except: a manual interrupt must stop
                      ##the cell instead of silently triggering the fallback
  try:
    current_ckpt = latest_checkpoint(BERT_GCS_DIR)   ##fallback helper defined outside this section
  except Exception:
    raise Exception("could not find any checkpoints in the model dir specified")

def get_new_ckpt(current_ckpt):
  """Report whether the model folder's latest checkpoint differs from `current_ckpt`.

  Args:
    current_ckpt: the checkpoint previously recorded for the model folder
      BUCKET_PATH/FINETUNED_MODEL_DIR.

  Returns:
    True if the latest checkpoint found in the folder is not equal to
    `current_ckpt`, False otherwise.

  Raises:
    Exception: if both checkpoint lookups fail.
  """
  BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR
  try:
    new_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR)
  except Exception:   ##not a bare except, so a manual interrupt is not swallowed
    try:
      new_ckpt = latest_checkpoint(BERT_GCS_DIR)   ##fallback helper defined outside this section
    except Exception:
      raise Exception("could not find any checkpoints in the model dir specified")
  return new_ckpt != current_ckpt


## Main evaluation/prediction loop for the single model folder. Runs once when
## REPEAT_LOOP is off; otherwise keeps polling and only evaluates again after
## the folder's latest checkpoint has changed.
while True:
  if REPEAT_LOOP:
    if not get_new_ckpt(current_ckpt):
      tf.logging.info("No new checkpoints have been written since script start/last evaluation. Trying again in another "+str(CHECK_MODEL_EVERY_N_SECS)+" seconds.")
      time.sleep(CHECK_MODEL_EVERY_N_SECS)
      continue
  MODEL = MODEL_ARCHITECTURE
  current_ORIG_DATA_FOLDER = ORIG_DATA_FOLDER+"/"+str(MAX_SEQ_LENGTH)

  BERT_GCS_DIR = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR
  DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

  EVAL_WHILE_PREDICT_PREDICTIONS_DIR = BUCKET_PATH+"/"+PREDICTIONS_FOLDER+"/"+RUN_NAME
  EVALUATIONS_DIR = EVALUATIONS_FOLDER+"/"+RUN_NAME
  PREDICTIONS_DIR = PREDICTIONS_FOLDER+"/"+RUN_NAME
  CONFIG_FILE = BUCKET_PATH+"/"+FINETUNED_MODEL_DIR+"/config.json"

  ##get the data info dictionary; the handle is closed as soon as the JSON has been parsed
  with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
    DATA_INFO = json.load(info_file)
  EX_DATA_NUM = DATA_INFO["ex_data_num"] if USING_EX_DATA else 0

  ##run the evaluation/prediction loop (evaluation_loop is defined outside
  ##this section; presumably the third return value is the checkpoint it used)
  sucess,total_metrics,current_ckpt = \
          evaluation_loop(RUN_EVAL,
                          RUN_PREDICTION,
                          EVALUATE_WHILE_PREDICT,
                          dataset,
                          MODEL,
                          total_metrics,
                          MAX_SEQ_LENGTH,
                          current_ORIG_DATA_FOLDER,
                          BERT_GCS_DIR,
                          USE_LATEST,
                          CHECKPOINT_STEP,
                          DATA_GCS_DIR,
                          USING_SHARDS,
                          START_SHARD,
                          USING_EX_DATA,
                          EX_DATA_NUM,
                          EVAL_WHILE_PREDICT_PREDICTIONS_DIR,
                          PREDICTIONS_DIR,
                          EVALUATIONS_DIR,
                          CONFIG_FILE)

  if not sucess and REPEAT_LOOP:                        ##to prevent excessive interaction with GCS,
    time.sleep(CHECK_MODEL_EVERY_N_SECS)                ##if an eval/pred loop fails, the script
    continue                                            ##will wait for a while before trying again

  if not REPEAT_LOOP:
    break
## After a one-off evaluation run, log every metric that was collected,
## grouped by the key under which it was stored in total_metrics.
if RUN_EVAL and not REPEAT_LOOP:
  tf.logging.info("Printing all metrics...\n\n")
  for results_dir in total_metrics:
    tf.logging.info("Printing metrics for:"+results_dir+"\n")
    for metric_name,metric_value in total_metrics[results_dir].items():
      tf.logging.info(metric_name+":"+str(metric_value))
    tf.logging.info("\n")

