# Configure settings

In [None]:
#@markdown ## General Config
#@markdown Which task to perform: options are "MRPC" for paired sequence method, "RE" for single sequence method, or "NER" for single sequence per residue prediction (if you add more modes make sure to change the corresponding code segments)
MODE = "MRPC" #@param {type:"string"}
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
PROCESSES = 2 #@param {type:"integer"}
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
#@markdown ###### For if multiple models are being evaluated: xxx is the placeholder for the individual model identifier (if only one is being evaluated replace xxx with the actual name of the model)
#@markdown \
#@markdown folder for where to save the finetuned model
MODEL_DIR_format = "bert_model_mrpc_xxx" #@param {type:"string"}
#@markdown folder for the pretrained model
INIT_MODEL_DIR_format = "bert_model_xxx" #@param {type:"string"}
DATA_DIR_format = "MRPC_all_snps" #@param {type:"string"}
LOGGING_DIR = "mrpc_loss_spam_model_comparison_final" #@param {type:"string"}
#@markdown specify a header for all output locations (set to "" to disable)
RUN_NAME_format = "MRPC_xxx" #@param {type:"string"}
VOC_FNAME = "vocab.txt" #@param {type:"string"}

#@markdown ### Training procedure config
EVAL_BATCH_SIZE =  64 #@param {type:"integer"}
NUM_TPU_CORES = 8 #@param {type:"integer"}

# Clone the repo

In [None]:
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://tianqitheodorejiang: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@github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

# Imports

In [None]:
%tensorflow_version 1.x
print("Authorize for GCS:")
auth.authenticate_user()
print("Authorize done")

import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import time


print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

from glob import glob
from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

# Replace any stale local copy of the model code with the one from the
# freshly cloned repository.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Move "mutformer" to the end of sys.path: drop its first occurrence (if any)
# and append it again.
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_classifier import MrpcProcessor,REProcessor ##change this part if you add more modes--
from mutformer.run_ner_for_pathogenic import NERProcessor      ##--

# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Start from a clean handler list so that re-running this cell does not
# attach the same handlers again and duplicate every message.
log.handlers = []

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = True #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # A bare file name has no parent directory; os.makedirs("") would raise,
  # so the directory is only created when there is one to create.
  file_logging_dir = os.path.dirname(FILE_LOGGING_PATH)
  if file_logging_dir and not os.path.exists(file_logging_dir):
    os.makedirs(file_logging_dir)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

# Mirror the same records to the notebook output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)

# A TPU runtime is mandatory: stop right away when Colab did not expose one.
if 'COLAB_TPU_ADDR' not in os.environ:
  raise Exception('Not connected to TPU runtime. TPU runtime must be used ')

log.info("Using TPU runtime")
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

with tf.Session(TPU_ADDRESS) as session:
  log.info('TPU address is ' + TPU_ADDRESS)
  # Read the credentials file and hand its contents to the TPU session so
  # that GCS access is configured for it.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)


##change this part if you added more modes
# Reject unknown modes first, then pick the processor and the module that
# provides the model/input function builders for the chosen mode.
if MODE not in ("MRPC", "RE", "NER"):
  raise Exception("The mode specified was not one of the available modes: [\"MRPC\", \"RE\",\"NER\"].")

if MODE=="NER":
  processor = NERProcessor()
  script = run_ner_for_pathogenic
else:
  # Both "MRPC" and "RE" are served by run_classifier.
  processor = MrpcProcessor() if MODE=="MRPC" else REProcessor()
  script = run_classifier

label_list = processor.get_labels()


# Specify Data location/Mount Drive if needed (for autodetecting number of steps if doing evaluation later)


In [None]:
import os
import shutil

#@markdown folder in drive where the eval/test data is stored (can be a GCS path for large database inference, in this case, the folder path isn't actually used since prediction should be performed); xxx is the placeholder for the sequence length (if only using one single test set just put the actual folder here without xxx)
data_folder_format = "gs://theodore_jiang/MRPC_all_snp_benchmark" #@param {type: "string"}from google.colab import drive,auth
if "/content/drive" in data_folder_format:
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  FILES_PATH = "/content/drive/My Drive"
else:
  FILES_PATH = "gs://"+BUCKET_NAME






# Run Eval/prediction

The following section can perform evaluation and prediction on either the eval dataset or the test dataset. There are three different code segments to run:\
1.For if you benchmarked model/sequence length during finetuning and wish to evaluate each model \
2.For if you benchmarked sequence length/batch size during finetuning and wish to evaluate each model \
3.For only evaluating/predicting using a single model

Choose a desired code segment to run, select the desired options for evaluating/predicting and run only that specific code segment

Note: All evaluation results will be written into the previously specified logging directory under the mounted google drive. To view the results, use the colab notebook titled "mutformer processing and viewing finetuning results"

Depending on whether or not EVALUATE_WHILE_PREDICT is used, prediction results will either be written into GCS or google drive. To view them, the colab notebook titled "mutformer processing and viewing finetuning results" can also be used

### General Setup and definitions

In [None]:
#@markdown when testing on the "test" dataset, whether or not to ensure all datapoints are predicted (if so, make sure this option was also specified as True during data generation)
PRECISE_TESTING = False #@param {type:"boolean"}
#@markdown maximum batch size the runtime can handle during prediction without OOM for all models being evaluated/tested (for these models on a colab runtime it's about 1024)
MAX_BATCH_SIZE =  1024 #@param {type:"integer"}

def write_metrics(metrics,dir):
  gs = metrics["global_step"]
  print("global step",gs)

  tf.compat.v1.disable_eager_execution()
  tf.reset_default_graph()  
  for key,value in metrics.items():
    print(key,value)
    x_scalar = tf.constant(value)
    first_summary = tf.summary.scalar(name=key, tensor=x_scalar)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(dir)
        sess.run(init)
        summary = sess.run(first_summary)
        writer.add_summary(summary, gs)
        writer.flush()
        print('Done with writing the scalar summary')
    time.sleep(1)
  if not os.path.exists(FILES_PATH+"/"+dir):
    os.makedirs(FILES_PATH+"/"+dir)
  cmd = "cp -r \""+dir+"/.\" \""+FILES_PATH+"/"+dir+"\""
  !{cmd}

def write_predictions(PREDICTIONS_FOLDER,
                      RESTORE_MODEL_NAME,
                      result,
                      result_trailing):
  """Write predictions as tab-separated "key:value" lines into a text file.

  The file is created at
  FILES_PATH/PREDICTIONS_FOLDER/<RUN_NAME_format with "xxx" replaced by
  RESTORE_MODEL_NAME>_predictions.txt.

  Args:
    PREDICTIONS_FOLDER: folder (relative to FILES_PATH) to write into.
    RESTORE_MODEL_NAME: identifier substituted for "xxx" in RUN_NAME_format.
    result: iterable of dicts, one per prediction.
    result_trailing: optional second iterable of prediction dicts that is
      appended after `result`; skipped when falsy (e.g. None).
  """
  predictions_dir = FILES_PATH+"/"+PREDICTIONS_FOLDER
  # tf.gfile works for local paths as well as gs:// URLs, whereas os.makedirs
  # would create a stray local "gs:" directory when FILES_PATH is a bucket.
  # MakeDirs succeeds when the directory already exists.
  tf.gfile.MakeDirs(predictions_dir)

  output_path = predictions_dir+"/"+RUN_NAME_format.replace("xxx",RESTORE_MODEL_NAME)+"_predictions.txt"
  with tf.gfile.Open(output_path, "w") as writer:
    tf.logging.info("***** Predict results *****")
    prediction_sets = [result]
    if result_trailing:
      prediction_sets.append(result_trailing)
    for predictions in prediction_sets:
      for prediction in predictions:
        output_line = "\t".join([str(k)+":"+str(v) for k,v in prediction.items()]) + "\n"
        writer.write(output_line)

## Lookup table from model identifier to the architecture class to build:
## BertModel is a classic BERT, BertModelModified has the convs for
## multi-residue "vocabulary".
name2model = dict(
    modified_large=BertModelModified,
    modified_medium=BertModelModified,
    modified=BertModelModified,
    orig=BertModel,
    large=BertModel,
)


def evaluation_loop(RUN_EVAL,
                    RUN_PREDICTION,
                    RESTORE_MODEL_NAME,
                    EVALUATE_WHILE_PREDICT,
                    dataset,
                    MODEL,
                    total_metrics,
                    current_ckpt,
                    DATA_SEQ_LENGTH,
                    current_data_folder_eval,
                    BERT_GCS_DIR,
                    DATA_GCS_DIR_EVAL,
                    USING_SHARDS,
                    GCS_PREDICTIONS_DIR,
                    GCS_LOGGING_DIR,
                    LOCAL_LOGGING_DIR,
                    CONFIG_FILE):

  print("Using data from:",DATA_GCS_DIR_EVAL)
  if RUN_EVAL:
    if dataset=="dev":
      try:
        data_path_eval = "/content/drive/My Drive/"+current_data_folder_eval+"/dev.tsv"
        lines = open(data_path_eval).read().split("\n")
        EVAL_STEPS = int(len(lines)/EVAL_BATCH_SIZE)
      except:
        def steps_getter(input_files):
          tot_sequences = 0
          for input_file in input_files:
            print("reading:",input_file)

            d = tf.data.TFRecordDataset(input_file)

            with tf.Session() as sess:
              tot_sequences+=sess.run(d.reduce(0, lambda x,_: x+1))

          return tot_sequences
        SEQUENCES_PER_EPOCH = steps_getter([DATA_GCS_DIR_EVAL+"/eval.tf_record"])
        EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)
    else:
      def steps_getter(input_files):
        tot_sequences = 0
        for input_file in input_files:
          print("reading:",input_file)

          d = tf.data.TFRecordDataset(input_file)

          with tf.Session() as sess:
            tot_sequences+=sess.run(d.reduce(0, lambda x,_: x+1))

        return tot_sequences
      SEQUENCES_PER_EPOCH = steps_getter([DATA_GCS_DIR_EVAL+"/test.tf_record"])
      EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)

  print("eval steps:",EVAL_STEPS)

  
  if EVALUATE_WHILE_PREDICT:
    cmd = "gsutil -m rm -r "+GCS_PREDICTIONS_DIR
    !{cmd}


  RESTORE_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

  if RUN_EVAL:
    if RESTORE_CHECKPOINT==current_ckpt:
      return False,None,current_ckpt

  current_ckpt=RESTORE_CHECKPOINT

  config = modeling.BertConfig.from_json_file(CONFIG_FILE)

  model_fn = script.model_fn_builder(
      bert_config=config,
      logging_dir=GCS_LOGGING_DIR,
      num_labels=len(label_list),
      init_checkpoint=None,
      restore_checkpoint=RESTORE_CHECKPOINT,
      init_learning_rate=0,
      decay_per_step=0,
      num_warmup_steps=10,
      use_tpu=True,
      use_one_hot_embeddings=True,
      bert=MODEL,
      test_results_dir=GCS_PREDICTIONS_DIR,
      yield_predictions=EVALUATE_WHILE_PREDICT)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=BERT_GCS_DIR,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
  def max_multiple_under_value(max_value,multiple_base):
      return int(max_value/multiple_base)

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=1,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=MAX_BATCH_SIZE)
    
  eval_file = os.path.join(DATA_GCS_DIR_EVAL, evaluating_file)

  eval_input_fn = script.file_based_input_fn_builder(
        input_file=eval_file,
        shards_folder=DATA_GCS_DIR_EVAL if USING_SHARDS else None,
        seq_length=DATA_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)


  tf.logging.info("***** Running evaluation *****")
  tf.logging.info("  Batch size = %d", EVAL_BATCH_SIZE)

  try:
    if RUN_EVAL:
      eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=EVAL_STEPS)
      print("\n\n\n\n\n\nEVAL METRICS:")
      for k,v in eval_metrics.items():
        print(k+":",v)
      print("\n\n\n\n\n\n\n")
      if dataset == "dev":
        write_metrics(eval_metrics,LOCAL_LOGGING_DIR)
      else:
        total_metrics[LOCAL_LOGGING_DIR] = eval_metrics
    if RUN_PREDICTION:
      result=estimator.predict(input_fn=eval_input_fn)
      if PRECISE_TESTING and dataset=="test":
        run_config_trailing = tf.contrib.tpu.RunConfig(
          cluster=tpu_cluster_resolver,
          model_dir=BERT_GCS_DIR,
          tpu_config=tf.contrib.tpu.TPUConfig(
              num_shards=1,
              per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

        estimator_trailing = tf.contrib.tpu.TPUEstimator(
            use_tpu=True,
            model_fn=model_fn,
            config=run_config_trailing,
            train_batch_size=1,
            predict_batch_size=1)
        test_file_trailing = os.path.join(DATA_GCS_DIR_EVAL, "test_trailing.tf_record")
        test_input_fn_trailing = script.file_based_input_fn_builder(
            input_file=test_file_trailing,
            seq_length=DATA_SEQ_LENGTH,
            is_training=False,
            drop_remainder=True)
        result_trailing=estimator_trailing.predict(input_fn=test_input_fn_trailing)
      else:
        result_trailing = None
      write_predictions(PREDICTIONS_FOLDER,
                        RESTORE_MODEL_NAME,
                        result,
                        result_trailing)
    return True,total_metrics,current_ckpt
  except Exception as e:
    print("FAILED:",e)
    return False,None,current_ckpt

### Model/Sequence Length

In [None]:
#@markdown whether or not to run evaluation
RUN_EVAL = False #@param {type:"boolean"}
#@markdown whether or not to run prediction in a seperate loop from evaluation (if using EVALUATE_WHILE_PREDICT, set to False)
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown if evaluating, whether or not to evaluate and write test results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is siginificant (the evalution loop itself will be slower due to writing tfevents) (if yes, prediction results will be written in the form of tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them)
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown whether or not testing/evaluating data was generated in shards (for really large databases)
USING_SHARDS = True #@param {type:"boolean"}
#@markdown what folder to write predictions into (if using EVALUATE_WHILE_PREDICT, predictions will be written into this folder under GCS, otherwise predictions will be written to this folder under google drive)
PREDICTIONS_FOLDER = "mrpc_loss_spam_model_comparison_final_predictions" #@param {type:"string"}
#@markdown #####Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
#@markdown \
#@markdown ###### whether to evaluate/predict on the test set or the dev set ("test" or "dev") (test set will only run once, dev set will run continuously)
dataset = "test" #@param{type:"string"}
#@markdown ###### if using test set, which model ids to evaluate (eval set will only run on the active model, test will run on specified models)
models = ["orig","large","modified"] #@param
#@markdown ###### if using test set, which sequence lengthed models to evaluate
lengths = [256,512,1024] #@param

if dataset=="test":                  ## a dictionary for all metrics to
  evaluating_file = "test.tf_record" ## print at the end during testing,
  total_metrics = {}                 ## not necessary during evaluation
elif dataset=="dev":
  evaluating_file = "eval.tf_record"
  # Must be defined in "dev" mode as well, since it is passed to
  # evaluation_loop below.
  total_metrics = None
else:
  raise Exception("only datasets supported are dev and test")

current_ckpt = "N/A"

while True:
  for MODEL_NAME in models:
    for DATA_SEQ_LENGTH in lengths:
      if dataset == "dev":
        try:
          ##reading the identifiers from drive written by the training script to know what to evaluate
          ## (tf.gfile reads local paths as well as gs:// URLs; surrounding
          ## whitespace is stripped so that the name2model lookup succeeds)
          with tf.gfile.Open(FILES_PATH+"/finetuning_run_paired_model.txt") as identifier_file:
            MODEL_NAME = identifier_file.read().strip()
          with tf.gfile.Open(FILES_PATH+"/finetuning_run_paired_seq_length.txt") as identifier_file:
            DATA_SEQ_LENGTH = int(identifier_file.read())
        except Exception:
          print("Models haven't started training yet...checking again in 60 seconds")
          time.sleep(60)
          continue

      print("\n\n\nMODEL NAME:",MODEL_NAME,
            "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH)

      MODEL = name2model[MODEL_NAME]
      current_data_folder_eval= data_folder_format.replace("xxx",str(DATA_SEQ_LENGTH))
      RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH))

      BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format.replace("xxx",MODEL_NAME)+"_"+str(DATA_SEQ_LENGTH))
      DATA_GCS_DIR_EVAL = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))

      GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))
      LOCAL_LOGGING_DIR = "{}/{}".format(LOGGING_DIR, RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))
      GCS_PREDICTIONS_DIR = "{}/{}".format(BUCKET_PATH, PREDICTIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))

      CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))

      ##run the evaluation/prediction loop
      sucess,returned_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              RESTORE_MODEL_NAME,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              current_ckpt,
                              DATA_SEQ_LENGTH,
                              current_data_folder_eval,
                              BERT_GCS_DIR,
                              DATA_GCS_DIR_EVAL,
                              USING_SHARDS,
                              GCS_PREDICTIONS_DIR,
                              GCS_LOGGING_DIR,
                              LOCAL_LOGGING_DIR,
                              CONFIG_FILE)

      if not sucess:
        ## keep the metrics collected so far; a failed run must not discard them
        time.sleep(30)
        continue
      total_metrics = returned_metrics

      if dataset=="dev":
        break
    if dataset=="dev":
      break
  if dataset=="test":
    break
if dataset == "test" and RUN_EVAL:
  for logging_dir,metrics in total_metrics.items():
    print("Printing metrics for:",logging_dir,"\n")
    for key,metric in metrics.items():
      print(key+":",metric)
    print("\n")



### Batch Size/Sequence Length

In [None]:
#@markdown whether or not to run evaluation
RUN_EVAL = False #@param {type:"boolean"}
#@markdown whether or not to run prediction in a seperate loop from evaluation (if using EVALUATE_WHILE_PREDICT, set to False)
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown if evaluating, whether or not to evaluate and write test results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is siginificant (the evalution loop itself will be slower due to writing tfevents) (if yes, prediction results will be written in the form of tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them)
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown whether or not testing/evaluating data was generated in shards (for really large databases)
USING_SHARDS = True #@param {type:"boolean"}
#@markdown what folder to write predictions into (if using EVALUATE_WHILE_PREDICT, predictions will be written into this folder under GCS, otherwise predictions will be written to this folder under google drive)
PREDICTIONS_FOLDER = "mrpc_loss_spam_model_comparison_final_predictions" #@param {type:"string"}
#@markdown #####Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
#@markdown \
#@markdown ###### whether to evaluate on the test set or the dev set ("test" or "dev") (test set will only run once, dev set will run continuously)
dataset = "test" #@param{type:"string"}
#@markdown which model id to evaluate
MODEL_NAME="modified_large" #@param {type:"string"}
#@markdown ###### if using test set, which batch sized models to evaluate (eval will only run on the active model, test will run on specified models)
batch_sizes = [32,16,64] #@param
#@markdown ###### if using test set, which sequence lengthed models to evaluate
lengths = [256,512,1024] #@param

if dataset=="test":                  ## a dictionary for all metrics to 
  evaluating_file = "test.tf_record" ## print at the end during testing,
  total_metrics = {}                 ## not necessary during evaluation   
elif dataset=="dev":
  evaluating_file = "eval.tf_record"
  total_metrics = None
else:
  raise Exception("only datasets supported are dev and test")

current_ckpt = "N/A"

while True:
  for BATCH_SIZE in batch_sizes:
    for DATA_SEQ_LENGTH in lengths:
      if dataset == "dev":
        try:
          ##reading the identifiers from drive written by the training script to know what to evaluate
          ## (tf.gfile reads local paths as well as gs:// URLs)
          with tf.gfile.Open(FILES_PATH+"/finetuning_run_paired_batch_size.txt") as identifier_file:
            BATCH_SIZE = int(identifier_file.read())
          with tf.gfile.Open(FILES_PATH+"/finetuning_run_paired_seq_length.txt") as identifier_file:
            DATA_SEQ_LENGTH = int(identifier_file.read())
        except Exception:
          print("Models haven't started training yet...checking again in 60 seconds")
          time.sleep(60)
          continue
      print("\n\n\nMODEL NAME:",MODEL_NAME,
            "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH,
            "\nBATCH_SIZE_FINETUNED_ON:",BATCH_SIZE,"\n\n\n")

      MODEL = name2model[MODEL_NAME]
      current_data_folder_eval= data_folder_format.replace("xxx",str(DATA_SEQ_LENGTH))
      RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE))

      BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, RESTORE_MODEL_NAME)
      DATA_GCS_DIR_EVAL = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))

      GCS_PREDICTIONS_DIR = "{}/{}".format(BUCKET_PATH, PREDICTIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))
      GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))
      LOCAL_LOGGING_DIR = "{}/{}".format(LOGGING_DIR, RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))

      CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))

      ##run the evaluation/prediction loop
      sucess,returned_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              RESTORE_MODEL_NAME,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              current_ckpt,
                              DATA_SEQ_LENGTH,
                              current_data_folder_eval,
                              BERT_GCS_DIR,
                              DATA_GCS_DIR_EVAL,
                              USING_SHARDS,
                              GCS_PREDICTIONS_DIR,
                              GCS_LOGGING_DIR,
                              LOCAL_LOGGING_DIR,
                              CONFIG_FILE)

      if not sucess:
        ## keep the metrics collected so far; a failed run must not discard them
        time.sleep(30)
        continue
      total_metrics = returned_metrics
      if dataset=="dev":
        break
    if dataset=="dev":
      break
  if dataset=="test":
    break
if dataset == "test" and RUN_EVAL:
  for logging_dir,metrics in total_metrics.items():
    print("Printing metrics for:",logging_dir,"\n")
    for key,metric in metrics.items():
      print(key+":",metric)
    print("\n")



### Just one model

In [None]:
#@markdown whether or not to run evaluation
RUN_EVAL = True #@param {type:"boolean"}
#@markdown whether or not to run prediction in a seperate loop from evaluation (if using EVALUATE_WHILE_PREDICT, set to False)
RUN_PREDICTION = False #@param {type:"boolean"}
#@markdown if evaluating, whether or not to evaluate and write test results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is siginificant (the evalution loop itself will be slower due to writing tfevents) (if yes, prediction results will be written in the form of tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them)
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown whether or not testing/evaluating data was generated in shards (for really large databases)
USING_SHARDS = True #@param {type:"boolean"}
#@markdown what folder to write predictions into (if using EVALUATE_WHILE_PREDICT, predictions will be written into this folder under GCS, otherwise predictions will be written to this folder under google drive)
PREDICTIONS_FOLDER = "all_snp_prediction" #@param {type:"string"}
#@markdown #####Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
#@markdown \
#@markdown ###### whether to evaluate on the test set or the dev set ("test" or "dev") (test set will only run once, dev set will run continuously)
dataset = "test" #@param{type:"string"}
#@markdown which model id to evaluate
MODEL_NAME="modified_medium" #@param {type:"string"}
#@markdown ###### if using test set, which sequence lengthed models to evaluate
DATA_SEQ_LENGTH = 512 #@param

if dataset=="test":                  ## a dictionary for all metrics to
  evaluating_file = "test.tf_record" ## print at the end during testing,
  total_metrics = {}                 ## not necessary during evaluation
elif dataset=="dev":
  evaluating_file = "eval.tf_record"
  # Must be defined in "dev" mode as well, since it is passed to
  # evaluation_loop below.
  total_metrics = None
else:
  raise Exception("only datasets supported are dev and test")

current_ckpt = "N/A"

while True:
  print("\n\n\nMODEL NAME:",MODEL_NAME,
      "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH)
  MODEL = name2model[MODEL_NAME]
  ## with a single model the *_format settings are used as they are: only
  ## RESTORE_MODEL_NAME has its "xxx" placeholder substituted
  current_data_folder_eval= data_folder_format
  RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx",MODEL_NAME)

  BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format)  
  DATA_GCS_DIR_EVAL = "{}/{}".format(BUCKET_PATH, DATA_DIR_format)

  GCS_PREDICTIONS_DIR = "{}/{}".format(BUCKET_PATH, PREDICTIONS_FOLDER+"/"+RUN_NAME_format)
  GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format)
  LOCAL_LOGGING_DIR = "{}/{}".format(LOGGING_DIR, RUN_NAME_format)

  CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format)

  ##run the evaluation/prediction loop
  sucess,returned_metrics,current_ckpt = \
          evaluation_loop(RUN_EVAL,
                          RUN_PREDICTION,
                          RESTORE_MODEL_NAME,
                          EVALUATE_WHILE_PREDICT,
                          dataset,
                          MODEL,
                          total_metrics,
                          current_ckpt,
                          DATA_SEQ_LENGTH,
                          current_data_folder_eval,
                          BERT_GCS_DIR,
                          DATA_GCS_DIR_EVAL,
                          USING_SHARDS,
                          GCS_PREDICTIONS_DIR,
                          GCS_LOGGING_DIR,
                          LOCAL_LOGGING_DIR,
                          CONFIG_FILE)

  if not sucess:
    ## keep the metrics collected so far; a failed run must not discard them
    time.sleep(30)
    continue
  total_metrics = returned_metrics

  if dataset=="test":
    break
if dataset == "test" and RUN_EVAL:
  for logging_dir,metrics in total_metrics.items():
    print("Printing metrics for:",logging_dir,"\n")
    for key,metric in metrics.items():
      print(key+":",metric)
    print("\n")

