#Finetuning Evaluation and Prediction Script

This notebook evaluates and performs predictions on test data using finetuned models.

# Configure settings

In [None]:
#@markdown ## General Config
#@markdown In the case that an inference database is large and a long duration of continuous runtime is required, a GCP TPU/runtime to run this notebook may be desirable. If that's the case, specify here:
USE_GCP_TPU = False #@param {type:"boolean"}
#@markdown Which task to perform: options are "MRPC" for paired sequence method, "MRPC_w_ex_data" for the paired sequence method handled by MrpcWithPredsProcessor, "RE" for single sequence method, or "NER" for single sequence per residue prediction (if you add more modes make sure to change the corresponding code segments)
MODE = "MRPC_w_ex_data" #@param {type:"string"}
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
PROCESSES = 2 #@param {type:"integer"}
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
#@markdown ###### For if multiple models are being evaluated: xxx is the placeholder for the individual model identifier (if only one is being evaluated replace xxx with the actual name of the model)
#@markdown \
#@markdown folder prefix for where to load the finetuned model from
MODEL_DIR_format = "bert_model_mrpc_adding_preds_only_others_xxx" #@param {type:"string"}
#@markdown folder where the config.json file was stored in
INIT_MODEL_DIR_format = "bert_model_xxx" #@param {type:"string"}
#@markdown folder prefix for where to get the finetuning data
DATA_DIR_format = "MRPC_adding_preds_only_others_xxx" #@param {type:"string"}
#@markdown specify a header for all output locations (set to "" to disable)
RUN_NAME_format = "MRPC_adding_preds_only_others_xxx" #@param {type:"string"}

#@markdown ### Eval procedure config
EVAL_BATCH_SIZE =  64 #@param {type:"integer"}
NUM_TPU_CORES = 8 #@param {type:"integer"}

#If running on a GCP runtime, follow these instructions to set it up

###1) Create a VM from the GCP website
###2) Open a command prompt on your computer and perform the following steps
To ssh into the VM:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Note: Make sure the port above matches the port below (in this case it's 8888)
\
\
Run each of these commands individually, or copy and paste the one command below:
```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
One command:
```
sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
###3) In this notebook, to connect to this runtime, click the "connect to local runtime" option under the connect button, and copy and paste the outputted link with "localhost: ..."
###4) Finally, run this code segment, which creates a TPU


In [None]:
GCE_PROJECT_NAME = "genome-project-319100" #@param {type:"string"}
TPU_ZONE = "us-central1-f" #@param {type:"string"}
TPU_NAME = "mutformer-tpu" #@param {type:"string"}

!gcloud alpha compute tpus create $TPU_NAME --accelerator-type=tpu-v2 --version=1.15.5 --zone=$TPU_ZONE ##create new TPU

!gsutil iam ch serviceAccount:`gcloud alpha compute tpus describe $TPU_NAME | grep serviceAccount | cut -d' ' -f2`:admin gs://theodore_jiang && echo 'Successfully set permissions!' ##give TPU access to GCS

#Clone the repo

In [None]:
if USE_GCP_TPU:
  !sudo apt-get -y install git
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

#Imports/authorize GCP

In [None]:
# When not on the GCP runtime (i.e. on Colab): switch to TensorFlow 1.x and
# authenticate the user so that GCS can be accessed.
if not USE_GCP_TPU:
  %tensorflow_version 1.x
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

import sys
import json
import random
import logging
import tensorflow as tf
import time
import importlib

# Replace any existing local "mutformer" folder with a fresh copy of the model code
# taken from the cloned repo.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Move "mutformer" to the end of sys.path (drop one existing entry, then append).
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

# Import the repo modules first, reload them, and only afterwards bind the classes
# taken from them. importlib.reload() re-executes a module in place but does not
# update names that were already copied out with "from ... import ...", so binding
# the classes before the reload would leave them pointing at the pre-reload code
# when this cell is re-run.
from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic

##reload modules in case that's needed
modules2reload = [modeling, 
                  optimization, 
                  tokenization,
                  run_classifier,
                  run_ner_for_pathogenic]
for module in modules2reload:
    importlib.reload(module)

from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_classifier import MrpcWithPredsProcessor,MrpcProcessor,REProcessor ##change this part if you add more modes--
from mutformer.run_ner_for_pathogenic import NERProcessor      ##--

# configure logging
# The 'tensorflow' logger is sent to the console and, optionally, to a log file;
# both handlers use the same format and the INFO level.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Remove any handlers already attached (e.g. from a previous run of this cell).
log.handlers = []

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = True #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # A bare file name has no directory component; only create a directory when there
  # is one (os.makedirs("") would raise).
  log_dir = os.path.dirname(FILE_LOGGING_PATH)
  if log_dir:
    os.makedirs(log_dir, exist_ok=True)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)

# Work out the TPU address and configure GCS access for the TPU session.
if USE_GCP_TPU:
  # GCP runtime: resolve the TPU from its name, zone and project.
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_NAME, zone=TPU_ZONE, project=GCE_PROJECT_NAME)
  TPU_ADDRESS = tpu_cluster_resolver.get_master()
  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    tf.contrib.cloud.configure_gcs(session)
elif 'COLAB_TPU_ADDR' not in os.environ:
  # Neither a GCP TPU nor a Colab TPU runtime is available.
  raise Exception('Not connected to TPU runtime, TPU required to run mutformer')
else:
  # Colab runtime: the TPU address is published through an environment variable.
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as adc_file:
      auth_info = json.load(adc_file)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)


##change this part if you added more modes
# Maps every supported MODE to (processor class, script module). `script` is the module
# whose model_fn_builder / file_based_input_fn_builder are used by the evaluation loop.
mode_table = {
    "MRPC": (MrpcProcessor, run_classifier),
    "MRPC_w_ex_data": (MrpcWithPredsProcessor, run_classifier),
    "RE": (REProcessor, run_classifier),
    "NER": (NERProcessor, run_ner_for_pathogenic),
}
if MODE not in mode_table:
  # The list of modes is built from the table so the message cannot drift out of date
  # (the previous hard-coded message omitted "MRPC_w_ex_data").
  raise ValueError("The mode specified (\"{}\") was not one of the available modes: {}.".format(MODE, list(mode_table)))
processor_class, script = mode_table[MODE]
processor = processor_class()
label_list = processor.get_labels()


#Specify location preferences for google drive vs GCS/Mount Drive if needed (for autodetecting number of steps if doing evaluation later)




In [None]:
import os
import shutil

#@markdown ###### Note: for all of these, if using USE_GCP_TPU, all of these parameters must use GCS, because a GCP TPU can't access google drive
#@markdown \
#@markdown if not using USE_GCP_TPU, drive path for where the data was stored to detect # of steps in data (this is just to limit interaction with GCS; item can also be left blank and steps will be automatically detected from tfrecords)
data_folder_format = "/content/drive/My Drive/BERT finetuning/MRPC/w_added_modified_bert_mrpc_512" #@param {type: "string"}
DRIVE_PATH = "/content/drive/My Drive"
#@markdown whether to use GCS for communicating with training script, if not, defaults to drive
GCS_COMS = False #@param {type:"boolean"}
#@markdown whether to use GCS for writing predictions, if not, defaults to drive
GCS_PREDICTIONS = False #@param {type:"boolean"}
#@markdown whether to use GCS for writing eval results, if not, defaults to drive
GCS_EVAL = False #@param {type:"boolean"}

COMS_PATH = BUCKET_PATH if GCS_COMS else DRIVE_PATH
PREDS_PATH = BUCKET_PATH if GCS_PREDICTIONS else DRIVE_PATH
EVALS_PATH = BUCKET_PATH if GCS_EVAL else DRIVE_PATH

if USE_GCP_TPU:
  FILES_PATH = BUCKET_PATH

if "/content/drive" in data_folder_format or not GCS_COMS or not GCS_PREDICTIONS or not GCS_EVAL:
  from google.colab import drive,auth
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  






# Run Eval/prediction

The following section will perform evaluation and prediction on either the eval dataset or the test dataset.

###General Setup and definitions

In [None]:
#@markdown when testing on the "test" dataset, whether or not to ensure all datapoints are predicted (if so, make sure this option was also specified as True during data generation)
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown maximum batch size the runtime can handle during prediction without OOM for all models being evaluated/tested (for these models on a colab runtime it's about 1024)
MAX_BATCH_SIZE =  512 #@param {type:"integer"}

def write_metrics(metrics,dir):
  gs = metrics["global_step"]
  print("global step",gs)

  tf.compat.v1.disable_eager_execution()
  tf.reset_default_graph()  
  for key,value in metrics.items():
    print(key,value)
    x_scalar = tf.constant(value)
    first_summary = tf.summary.scalar(name=key, tensor=x_scalar)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(dir)
        sess.run(init)
        summary = sess.run(first_summary)
        writer.add_summary(summary, gs)
        writer.flush()
        print('Done with writing the scalar summary')
    time.sleep(1)
  if not os.path.exists(EVALS_PATH+"/"+dir):
    os.makedirs(EVALS_PATH+"/"+dir)
  if "gs:" in EVALS_PATH:
    cmd = "gsutil cp -r \""+dir+"/.\" \""+EVALS_PATH+"/"+dir+"\""
  else:
    cmd = "cp -r \""+dir+"/.\" \""+EVALS_PATH+"/"+dir+"\""
  !{cmd}

def write_predictions(PREDICTIONS_FOLDER,
                      RESTORE_MODEL_NAME,
                      result,
                      result_trailing,
                      shard_id=""):
  """Write predictions to a text file, one tab-separated line of "key:value" pairs each.

  The file is written to
  PREDS_PATH/PREDICTIONS_FOLDER/<RUN_NAME_format with "xxx" replaced by RESTORE_MODEL_NAME>_predictions<shard_id>.txt

  Args:
    PREDICTIONS_FOLDER: folder (below PREDS_PATH) to write the file into.
    RESTORE_MODEL_NAME: replaces the "xxx" placeholder of RUN_NAME_format in the file name.
    result: iterable of prediction dicts; always written.
    result_trailing: second iterable of prediction dicts written after `result`;
      skipped when falsy (e.g. None).
    shard_id: string appended to the file name before ".txt".
  """
  output_dir = PREDS_PATH+"/"+PREDICTIONS_FOLDER
  # Only create a directory for non-bucket destinations: os.makedirs on a "gs://" path
  # would create a stray local folder. The file itself is written through tf.gfile.
  if "gs:" not in PREDS_PATH:
    os.makedirs(output_dir, exist_ok=True)
  output_path = output_dir+"/"+RUN_NAME_format.replace("xxx",RESTORE_MODEL_NAME)+"_predictions"+shard_id+".txt"

  def format_line(prediction):
    # One prediction dict -> "k1:v1<TAB>k2:v2...\n"
    return "\t".join([str(k)+":"+str(v) for k,v in prediction.items()]) + "\n"

  with tf.gfile.Open(output_path, "w") as writer:
    tf.logging.info("***** Predict results *****")
    for prediction in result:
      writer.write(format_line(prediction))
    if result_trailing:
      for prediction in result_trailing:
        writer.write(format_line(prediction))

## lookup from model name to the architecture class to use:
## BertModel is a classic BERT, while BertModelModified
## has the convs for multi-residue "vocabulary"
name2model = dict.fromkeys(("modified_large", "modified_medium", "modified"), BertModelModified)
name2model.update(dict.fromkeys(("orig", "large"), BertModel))


def evaluation_loop(RUN_EVAL,
                    RUN_PREDICTION,
                    RESTORE_MODEL_NAME,
                    EVALUATE_WHILE_PREDICT,
                    dataset,
                    MODEL,
                    total_metrics,
                    current_ckpt,
                    DATA_SEQ_LENGTH,
                    current_data_folder_eval,
                    BERT_GCS_DIR,
                    USE_LATEST,
                    CHECKPOINT_STEP,
                    DATA_GCS_DIR_EVAL,
                    USING_SHARDS,
                    START_SHARD,
                    USING_PREDS,
                    PRED_NUM,
                    GCS_PREDICTIONS_DIR,
                    GCS_EVALUATIONS_DIR,
                    PREDICTIONS_FOLDER,
                    EVALUATIONS_FOLDER,
                    LOCAL_EVALUATIONS_DIR,
                    CONFIG_FILE):

  print("Using data from:",DATA_GCS_DIR_EVAL)

  eval_file = os.path.join(DATA_GCS_DIR_EVAL, evaluating_file)

  def steps_getter(input_files):
    tot_sequences = 0
    for input_file in input_files:
      print("reading:",input_file)

      d = tf.data.TFRecordDataset(input_file)

      with tf.Session() as sess:
        tot_sequences+=sess.run(d.reduce(0, lambda x,_: x+1))

    return tot_sequences
  if USING_SHARDS:
    shards_folder = DATA_GCS_DIR_EVAL
    input_file = os.path.join(DATA_GCS_DIR_EVAL, evaluating_file)
    import re
    file_name = input_file.split("/")[-1]
    shards = [shards_folder + "/" + file for file in tf.io.gfile.listdir(shards_folder) if
              re.match(file_name + "_\d+", file)]
    shards = sorted(shards,key=lambda shard:int(shard.split("_")[-1]))[START_SHARD:]
  else:
    shards = [eval_file]

  if USING_SHARDS:
    print("\nUSING SHARDs:")
    for shard in shards:
      print(shard)
    print("\n")

  if RUN_EVAL:
    if dataset=="dev":
      try:
        data_path_eval = "/content/drive/My Drive/"+current_data_folder_eval+"/dev.tsv"
        lines = open(data_path_eval).read().split("\n")
        EVAL_STEPS = int(len(lines)/EVAL_BATCH_SIZE)
      except:
        SEQUENCES_PER_EPOCH = steps_getter(shards)
        EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)
    else:
      SEQUENCES_PER_EPOCH = steps_getter(shards)
      EVAL_STEPS = int(SEQUENCES_PER_EPOCH/EVAL_BATCH_SIZE)

    print("eval steps:",EVAL_STEPS)

  
  if EVALUATE_WHILE_PREDICT:
    cmd = "gsutil -m rm -r "+GCS_PREDICTIONS_DIR
    !{cmd}
  if USE_LATEST:
    RESTORE_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
  else:
    latest_ckpt = tf.train.latest_checkpoint(BERT_GCS_DIR).split("/")[-1]
    RESTORE_CHECKPOINT = [".".join(ckpt.split(".")[:-1]) 
                          for ckpt in tf.io.gfile.listdir(BERT_GCS_DIR) 
                          if len(ckpt.split("."))==3 and str(CHECKPOINT_STEP) == ckpt.split(".")[-2].split("-")[-1]][0]
    file_lines = tf.gfile.Open(BERT_GCS_DIR+"/checkpoint").read().split("\n")
    file_lines[0] = file_lines[0].replace(latest_ckpt,RESTORE_CHECKPOINT)
    RESTORE_CHECKPOINT = BERT_GCS_DIR+"/"+RESTORE_CHECKPOINT

    tf.gfile.Open(BERT_GCS_DIR+"/checkpoint","w+").write("\n".join(file_lines))
  if RUN_EVAL:
    if RESTORE_CHECKPOINT==current_ckpt:
      return False,None,current_ckpt

  current_ckpt=RESTORE_CHECKPOINT
  print("USING CHECKPOINT:",current_ckpt)
  config = modeling.BertConfig.from_json_file(CONFIG_FILE)

  model_fn = script.model_fn_builder(
      bert_config=config,
      num_labels=len(label_list),
      init_checkpoint=None,
      restore_checkpoint=RESTORE_CHECKPOINT,
      init_learning_rate=0,
      decay_per_step=0,
      num_warmup_steps=10,
      use_tpu=True,
      use_one_hot_embeddings=True,
      bert=MODEL,
      test_results_dir=GCS_PREDICTIONS_DIR,
      yield_predictions=EVALUATE_WHILE_PREDICT,
      using_preds=USING_PREDS)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=BERT_GCS_DIR,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=1,
      eval_batch_size=EVAL_BATCH_SIZE,
      predict_batch_size=MAX_BATCH_SIZE)
    
  print("USING FILE:",eval_file)
  try:
    for n,shard in enumerate(shards):
      input_fn = script.file_based_input_fn_builder(
            input_file=shard,
            seq_length=DATA_SEQ_LENGTH,
            is_training=False,
            drop_remainder=True,
            pred_num=PRED_NUM if USING_PREDS else None)


      tf.logging.info("***** Running evaluation/prediction *****")
      tf.logging.info(" Eval Batch size = %d", EVAL_BATCH_SIZE)
      tf.logging.info(" Predict Batch size = %d", MAX_BATCH_SIZE)
      
    
      if RUN_EVAL:
        eval_metrics = estimator.evaluate(input_fn=input_fn, steps=EVAL_STEPS)
        print("\n\n\n\n\n\nEVAL METRICS:")
        for k,v in eval_metrics.items():
          print(k+":",v)
        print("\n\n\n\n\n\n\n")
        if dataset == "dev":
          write_metrics(eval_metrics,LOCAL_EVALUATIONS_DIR)
        else:
          total_metrics[LOCAL_EVALUATIONS_DIR] = eval_metrics
      if RUN_PREDICTION:
        result=estimator.predict(input_fn=input_fn)
        if PRECISE_TESTING and RUN_PREDICTION and n==len(shards)-1:
          run_config_trailing = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=BERT_GCS_DIR,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=1,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

          estimator_trailing = tf.contrib.tpu.TPUEstimator(
              use_tpu=True,
              model_fn=model_fn,
              config=run_config_trailing,
              train_batch_size=1,
              predict_batch_size=1)
          test_file_trailing = os.path.join(DATA_GCS_DIR_EVAL, "test_trailing.tf_record")
          test_input_fn_trailing = script.file_based_input_fn_builder(
              input_file=test_file_trailing,
              seq_length=DATA_SEQ_LENGTH,
              is_training=False,
              drop_remainder=True,
              pred_num=PRED_NUM if USING_PREDS else None)
          result_trailing=estimator_trailing.predict(input_fn=test_input_fn_trailing)
        else:
          result_trailing = None
        write_predictions(PREDICTIONS_FOLDER,
                          RESTORE_MODEL_NAME,
                          result,
                          result_trailing,
                          shard_id=str(START_SHARD+n)if USING_SHARDS else "")
    return True,total_metrics,current_ckpt
  except Exception as e:
    print("FAILED:",e)
    return False,None,current_ckpt
  

Following are three different code segments to run:
1. For if you benchmarked model/sequence length during finetuning and wish to evaluate each model \
2. For if you benchmarked sequence length/batch size during finetuning and wish to evaluate each model \
3. For only evaluating/predicting using a single model

Choose a desired code segment to run, select the desired options for evaluating/predicting, and run that code segment
\
\
Note: All evaluation results will be written into the previously specified logging directory either under google drive or GCS. To view the results, use the colab notebook titled "mutformer processing and viewing finetuning results," which can also be used to view prediction results

###Model/Sequence Length

In [None]:
#@markdown whether or not to run evaluation
RUN_EVAL = True #@param {type:"boolean"}
#@markdown whether or not to run prediction in a separate loop from evaluation (if using EVALUATE_WHILE_PREDICT, set to False)
RUN_PREDICTION = False #@param {type:"boolean"}
#@markdown if evaluating, whether or not to evaluate and write test results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is significant (the evaluation loop itself will be slower due to writing tfevents) (if yes, prediction results will be written in the form of tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them)
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown whether or not testing/evaluating data was generated in shards (for really large databases)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown if using shards, which shard index to start at (default 0 for first shard)
START_SHARD =   0#@param {type:"integer"}
#@markdown whether or not external data is being used
USING_PREDS = True #@param {type:"boolean"}
#@markdown if using external data, how many datapoints are included in total
PRED_NUM =   22#@param {type:"integer"}
#@markdown what folder to write predictions into (if using EVALUATE_WHILE_PREDICT or data source was a GCS path, predictions will be written into this folder under GCS, otherwise predictions will be written to this folder under google drive)
PREDICTIONS_FOLDER = "mrpc_loss_spam_model_comparison_final_predictions" #@param {type:"string"}
#@markdown what folder to write evaluation results into
EVALUATIONS_FOLDER = "mrpc_loss_spam_model_comparison_final" #@param {type:"string"}
#@markdown #####Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook, otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
#@markdown \
#@markdown ###### whether to evaluate/predict on the test set or the dev set ("test" or "dev") (test set will only run once, dev set will run continuously)
dataset = "test" #@param{type:"string"}
#@markdown ###### if using test set, which model ids to evaluate (eval set will only run on the active model, test will run on specified models)
models = ["orig","large","modified"] #@param
#@markdown ###### if using test set, which sequence lengthed models to evaluate
lengths = [256,512,1024] #@param
#@markdown ###### whether to use the latest checkpoint in the folder (set to false if an intermediate checkpoint should be used)
USE_LATEST = False #@param {type:"boolean"}
#@markdown ###### if not using the latest checkpoint, which checkpoint to use
CHECKPOINT_STEP =  7000#@param {type:"integer"}

if dataset=="test":
  evaluating_file = "test.tf_record"
  total_metrics = {}  ## collects the metrics of every run, printed at the end
elif dataset=="dev":
  evaluating_file = "eval.tf_record"
  total_metrics = None  ## not filled for dev, but must exist: it is passed to evaluation_loop
else:
  raise Exception("only datasets supported are dev and test")

current_ckpt = "N/A"

# Where the training script's identifier files are read from: the bucket on the GCP
# runtime, otherwise the configured communication path. (FILES_PATH, used here before,
# is only defined on the GCP runtime, and the resulting NameError was swallowed below.)
identifier_files_path = BUCKET_PATH if USE_GCP_TPU else COMS_PATH

# NOTE(review): the outer loop only ends when RUN_PREDICTION is set; with RUN_EVAL alone
# it keeps polling for new checkpoints -- confirm this is also intended for the test set.
while True:
  for MODEL_NAME in models:
    for DATA_SEQ_LENGTH in lengths:
      if RUN_EVAL:
        try:
          ##reading the identifiers from drive written by the training script to know what to evaluate
          with tf.gfile.Open(identifier_files_path+"/finetuning_run_paired_model.txt") as id_file:
            MODEL_NAME = id_file.read().strip()  ## strip a trailing newline so the name2model lookup works
          with tf.gfile.Open(identifier_files_path+"/finetuning_run_paired_seq_length.txt") as id_file:
            DATA_SEQ_LENGTH = int(id_file.read())
        except Exception as e:
          print("Models haven't started training yet...checking again in 60 seconds")
          print("(could not read identifiers:",e,")")
          time.sleep(60)
          continue

      print("\n\n\nMODEL NAME:",MODEL_NAME,
            "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH)
      
      MODEL = name2model[MODEL_NAME]
      current_data_folder_eval= data_folder_format.replace("xxx",str(DATA_SEQ_LENGTH))
      RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH))

      BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format.replace("xxx",MODEL_NAME)+"_"+str(DATA_SEQ_LENGTH))
      DATA_GCS_DIR_EVAL = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))
      
      GCS_EVALUATIONS_DIR = "{}/{}".format(BUCKET_PATH, EVALUATIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))
      LOCAL_EVALUATIONS_DIR = "{}/{}".format(EVALUATIONS_FOLDER, RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))
      GCS_PREDICTIONS_DIR = "{}/{}".format(BUCKET_PATH, PREDICTIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)))

      CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))

      ##run the evaluation/prediction loop
      sucess,total_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              RESTORE_MODEL_NAME,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              current_ckpt,
                              DATA_SEQ_LENGTH,
                              current_data_folder_eval,
                              BERT_GCS_DIR,
                              USE_LATEST,
                              CHECKPOINT_STEP,
                              DATA_GCS_DIR_EVAL,
                              USING_SHARDS,
                              START_SHARD,
                              USING_PREDS,
                              PRED_NUM,
                              GCS_PREDICTIONS_DIR,
                              GCS_EVALUATIONS_DIR,
                              PREDICTIONS_FOLDER,
                              EVALUATIONS_FOLDER,
                              LOCAL_EVALUATIONS_DIR,
                              CONFIG_FILE)


      if not sucess:
        time.sleep(30)
        continue

      if RUN_EVAL:
        break
    if RUN_EVAL:
      break
  if RUN_PREDICTION:
    break
# total_metrics can be None (dev set, or a failed run); only print when there is something to show.
if RUN_PREDICTION and RUN_EVAL and total_metrics:
  for evals_dir,metrics in total_metrics.items():
    print("Printing metrics for:",evals_dir,"\n")
    for key,metric in metrics.items():
      print(key+":",metric)
    print("\n")



###Batch Size/Sequence Length

In [None]:
#@markdown whether or not to run evaluation
RUN_EVAL = False #@param {type:"boolean"}
#@markdown whether or not to run prediction in a separate loop from evaluation (if using EVALUATE_WHILE_PREDICT, set to False)
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown if evaluating, whether or not to evaluate and write test results in the same loop; useful when amount of test data is very small and the time it takes to restart a loop is significant (the evaluation loop itself will be slower due to writing tfevents) (if yes, prediction results will be written in the form of tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them)
EVALUATE_WHILE_PREDICT =  False #@param {type:"boolean"}
#@markdown whether or not testing/evaluating data was generated in shards (for really large databases)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown if using shards, which shard index to start at (default 0 for first shard)
START_SHARD =   0#@param {type:"integer"}
#@markdown whether or not external data is being used
USING_PREDS = True #@param {type:"boolean"}
#@markdown if using external data, how many datapoints are included in total
PRED_NUM =   22#@param {type:"integer"}
#@markdown what folder to write predictions into
PREDICTIONS_FOLDER = "mrpc_loss_spam_model_comparison_final_predictions" #@param {type:"string"}
#@markdown what folder to write evaluation results into
EVALUATIONS_FOLDER = "mrpc_loss_spam_model_comparison_final" #@param {type:"string"}
#@markdown #####Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the notebook titled "mutformer processing and viewing finetuning results" , otherwise, predictions will be written directly as txts and will be directly accessible from google drive under the folder specified above
#@markdown \
#@markdown ###### whether to evaluate on the test set or the dev set ("test" or "dev") (test set will only run once, dev set will run continuously)
dataset = "test" #@param{type:"string"}
#@markdown which model id to evaluate
MODEL_NAME="modified" #@param {type:"string"}
#@markdown ###### if using test set, which batch sized models to test
batch_sizes = [32,16,64] #@param
#@markdown ###### if using test set, which sequence lengthed models to test
lengths = [256,512,1024] #@param
#@markdown ###### whether to use the latest checkpoint in the folder (set to false if an intermediate checkpoint should be used)
USE_LATEST = False #@param {type:"boolean"}
#@markdown ###### if not using the latest checkpoint, which checkpoint to use
CHECKPOINT_STEP =  7000#@param {type:"integer"}

if dataset=="test":                  
  evaluating_file = "test.tf_record" 
  total_metrics = {}  ## a dictionary for all metrics to  
                      ## print at the end during testing, 
                      ## not necessary during evaluation   
elif dataset=="dev":                
  evaluating_file = "eval.tf_record" 
  total_metrics = None
else:
  raise Exception("only datasets supported are dev and test")

current_ckpt = "N/A"

while True:
  for BATCH_SIZE in batch_sizes:
    for DATA_SEQ_LENGTH in lengths:
      if RUN_EVAL:
        try:
          ##reading the identifiers from drive written by the training script to know what to evaluate
          BATCH_SIZE = int(tf.gfile.Open(COMS_PATH+"/finetuning_run_paired_batch_size.txt").read())
          DATA_SEQ_LENGTH = int(tf.gfile.Open(COMS_PATH+"/finetuning_run_paired_seq_length.txt").read())
        except:
          print("Models haven't started training yet...checking again in 60 seconds")
          time.sleep(60)
          continue
      print("\n\n\nMODEL NAME:",MODEL_NAME,
            "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH,
            "\nBATCH_SIZE_FINETUNED_ON:",BATCH_SIZE,"\n\n\n")

      MODEL = name2model[MODEL_NAME]
      current_data_folder_eval= data_folder_format.replace("xxx",str(DATA_SEQ_LENGTH))
      RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE))

      BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, RESTORE_MODEL_NAME)
      DATA_GCS_DIR_EVAL = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))
      
      GCS_PREDICTIONS_DIR = "{}/{}".format(BUCKET_PATH, PREDICTIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))
      GCS_EVALUATIONS_DIR = "{}/{}".format(BUCKET_PATH, EVALUATIONS_FOLDER+"/"+RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))
      LOCAL_EVALUATIONS_DIR = "{}/{}".format(EVALUATIONS_DIR, RUN_NAME_format.replace("xxx",MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)))

      CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))
      
      ##run the evaluation/prediction loop
      sucess,total_metrics,current_ckpt = \
              evaluation_loop(RUN_EVAL,
                              RUN_PREDICTION,
                              RESTORE_MODEL_NAME,
                              EVALUATE_WHILE_PREDICT,
                              dataset,
                              MODEL,
                              total_metrics,
                              current_ckpt,
                              DATA_SEQ_LENGTH,
                              current_data_folder_eval,
                              BERT_GCS_DIR,
                              USE_LATEST,
                              CHECKPOINT_STEP,
                              DATA_GCS_DIR_EVAL,
                              USING_SHARDS,
                              START_SHARD,
                              USING_PREDS,
                              PRED_NUM,
                              GCS_PREDICTIONS_DIR,
                              GCS_EVALUATIONS_DIR,
                              PREDICTIONS_FOLDER,
                              EVALUATIONS_FOLDER,
                              LOCAL_EVALUATIONS_DIR,
                              CONFIG_FILE)

      if not sucess:
        time.sleep(30)
        continue
      if RUN_EVAL:
        break
    if RN_EVAL:
      break
  if RUN_PREDICTION:
    break
# Print every collected metric, grouped by the evaluation directory it belongs
# to, but only when both evaluation and prediction were requested.
# total_metrics may be None rather than a dict (the single-model cell
# initialises it to None for the "dev" dataset; presumably this cell does the
# same -- TODO confirm), so it is checked before calling .items() on it.
if RUN_PREDICTION and RUN_EVAL and total_metrics is not None:
  for evals_dir, metrics in total_metrics.items():
    print("Printing metrics for:",evals_dir,"\n")
    for key, metric in metrics.items():
      print(key+":",metric)
    print("\n")



### Just one model

In [None]:
#@markdown Whether to run evaluation.
RUN_EVAL = False #@param {type:"boolean"}
#@markdown Whether to run prediction in a separate loop from evaluation (set to False when using EVALUATE_WHILE_PREDICT).
RUN_PREDICTION = True #@param {type:"boolean"}
#@markdown When evaluating, whether to evaluate and write test results in the same loop. Useful when the amount of test data is very small and the time it takes to restart a loop is significant (the evaluation loop itself will be slower due to writing tfevents). If enabled, prediction results are written as tfevent files into GCS, so use the notebook titled "mutformer processing and viewing finetuning results" to view them.
EVALUATE_WHILE_PREDICT = False #@param {type:"boolean"}
#@markdown Whether the testing/evaluating data was generated in shards (for really large databases).
USING_SHARDS = False #@param {type:"boolean"}
#@markdown If using shards, which shard index to start at (default 0 for the first shard).
START_SHARD = 0 #@param {type:"integer"}
#@markdown Whether external data is being used.
USING_PREDS = True #@param {type:"boolean"}
#@markdown If using external data, how many datapoints are included in total.
PRED_NUM = 22 #@param {type:"integer"}
#@markdown Folder to write predictions into.
PREDICTIONS_FOLDER = "added_preds_only_others" #@param {type:"string"}
#@markdown Folder to write evaluation results into.
EVALUATIONS_FOLDER = "added_preds_only_others" #@param {type:"string"}
#@markdown ##### Note: If using EVALUATE_WHILE_PREDICT, prediction results must be read using the previously mentioned colab notebook; otherwise, predictions are written directly as txt files and are directly accessible from Google Drive under the folder specified above.
#@markdown \
#@markdown ###### Whether to evaluate on the test set or the dev set ("test" or "dev"). The test set will only run once; the dev set will run continuously.
dataset = "test" #@param {type:"string"}
#@markdown Which model id to evaluate (determines which architecture to use).
MODEL_NAME = "modified_large" #@param {type:"string"}
#@markdown ###### Sequence length to use.
DATA_SEQ_LENGTH = 512 #@param
#@markdown ###### Identifier for the model to use (replaces "xxx" in the variables "MODEL_DIR_format" and "RUN_NAME_format").
model_name_extension = "added_preds_only_others_512_32" #@param {type:"string"}
#@markdown ###### Whether to use the latest checkpoint in the folder (set to False if an intermediate checkpoint should be used).
USE_LATEST = False #@param {type:"boolean"}
#@markdown ###### If not using the latest checkpoint, which checkpoint step to use.
CHECKPOINT_STEP = 7000 #@param {type:"integer"}

# Only the "test" and "dev" splits are recognised; reject anything else before
# any of the per-split settings are assigned.
if not (dataset == "test" or dataset == "dev"):
  raise Exception("only datasets supported are dev and test")

# Record file to read for the chosen split.
evaluating_file = "test.tf_record" if dataset == "test" else "eval.tf_record"
# The test split starts with an empty metrics dict, the dev split with None.
total_metrics = {} if dataset == "test" else None

# Placeholder checkpoint identifier handed to the first evaluation_loop call.
current_ckpt = "N/A"

# Keep calling evaluation_loop for the single configured model until a
# prediction pass reports success.
# NOTE(review): nothing in this loop breaks when RUN_PREDICTION is False, so an
# evaluation-only run repeats indefinitely -- confirm that this is intended.
while True:
  print("\n\n\nMODEL NAME:", MODEL_NAME,
        "\nINPUT MAX SEQ LENGTH:", DATA_SEQ_LENGTH)

  # Architecture lookup plus the names derived from the "xxx" templates.
  MODEL = name2model[MODEL_NAME]
  seq_len_tag = str(DATA_SEQ_LENGTH)
  current_data_folder_eval = data_folder_format.replace("xxx", seq_len_tag)
  RESTORE_MODEL_NAME = MODEL_DIR_format.replace("xxx", model_name_extension)

  # Bucket locations of the finetuned model and of the evaluation data.
  BERT_GCS_DIR = "/".join([BUCKET_PATH, RESTORE_MODEL_NAME])
  DATA_GCS_DIR_EVAL = "/".join([BUCKET_PATH, DATA_DIR_format.replace("xxx", seq_len_tag)])

  # Output locations all share the same run-specific leaf folder.
  run_name = RUN_NAME_format.replace("xxx", model_name_extension)
  GCS_PREDICTIONS_DIR = "/".join([BUCKET_PATH, PREDICTIONS_FOLDER, run_name])
  GCS_EVALUATIONS_DIR = "/".join([BUCKET_PATH, EVALUATIONS_FOLDER, run_name])
  # NOTE(review): the multi-model cell builds this path from EVALUATIONS_DIR
  # rather than EVALUATIONS_FOLDER -- confirm which one is intended here.
  LOCAL_EVALUATIONS_DIR = "/".join([EVALUATIONS_FOLDER, run_name])

  # config.json lives in the folder named after the model id.
  CONFIG_FILE = "/".join([BUCKET_PATH,
                          INIT_MODEL_DIR_format.replace("xxx", MODEL_NAME),
                          "config.json"])

  # Run one evaluation/prediction pass; the call returns a success flag plus
  # the updated metrics and checkpoint values, which are fed back in on the
  # next pass.
  sucess, total_metrics, current_ckpt = evaluation_loop(
      RUN_EVAL,
      RUN_PREDICTION,
      RESTORE_MODEL_NAME,
      EVALUATE_WHILE_PREDICT,
      dataset,
      MODEL,
      total_metrics,
      current_ckpt,
      DATA_SEQ_LENGTH,
      current_data_folder_eval,
      BERT_GCS_DIR,
      USE_LATEST,
      CHECKPOINT_STEP,
      DATA_GCS_DIR_EVAL,
      USING_SHARDS,
      START_SHARD,
      USING_PREDS,
      PRED_NUM,
      GCS_PREDICTIONS_DIR,
      GCS_EVALUATIONS_DIR,
      PREDICTIONS_FOLDER,
      EVALUATIONS_FOLDER,
      LOCAL_EVALUATIONS_DIR,
      CONFIG_FILE)

  if RUN_PREDICTION:
    # A successful prediction pass ends the loop; a failed one waits 30
    # seconds and then falls through to the next iteration.
    if sucess:
      break
    time.sleep(30)
# Print every collected metric, grouped by the evaluation directory it belongs
# to, but only when both evaluation and prediction were requested.
# total_metrics is initialised to None for the "dev" dataset earlier in this
# cell, so it is checked before calling .items() on it; without the check a
# None value would raise AttributeError here.
if RUN_PREDICTION and RUN_EVAL and total_metrics is not None:
  for evals_dir, metrics in total_metrics.items():
    print("Printing metrics for:",evals_dir,"\n")
    for key, metric in metrics.items():
      print(key+":",metric)
    print("\n")

