This notebook performs finetuning with varying batch sizes, models, and sequence lengths in order to find the best-performing model configuration.

# Configure settings

In [None]:
#@markdown ## General Config
#@markdown Which task to perform: options are "MRPC" for the paired-sequence method, "RE" for the single-sequence method, or "NER" for single-sequence per-residue prediction (if you add more modes, make sure to change the corresponding code segments)
MODE = "MRPC" #@param {type:"string"}
# NOTE(review): MAX_SEQ_LENGTH and PROCESSES are not referenced by any other cell visible in this notebook -- confirm whether they are still needed.
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
PROCESSES = 2 #@param {type:"integer"}
# Name of the GCS bucket; every model/data/logging folder below is resolved under gs://<BUCKET_NAME>.
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
#@markdown ###### When multiple models are fine-tuned, xxx is the placeholder for the individual model identifier (if only one model is being trained, replace xxx with the actual name of the model)
#@markdown \
#@markdown folder in which to save the finetuned model
MODEL_DIR_format = "bert_model_mrpc_xxx" #@param {type:"string"}
#@markdown folder containing the pretrained model
INIT_MODEL_DIR_format = "bert_model_xxx" #@param {type:"string"}
# Folder holding train.tf_record; in the sweep cells xxx is replaced by the sequence length, not the model name.
DATA_DIR_format = "MRPC_xxx" #@param {type:"string"}
# Parent folder (inside the bucket) handed to the model function as its logging directory.
LOGGING_DIR = "mrpc_loss_spam_model_comparison_final" #@param {type:"string"}
# Per-run subfolder of LOGGING_DIR; xxx is replaced by the run identifier in the sweep cells.
RUN_NAME_format = "MRPC_xxx" #@param {type:"string"}

#@markdown ### Training procedure config
INIT_LEARNING_RATE =  1e-5 #@param {type:"number"}
# Only used to derive the per-step decay: (END_LEARNING_RATE - INIT_LEARNING_RATE) / total steps.
END_LEARNING_RATE = 1e-6 #@param {type:"number"}
# Used both as the checkpoint interval and as the TPU iterations_per_loop.
SAVE_CHECKPOINTS_STEPS =  100 #@param {type:"integer"}
NUM_TPU_CORES = 8 #@param {type:"number"}
PLANNED_TOTAL_SEQUENCES_SEEN =  2e5 #@param {type:"number"}
#@markdown PLANNED_TOTAL_STEPS overrides PLANNED_TOTAL_SEQUENCES_SEEN; to use PLANNED_TOTAL_SEQUENCES_SEEN instead, set PLANNED_TOTAL_STEPS to -1 (the total number of steps is then derived from the train batch size used)
PLANNED_TOTAL_STEPS = 10000 #@param {type:"number"}


# Mount Drive

In [None]:
from google.colab import drive,auth
import os
import shutil
# Drop any existing mount first so the mount below starts from a clean state
# (useful when the cell is re-run in the same session).
!fusermount -u /content/drive
drive.flush_and_unmount()
# force_remount=True requests a fresh mount even if Drive is reported as already mounted.
drive.mount('/content/drive', force_remount=True)
# Root of the mounted Drive; training_loop writes its run-marker text files here.
DRIVE_PATH = "/content/drive/My Drive"


# Clone the repo

In [None]:
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://tianqitheodorejiang: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@github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

# Imports

In [None]:
%tensorflow_version 1.x

import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil
from google.colab import auth

# Authenticate this Colab session so that GCS can be accessed.
print("Authorize for GCS:")
auth.authenticate_user()
print("Authorize done")


# Print the current local time as a run timestamp.
print(time.strftime('%Y-%m-%d %H:%M:%S'))

# Replace any previous local copy of the model code with the freshly cloned one.
model_code_src = REPO_DESTINATION_PATH+"/mutformer_model_code"
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(model_code_src,"mutformer")

# Keep exactly one trailing "mutformer" entry on sys.path: drop an existing
# entry (if there is one) and append it again at the end.
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_classifier import MrpcProcessor,REProcessor ##change this part if you add more modes--
from mutformer.run_ner_for_pathogenic import NERProcessor      ##--
  
# configure logging
# All TensorFlow log records go through the 'tensorflow' logger; its handlers
# are replaced below so that every record is formatted the same way.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Handlers are collected here and installed in a single assignment at the end.
# This avoids referencing the file handler when file logging is disabled and
# avoids registering the console handler more than once.
handlers = []

#@markdown ###### Whether or not to write logs to a file
DO_FILE_LOGGING = True #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown ###### If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # A bare file name has no directory part; only create a folder when there is one.
  log_dir = os.path.dirname(FILE_LOGGING_PATH)
  if log_dir and not os.path.exists(log_dir):
    os.makedirs(log_dir)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  handlers.append(fh)

# Console handler so that log records also show up in the cell output.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
handlers.append(ch)

# Replace (rather than extend) the logger's handlers so that re-running the
# cell does not duplicate every log line.
log.handlers = handlers

# Connect to the Colab TPU when one is attached and pass it the GCS credentials.
if 'COLAB_TPU_ADDR' not in os.environ:
  log.warning('Not connected to TPU runtime')
else:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by auth.authenticate_user() -- confirm.
    with open('/content/adc.json', 'r') as adc_file:
      auth_info = json.load(adc_file)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)


##change this part if you added more modes
# Each mode maps to (processor class, module that provides model_fn_builder
# and file_based_input_fn_builder for that mode).
_mode_table = {
    "MRPC": (MrpcProcessor, run_classifier),
    "RE": (REProcessor, run_classifier),
    "NER": (NERProcessor, run_ner_for_pathogenic),
}
if MODE not in _mode_table:
  raise Exception("The mode specified was not one of the available modes: [\"MRPC\", \"RE\",\"NER\"].")
_processor_cls, script = _mode_table[MODE]
# Only the processor of the selected mode is instantiated.
processor = _processor_cls()
label_list = processor.get_labels()

# Run Training

### General definitions

In [None]:
# Root GCS URI under which all model, data and logging folders are resolved.
BUCKET_PATH = "gs://{name}".format(name=BUCKET_NAME)
# Maps a model identifier (the value substituted for "xxx" in the folder
# formats) to the model class that is handed to model_fn_builder as `bert`.
name2model = dict(
    modified_large=BertModelModified,
    modified_medium=BertModelModified,
    modified=BertModelModified,
    orig=BertModel,
    large=BertModel,
)

def latest_checkpoint(dir):
  """Find the newest checkpoint prefix in a GCS folder by listing its files.

  Intended as a fallback for folders that contain checkpoint files but no
  "checkpoint" index file. The folder is listed with `gsutil ls`, and every
  entry whose path contains "model.ckpt" is considered.

  Args:
    dir: GCS folder to list (e.g. "gs://bucket/folder"). The name shadows the
      builtin `dir`; it is kept so existing callers are unaffected.

  Returns:
    The checkpoint prefix (path with its final ".<extension>" removed) that has
    the highest step number in "model.ckpt-<step>". If no candidate carries a
    step number, the first candidate is returned. None if nothing matches.
  """
  import re
  import subprocess

  # An argument list (instead of a shell string) keeps unusual folder names
  # from being interpreted by a shell.
  listing = subprocess.run(["gsutil", "ls", dir],
                           stdout=subprocess.PIPE,
                           universal_newlines=True).stdout.splitlines()

  best_prefix = None
  best_step = -1
  for path in listing:
    if "model.ckpt" not in path:
      continue
    # Strip only the final extension (.index / .meta / .data-xxxxx-of-yyyyy).
    prefix = path.rsplit(".", 1)[0]
    step_match = re.search(r"model\.ckpt-(\d+)$", prefix)
    step = int(step_match.group(1)) if step_match else -1
    # Strictly greater, so that the first file of a given step wins and
    # step-less checkpoints are only used when nothing better exists.
    if best_prefix is None or step > best_step:
      best_prefix = prefix
      best_step = step
  return best_prefix

def training_loop(BATCH_SIZE,
                  RESUMING,
                  PLANNED_TOTAL_STEPS,
                  DECAY_PER_STEP,
                  DATA_SEQ_LENGTH,
                  MODEL_NAME,
                  MODEL,
                  INIT_CHECKPOINT_DIR,
                  BERT_GCS_DIR,
                  DATA_GCS_DIR,
                  GCS_LOGGING_DIR,
                  CONFIG_FILE):
  RESTORE_CHECKPOINT = None if not RESUMING else tf.train.latest_checkpoint(BERT_GCS_DIR)
  if not RESUMING:
    cmd = "gsutil -m rm -r "+BERT_GCS_DIR
    !{cmd}

  ## if using a directory with only a single checkpoint and no "checkpoint" file, 
  ## tf.train.latest_checkpoint will not work, so get fold name manually via latest_checkpoint(dir)
  try: 
    INIT_CHECKPOINT = tf.train.latest_checkpoint(INIT_CHECKPOINT_DIR)
  except:
    INIT_CHECKPOINT = latest_checkpoint(INIT_CHECKPOINT_DIR)
  print("init checkpoint:",INIT_CHECKPOINT,"restore/save checkpont",RESTORE_CHECKPOINT)

  config = modeling.BertConfig.from_json_file(CONFIG_FILE)
  config.hidden_dropout_prob = 0.1
  config.attention_probs_dropout_prob = 0.1

  model_fn = script.model_fn_builder(
      bert_config=config,
      logging_dir=GCS_LOGGING_DIR,
      num_labels=len(label_list),
      init_checkpoint=INIT_CHECKPOINT,
      restore_checkpoint=RESTORE_CHECKPOINT,
      init_learning_rate=INIT_LEARNING_RATE,
      decay_per_step=DECAY_PER_STEP,
      num_warmup_steps=10,
      use_tpu=True,
      use_one_hot_embeddings=True,
      bert=MODEL,
      weight_decay=0.01,
      epsilon=1e-6,
      clip_grads=False)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=BERT_GCS_DIR,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=BATCH_SIZE)
    
  train_file = os.path.join(DATA_GCS_DIR, "train.tf_record")

  tf.logging.info("***** Running training *****")
  tf.logging.info("  Batch size = %d", BATCH_SIZE)
  train_input_fn = script.file_based_input_fn_builder(
      input_file=train_file,
      seq_length=DATA_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  
  ##writing data to drive so that the parallel eval script can know what to do
  open(DRIVE_PATH+"/finetuning_run_paired_model.txt","w+").write(MODEL_NAME)
  open(DRIVE_PATH+"/finetuning_run_paired_seq_length.txt","w+").write(str(DATA_SEQ_LENGTH))
  open(DRIVE_PATH+"/finetuning_run_paired_batch_size.txt","w+").write(str(BATCH_SIZE))
  estimator.train(input_fn=train_input_fn, max_steps=PLANNED_TOTAL_STEPS)



#### Model/sequence length

In [None]:
#@markdown train batch size to use
BATCH_SIZE=16 #@param
#@markdown list of models to test
models = ["modified_medium","modified_large"] #@param
#@markdown list of maximum sequence lengths to test
lengths = [256,512,1024] #@param
#@markdown whether or not to resume training from a previous finetuned checkpoint; if no, always train from pretrained model
RESUMING = False #@param {type:"boolean"}

# Resolve the step budget into a cell-local name. The PLANNED_TOTAL_STEPS
# setting itself is left untouched, so a value of -1 stays visible to the
# other training cells, which may use a different batch size.
if PLANNED_TOTAL_STEPS != -1:
  total_steps = int(PLANNED_TOTAL_STEPS)
else:
  total_steps = int(PLANNED_TOTAL_SEQUENCES_SEEN // BATCH_SIZE)
if total_steps <= 0:
  raise ValueError("The planned number of training steps must be positive, got {}.".format(total_steps))
# Linear schedule: per-step change that moves the learning rate from
# INIT_LEARNING_RATE to END_LEARNING_RATE over total_steps steps.
DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/total_steps

for DATA_SEQ_LENGTH in lengths:
  for MODEL_NAME in models:
    print("\n\n\nMODEL NAME:",MODEL_NAME,
          "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH,
          "\nTRAIN_BATCH_SIZE:",BATCH_SIZE,"\n\n\n")

    # Identifier of this run: "<model>_<sequence length>".
    run_id = MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)

    MODEL = name2model[MODEL_NAME]
    INIT_CHECKPOINT_DIR = "{}/{}".format(BUCKET_PATH, INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))
    BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format.replace("xxx",run_id))
    DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))

    GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format.replace("xxx",run_id))

    # The model config lives next to the pretrained checkpoint.
    CONFIG_FILE = "{}/config.json".format(INIT_CHECKPOINT_DIR)

    training_loop(BATCH_SIZE,
                  RESUMING,
                  total_steps,
                  DECAY_PER_STEP,
                  DATA_SEQ_LENGTH,
                  MODEL_NAME,
                  MODEL,
                  INIT_CHECKPOINT_DIR,
                  BERT_GCS_DIR,
                  DATA_GCS_DIR,
                  GCS_LOGGING_DIR,
                  CONFIG_FILE)
    
  
  

#### Batch size/sequence length

In [None]:
#@markdown list of batch sizes to test
batch_sizes = [32,16,64] #@param
#@markdown list of maximum sequence lengths to test
lengths = [256,512,1024] #@param
#@markdown model to use
MODEL_NAME="modified_large" #@param {type:"string"}
#@markdown whether or not to resume training from a previous finetuned checkpoint; if no, always train from pretrained model
RESUMING = True #@param {type:"boolean"}

BUCKET_PATH = "gs://{}".format(BUCKET_NAME)

for DATA_SEQ_LENGTH in lengths:
    for BATCH_SIZE in batch_sizes:
        print("\n\n\nMODEL NAME:",MODEL_NAME,
              "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH,
              "\nTRAIN_BATCH_SIZE:",BATCH_SIZE,"\n\n\n")

        # The step budget depends on the batch size when it is derived from
        # PLANNED_TOTAL_SEQUENCES_SEEN, so it is resolved per run. The
        # PLANNED_TOTAL_STEPS setting itself is never overwritten; otherwise
        # the -1 sentinel would be lost after the first run.
        if PLANNED_TOTAL_STEPS != -1:
            total_steps = int(PLANNED_TOTAL_STEPS)
        else:
            total_steps = int(PLANNED_TOTAL_SEQUENCES_SEEN // BATCH_SIZE)
        if total_steps <= 0:
            raise ValueError("The planned number of training steps must be positive, got {}.".format(total_steps))
        # Linear schedule: per-step change that moves the learning rate from
        # INIT_LEARNING_RATE to END_LEARNING_RATE over total_steps steps.
        DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/total_steps

        # Identifier of this run: "<model>_<sequence length>_<batch size>".
        run_id = MODEL_NAME+"_"+str(DATA_SEQ_LENGTH)+"_"+str(BATCH_SIZE)

        MODEL = name2model[MODEL_NAME]
        INIT_CHECKPOINT_DIR = "{}/{}".format(BUCKET_PATH, INIT_MODEL_DIR_format.replace("xxx",MODEL_NAME))
        BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format.replace("xxx",run_id))
        DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, DATA_DIR_format.replace("xxx",str(DATA_SEQ_LENGTH)))

        GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format.replace("xxx",run_id))

        # The model config lives next to the pretrained checkpoint.
        CONFIG_FILE = "{}/config.json".format(INIT_CHECKPOINT_DIR)

        training_loop(BATCH_SIZE,
                      RESUMING,
                      total_steps,
                      DECAY_PER_STEP,
                      DATA_SEQ_LENGTH,
                      MODEL_NAME,
                      MODEL,
                      INIT_CHECKPOINT_DIR,
                      BERT_GCS_DIR,
                      DATA_GCS_DIR,
                      GCS_LOGGING_DIR,
                      CONFIG_FILE)

### Train a single model

In [None]:
#@markdown batch size to use
BATCH_SIZE = 32 #@param
#@markdown maximum sequence length to use
DATA_SEQ_LENGTH = 512 #@param
#@markdown model to use
MODEL_NAME="modified" #@param {type:"string"}
#@markdown whether or not to resume training from a previous checkpoint; if no, always train from scratch
RESUMING = False #@param {type:"boolean"}

print("\n\n\nMODEL NAME:",MODEL_NAME,
      "\nINPUT MAX SEQ LENGTH:",DATA_SEQ_LENGTH,
      "\nTRAIN_BATCH_SIZE:",BATCH_SIZE,"\n\n\n")

# Resolve the step budget and learning-rate decay here, from this cell's own
# batch size, so that the cell does not depend on values left behind by one of
# the sweep cells (which may not have been run, or used another batch size).
if PLANNED_TOTAL_STEPS != -1:
  total_steps = int(PLANNED_TOTAL_STEPS)
else:
  total_steps = int(PLANNED_TOTAL_SEQUENCES_SEEN // BATCH_SIZE)
if total_steps <= 0:
  raise ValueError("The planned number of training steps must be positive, got {}.".format(total_steps))
# Linear schedule: per-step change that moves the learning rate from
# INIT_LEARNING_RATE to END_LEARNING_RATE over total_steps steps.
DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/total_steps

MODEL = name2model[MODEL_NAME]
# The folder formats are used verbatim here: for a single model the "xxx"
# placeholder is expected to have been replaced by hand in the config cell.
INIT_CHECKPOINT_DIR = "{}/{}".format(BUCKET_PATH, INIT_MODEL_DIR_format)
BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR_format)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, DATA_DIR_format)

GCS_LOGGING_DIR = "{}/{}".format(BUCKET_PATH, LOGGING_DIR+"/"+RUN_NAME_format)

CONFIG_FILE = "{}/config.json".format(BUCKET_PATH+"/"+INIT_MODEL_DIR_format)

training_loop(BATCH_SIZE,
              RESUMING,
              total_steps,
              DECAY_PER_STEP,
              DATA_SEQ_LENGTH,
              MODEL_NAME,
              MODEL,
              INIT_CHECKPOINT_DIR,
              BERT_GCS_DIR,
              DATA_GCS_DIR,
              GCS_LOGGING_DIR,
              CONFIG_FILE)