#Finetuning Script

This notebook performs finetuning with varying models, batch sizes, and sequence lengths in order to find the best model. 

# Configure settings

In [None]:
# Notebook-wide settings (Colab form). Every later cell reads these names as globals.
#@markdown ## General Config
#@markdown If preferred, a GCP TPU/runtime can be used to run this notebook (instructions below)
GCP_RUNTIME = False #@param {type:"boolean"}
#@markdown How many TPU cores the TPU has: if using colab, NUM_TPU_CORES is 8.
NUM_TPU_CORES = 8 #@param {type:"number"}
#@markdown Which mode to use (a different mode means a different finetuning task): options are:
#@markdown * "MRPC" - paired sequence method
#@markdown * "MRPC_w_ex_data" - paired sequence method with external data
#@markdown * "RE" - single sequence method
#@markdown * "NER" - single sequence per residue prediction
#@markdown 
#@markdown You can add more modes by creating a new processor and/or a new model_fn inside of the "mutformer_model_code" folder downloaded from github, then changing the corresponding code snippets in the code segment named "Authorize for GCS, Imports, and General Setup" (also edit the dropdown below).
MODE = "RE" #@param   ["MRPC_w_ex_data", "MRPC", "RE", "NER"]   {type:"string"} 
             ####      ^^^^^ dropdown list for all modes ^^^^^
#@markdown Name of the GCS bucket to use:
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
# Full gs:// URL of the bucket; all data, model and logging paths below are built on top of it.
BUCKET_PATH = "gs://"+BUCKET_NAME
#@markdown Where in GCS the data needs to be loaded from (should be the same as the OUTPUT_DATA_DIR variable in the data generation script):
PROCESSED_DATA_DIR = "RE_finetune_update_loaded" #@param {type:"string"}
#@markdown Which folder to store the logs in (the LOGGING_DIR variable can be the same across all finetuning notebooks)
LOGGING_DIR = "MutFormer_finetuning_updated_logs" #@param {type:"string"}



#If running on a GCP runtime, follow these instructions to set it up:

###1) Create a VM from the GCP website
###2) Open a command prompt on your computer and perform the following steps
To ssh into the VM, run:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Note: Make sure the port above matches the port below (in this case it's 8888)
\
\
In the new command prompt that popped out, either run each of the commands below individually, or copy and paste the one liner below:
```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get -y install pkg-config
sudo apt-get -y install libhdf5-serial-dev
sudo apt-get -y install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
One command:
```
sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get -y install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get -y install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
###3) In this notebook, click the "connect to local runtime" option under the connect button, and copy and paste the link output by the command prompt that starts with "localhost: ..."
###4) Finally, run this code segment, which creates a TPU


In [None]:
GCE_PROJECT_NAME = "genome-project-319100" #@param {type:"string"}
TPU_ZONE = "us-central1-f" #@param {type:"string"}
TPU_NAME = "mutformer-tpu" #@param {type:"string"}

!gcloud alpha compute tpus create $TPU_NAME --accelerator-type=tpu-v2 --version=1.15.5 --zone=$TPU_ZONE ##create new TPU

!gsutil iam ch serviceAccount:`gcloud alpha compute tpus describe $TPU_NAME | grep serviceAccount | cut -d' ' -f2`:admin gs://theodore_jiang && echo 'Successfully set permissions!' ##give TPU access to GCS

#Clone the MutFormer repo

In [None]:
if GCP_RUNTIME:
  !sudo apt-get -y install git
#@markdown Where to clone the repo into:
REPO_DESTINATION_PATH = "mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

Cloning into 'mutformer'...
remote: Enumerating objects: 614, done.[K
remote: Counting objects: 100% (415/415), done.[K
remote: Compressing objects: 100% (335/335), done.[K
remote: Total 614 (delta 299), reused 111 (delta 78), pack-reused 199[K
Receiving objects: 100% (614/614), 2.14 MiB | 12.87 MiB/s, done.
Resolving deltas: 100% (410/410), done.


#Authorize for GCS, Imports, and General Setup

In [None]:
# Interactive GCS authorization through Colab's auth helper; skipped when GCP_RUNTIME is set.
if not GCP_RUNTIME:
  # Imported here, not with the other imports, so the import is only attempted when GCP_RUNTIME is False.
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

import sys
import json
import random
import logging
import tensorflow.compat.v1 as tf
import time
import importlib
import os
import shutil

# The importable model code has to live in a folder literally called "mutformer".
# If the repo itself was cloned under that name, park a copy of the clone in
# "mutformer_code" first and continue from there, so the name is free to be reused.
if REPO_DESTINATION_PATH == "mutformer":
  if os.path.exists("mutformer_code"):
    shutil.rmtree("mutformer_code")
  shutil.copytree(REPO_DESTINATION_PATH,"mutformer_code")
  REPO_DESTINATION_PATH = "mutformer_code"

# Replace whatever currently sits at ./mutformer with the model code from the clone.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Keep exactly one fresh "mutformer" entry at the end of sys.path.
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

# Import the modules first and reload them BEFORE pulling any classes out of them.
# importlib.reload() re-executes a module in place but does not rebind names that were
# already imported from it, so a `from ... import Class` done before the reload would keep
# pointing at the stale class object when this cell is re-run after the code on disk changed.
from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic  #### <<<<< if you added more modes, change these imports to import the correct processors, 
                                                                                                  #### <<<<< correct training scripts (i.e. run_classifier and run_ner_for_pathogenic), and
                                                                                                  #### <<<<< correct model classes

##reload modules in case that's needed (the training scripts come after the modules listed before them)
modules2reload = [modeling, 
                  optimization, 
                  tokenization,
                  run_classifier,
                  run_ner_for_pathogenic]
for module in modules2reload:
    importlib.reload(module)

# Bind the classes only now, so they come from the freshly reloaded modules.
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_classifier import MrpcProcessor,REProcessor,MrpcWithExDataProcessor
from mutformer.run_ner_for_pathogenic import NERProcessor

# configure logging: route the 'tensorflow' logger to the console and, optionally, to a file
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Detach AND close handlers left over from a previous run of this cell; just dropping
# them would leave the old log file handle open.
for stale_handler in list(log.handlers):
  log.removeHandler(stale_handler)
  stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

#@markdown Whether or not to write logs to a file
DO_FILE_LOGGING = True #@param {type:"boolean"}
if DO_FILE_LOGGING:
  #@markdown * If using file logging, what path to write logs to
  FILE_LOGGING_PATH = 'file_logging/spam.log' #@param {type:"string"}
  # A bare file name has no directory part; os.makedirs("") would raise, so only
  # create the parent folder when there is one.
  file_logging_dir = os.path.dirname(FILE_LOGGING_PATH)
  if file_logging_dir:
    os.makedirs(file_logging_dir, exist_ok=True)
  fh = logging.FileHandler(FILE_LOGGING_PATH)
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)


# Connect to the Colab-provided TPU (if any) and hand it the GCS credentials.
# NOTE(review): TPU_ADDRESS is only assigned on the Colab branch, yet training_loop reads
# it unconditionally — on a runtime without COLAB_TPU_ADDR it has to be defined some other way.
if 'COLAB_TPU_ADDR' not in os.environ:
  log.warning('Not connected to TPU runtime')
else:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    ##upload credentials to TPU.
    with open('/content/adc.json', 'r') as adc_file:
      auth_info = json.load(adc_file)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)


####       vvvvv if you added more modes, add an entry to this table to set the processor and training script correctly vvvvv
# MODE -> (processor class, training script module, whether external data is used)
MODE_SETUPS = {
  "MRPC":           (MrpcProcessor,           run_classifier,         False),
  "MRPC_w_ex_data": (MrpcWithExDataProcessor, run_classifier,         True),
  "RE":             (REProcessor,             run_classifier,         False),
  "NER":            (NERProcessor,            run_ner_for_pathogenic, False),
}
if MODE not in MODE_SETUPS:
  # The list of modes is generated from the table so the message cannot drift out of date.
  raise ValueError("The mode specified was not one of the available modes: ["+
                   ",".join("\""+mode_name+"\"" for mode_name in MODE_SETUPS)+"].")
processor_class, script, USING_EX_DATA = MODE_SETUPS[MODE]
processor = processor_class()
label_list = processor.get_labels()
                      ####       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


TensorFlow 1.x selected.
Authorize for GCS:
Authorize done



2022-01-07 03:33:47,372 - tensorflow - INFO - Using TPU runtime
2022-01-07 03:33:47,396 - tensorflow - INFO - TPU address is grpc://10.93.7.66:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



# Run Finetuning

The following section performs finetuning tests that compare the performance of different models under different parameters.

###General definitions

In [None]:
def latest_checkpoint(dir):
  """Find the newest checkpoint prefix in a GCS folder by listing it with gsutil.

  Args:
    dir: folder to list, e.g. "gs://bucket/models/MutFormer8L".

  Returns:
    The path of the checkpoint with its final extension (".index", ".meta",
    ".data-...") removed. When the file names carry a step number
    ("model.ckpt-<step>"), the highest step wins; otherwise the first listed
    "model.ckpt" file is used. None if the listing contains no checkpoint file.
  """
  import re
  import subprocess

  # Argument list instead of a shell string: no quoting problems with unusual paths,
  # and only stdout is parsed, so gsutil error text is never mistaken for a file name.
  listing = subprocess.run(["gsutil", "ls", dir],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           universal_newlines=True)

  newest_prefix, newest_step = None, None
  for file in listing.stdout.splitlines():
    if "model.ckpt" not in file:
      continue
    step_match = re.search(r"model\.ckpt-(\d+)", file)
    step = int(step_match.group(1)) if step_match else -1
    # Strictly greater: among equal steps (or unnumbered files) the first listed one is kept.
    if newest_step is None or step > newest_step:
      newest_prefix, newest_step = file.rsplit(".", 1)[0], step
  return newest_prefix

def training_loop(BATCH_SIZE,
                  RESUMING,
                  PLANNED_TOTAL_STEPS,
                  DECAY_PER_STEP,
                  MAX_SEQ_LENGTH,
                  MODEL_NAME,
                  MODEL,
                  INIT_CHECKPOINT_DIR,
                  GCS_OUTPUT_MODEL_DIR,
                  DATA_GCS_DIR,
                  USING_SHARDS,
                  START_SHARD,
                  USING_EX_DATA,
                  EX_DATA_NUM,
                  GCS_LOGGING_DIR,
                  CONFIG_FILE):
  
  RESTORE_CHECKPOINT = None if not RESUMING else tf.train.latest_checkpoint(GCS_OUTPUT_MODEL_DIR)
  if not RESUMING:
    cmd = "gsutil -m rm -r "+GCS_OUTPUT_MODEL_DIR
    !{cmd}
  tf.logging.info("Using data from: "+DATA_GCS_DIR)

  try: 
    INIT_CHECKPOINT = tf.train.latest_checkpoint(INIT_CHECKPOINT_DIR)
  except:
    INIT_CHECKPOINT = latest_checkpoint(INIT_CHECKPOINT_DIR)
  tf.logging.info("init checkpoint:"+str(INIT_CHECKPOINT)+", restore/save checkpont:"+str(RESTORE_CHECKPOINT))

  config = modeling.BertConfig.from_json_file(CONFIG_FILE)
  if not tf.io.gfile.exists(GCS_OUTPUT_MODEL_DIR+"/config.json"):
    tf.io.gfile.copy(CONFIG_FILE,GCS_OUTPUT_MODEL_DIR+"/config.json")

  model_fn = script.model_fn_builder(
      bert_config=config,
      logging_dir=GCS_LOGGING_DIR,
      num_labels=len(label_list),
      init_checkpoint=INIT_CHECKPOINT,
      restore_checkpoint=RESTORE_CHECKPOINT,
      init_learning_rate=INIT_LEARNING_RATE,
      decay_per_step=DECAY_PER_STEP,
      num_warmup_steps=NUM_WARMUP_STEPS,
      use_tpu=True,
      use_one_hot_embeddings=True,
      bert=MODEL,
      weight_decay=WEIGHT_DECAY,
      epsilon=1e-6, ##epsilon is used to prevent dividing by zero
      clip_grads=False,
      using_ex_data=USING_EX_DATA)

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=GCS_OUTPUT_MODEL_DIR,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      keep_checkpoint_max=KEEP_N_CHECKPOINTS_AT_A_TIME,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=BATCH_SIZE)
  
  train_file_name = "train.tf_record"
  train_file = os.path.join(DATA_GCS_DIR, train_file_name)

  if USING_SHARDS:
    shards_folder = DATA_GCS_DIR
    input_file = os.path.join(DATA_GCS_DIR, train_file_name)
    import re
    file_name = input_file.split("/")[-1]
    shards = [shards_folder + "/" + file for file in tf.io.gfile.listdir(shards_folder) if
              re.match(file_name + "_\d+", file)]
    shards = sorted(shards,key=lambda shard:int(shard.split("_")[-1]))[START_SHARD:]
  else:
    shards = [train_file]

  if USING_SHARDS:
    tf.logging.info("\nUSING SHARDs:")
    for shard in shards:
      tf.logging.info(str(shard))
    tf.logging.info("\n")

  tf.logging.info("***** Running training *****")
  tf.logging.info("  Batch size = %d", BATCH_SIZE)
  for n,shard in enumerate(shards):
      train_input_fn = script.file_based_input_fn_builder(
          input_file=shard,
          seq_length=MAX_SEQ_LENGTH,
          is_training=True,
          drop_remainder=True,
          pred_num=EX_DATA_NUM if USING_EX_DATA else None)
      estimator.train(input_fn=train_input_fn, max_steps=PLANNED_TOTAL_STEPS)



###Training Loops

Following are three code segments to run. These options are:
1. Model/sequence length: different model architectures will be tested using a fixed batch size on data of varying sequence lengths \
2. Sequence length/batch size: one model architecture will be tested using varying batch sizes on data of varying sequence lengths\
3. One model: one model architecture will be tested using a fixed batch size on a fixed set of data of a given sequence length

Note: During training, evaluation results on the training dataset will be written into GCS. To view these results, use the colab notebook titled "mutformer processing and viewing finetuning results."

####Model/sequence length

In [None]:
# Settings for the model / sequence-length sweep (Colab form); the run code after this form reads them as globals.
#@markdown ### IO config
#@markdown Folder in GCS where the pretrained models need to be loaded from:
INIT_MODEL_DIR = "pretrained_models" #@param {type:"string"}
#@markdown Folder for where to save the finetuned model
OUTPUT_MODEL_DIR = "bert_model_re_mn_sl_try8" #@param {type:"string"}
#@markdown Which folder inside of LOGGING_DIR to store the logs in
RUN_NAME = "RE_updated_mn_sl_try8" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Training procedure config
#@markdown Train batch size to use:
BATCH_SIZE =  16#@param {type:"integer"}
#@markdown The training loop will loop through a list of pretrained models and a list of sequence lengths, training a model for each combination of pretrained model and sequence length
#@markdown * List of pretrained models to load (should indicate the names of the model folders inside the specified INIT_MODEL_DIR)
MODELS = ["MutBERT8L","MutBERT10L","MutFormer8L"] #@param
#@markdown * List of model architectures for each model in the "MODELS" list defined in the entry above: each position in this list must correctly indicate the model architecture of its corresponding model folder in the list "MODELS" (BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture).
MODEL_ARCHITECTURES = [BertModel,BertModel,BertModelModified] #@param
#@markdown * List of sequence lengths to test
MAX_SEQ_LENGTHS = [512,1024,256,128,64] #@param
#@markdown Whether or not to resume training from a previous checkpoint; if no, always train from scratch
RESUMING = False #@param {type:"boolean"}
#@markdown Whether or not data was generated in shards (for really large databases)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown If training data was generated in shards, which shard index to start at (default 0 for first shard)
START_SHARD = 0 #@param {type:"integer"}
#@markdown Training uses a linear learning rate.
#@markdown * Start learning rate: training will start with this learning rate on the step that learning rate warmup is complete
INIT_LEARNING_RATE =  1e-5 #@param {type:"number"}
#@markdown * End learning rate: training will alter the learning rate every step linearly so that it finishes with this learning rate on the last step.
END_LEARNING_RATE = 1e-6 #@param {type:"number"}
#@markdown How many steps during training to perform learning rate warmup for (start from learning rate 0 and increase to INIT_LEARNING_RATE): Set to 0 for no warmup.
NUM_WARMUP_STEPS =  0#@param {type:"integer"}
#@markdown What weight decay value to use (MutFormer uses 0.01; a higher weight decay is more resistant to exploding gradients, but also limits the model's ability to learn)
WEIGHT_DECAY = 0.01 #@param {type:"number"}
#@markdown Save a checkpoint every this amount of steps:
SAVE_CHECKPOINTS_STEPS =   500#@param {type:"integer"}
#@markdown TPUEstimator will keep this number of checkpoints at a time; older checkpoints will all be deleted:
KEEP_N_CHECKPOINTS_AT_A_TIME =  10#@param {type:"integer"}
#@markdown How many sequences should the model train on before stopping:
PLANNED_TOTAL_SEQUENCES_SEEN =  256000 #@param {type:"number"}
#@markdown How many steps should the model train for before stopping (number of total sequences trained on will depend on the batch size used). NOTE: PLANNED_TOTAL_STEPS will override PLANNED_TOTAL_SEQUENCES_SEEN; if using PLANNED_TOTAL_SEQUENCES_SEEN, set PLANNED_TOTAL_STEPS to -1 (PLANNED TOTAL STEPS will be based on the train batch size used, which can be specified later)
PLANNED_TOTAL_STEPS =  4000#@param {type:"number"}


# Resolve the step budget: an explicit PLANNED_TOTAL_STEPS wins; -1 means "derive it from
# PLANNED_TOTAL_SEQUENCES_SEEN". int() because the form fields are typed "number" and may
# hold floats, while the value is used as a step count.
PLANNED_TOTAL_STEPS = int(PLANNED_TOTAL_STEPS if PLANNED_TOTAL_STEPS != -1 else PLANNED_TOTAL_SEQUENCES_SEEN//BATCH_SIZE)
if PLANNED_TOTAL_STEPS <= 0:
  raise ValueError("PLANNED_TOTAL_STEPS must resolve to a positive number of steps, got "+str(PLANNED_TOTAL_STEPS))
# Linear schedule: the learning rate changes by this (negative when decaying) amount every step.
DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/PLANNED_TOTAL_STEPS

# Each model needs its architecture at the same list position; fail before any training
# starts instead of hitting an IndexError (or silently ignoring entries) mid-sweep.
if len(MODELS) != len(MODEL_ARCHITECTURES):
  raise ValueError("MODELS and MODEL_ARCHITECTURES must have the same length, got "+
                   str(len(MODELS))+" and "+str(len(MODEL_ARCHITECTURES)))

DATA_INFOS = [["N/A" for MODEL_NAME in MODELS]            ##create an empty 2D list to store all
              for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS]      ##the data info dictionaries

for M,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
  DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

  # info.json only depends on the sequence length, so read it once per length and
  # close the file handle right away.
  with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
    data_info = json.load(info_file)

  for m,(MODEL_NAME,MODEL) in enumerate(zip(MODELS,MODEL_ARCHITECTURES)):
    tf.logging.info("\n\n\nMODEL NAME:"+MODEL_NAME+
          "\nINPUT MAX SEQ LENGTH:"+str(MAX_SEQ_LENGTH))

    INIT_CHECKPOINT_DIR = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME
    GCS_OUTPUT_MODEL_DIR = BUCKET_PATH+"/"+OUTPUT_MODEL_DIR+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)

    GCS_LOGGING_DIR = BUCKET_PATH+"/"+LOGGING_DIR+"/"+RUN_NAME+"/mn_"+MODEL_NAME+"_sl_"+str(MAX_SEQ_LENGTH)

    CONFIG_FILE = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME+"/config.json"

    DATA_INFOS[M][m] = data_info

    # The external-data count is only present/needed in the external-data mode.
    EX_DATA_NUM = DATA_INFOS[M][m]["ex_data_num"] if USING_EX_DATA else 0

    training_loop(BATCH_SIZE,
                  RESUMING,
                  PLANNED_TOTAL_STEPS,
                  DECAY_PER_STEP,
                  MAX_SEQ_LENGTH,
                  MODEL_NAME,
                  MODEL,
                  INIT_CHECKPOINT_DIR,
                  GCS_OUTPUT_MODEL_DIR,
                  DATA_GCS_DIR,
                  USING_SHARDS,
                  START_SHARD,
                  USING_EX_DATA,
                  EX_DATA_NUM,
                  GCS_LOGGING_DIR,
                  CONFIG_FILE)
  
4  

####Batch size/sequence length

In [None]:
# Settings for the batch-size / sequence-length sweep (Colab form); the run code after this form reads them as globals.
#@markdown ### IO config
#@markdown Folder in GCS where the pretrained models need to be loaded from:
INIT_MODEL_DIR = "" #@param {type:"string"}
#@markdown Name of the folder of the pretrained model to load from inside INIT_MODEL_DIR
MODEL_NAME="bert_model_modified_large" #@param {type:"string"}
#@markdown Model architecture to use (BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture)
MODEL_ARCHITECTURE = BertModelModified #@param
#@markdown Folder for where to save the finetuned model
OUTPUT_MODEL_DIR = "bert_model_mrpc_adding_preds" #@param {type:"string"}
#@markdown Which folder inside of LOGGING_DIR to store the logs in
RUN_NAME = "MRPC_adding_preds_w_mutformer12L" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Training procedure config
#@markdown The training loop will loop through a list of batch sizes and a list of sequence lengths, training a model for each combination of batch size and sequence length
#@markdown * List of batch sizes to test
BATCH_SIZES = [64] #@param
#@markdown * List of sequence lengths to test
MAX_SEQ_LENGTHS = [1024] #@param
#@markdown Whether or not to resume training from a previous checkpoint; if no, always train from scratch
RESUMING = False #@param {type:"boolean"}
#@markdown Whether or not data was generated in shards (for really large databases)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown If training data was generated in shards, which shard index to start at (default 0 for first shard)
START_SHARD = 0 #@param {type:"integer"}
#@markdown Training uses a linear learning rate.
#@markdown * Start learning rate: training will start with this learning rate on the step that learning rate warmup is complete
INIT_LEARNING_RATE =  1e-5 #@param {type:"number"}
#@markdown * End learning rate: training will alter the learning rate every step linearly so that it finishes with this learning rate on the last step.
END_LEARNING_RATE = 5e-7 #@param {type:"number"}
#@markdown How many steps during training to perform learning rate warmup for (start from learning rate 0 and increase to INIT_LEARNING_RATE): Set to 0 for no warmup.
NUM_WARMUP_STEPS = 10 #@param {type:"integer"}
#@markdown What weight decay value to use (MutFormer uses 0.01; a higher weight decay is more resistant to exploding gradients, but also limits the model's ability to learn)
WEIGHT_DECAY = 0.01 #@param {type:"number"}
#@markdown Save a checkpoint every this amount of steps:
SAVE_CHECKPOINTS_STEPS =  1000 #@param {type:"integer"}
#@markdown TPUEstimator will keep this number of checkpoints at a time; older checkpoints will all be deleted:
KEEP_N_CHECKPOINTS_AT_A_TIME =  10#@param {type:"integer"}
#@markdown How many sequences should the model train on before stopping:
PLANNED_TOTAL_SEQUENCES_SEEN =  2e5 #@param {type:"number"}
#@markdown How many steps should the model train for before stopping (number of total sequences trained on will depend on the batch size used). NOTE: PLANNED_TOTAL_STEPS will override PLANNED_TOTAL_SEQUENCES_SEEN; if using PLANNED_TOTAL_SEQUENCES_SEEN, set PLANNED_TOTAL_STEPS to -1 (PLANNED TOTAL STEPS will be based on the train batch size used, which can be specified later)
PLANNED_TOTAL_STEPS = 8000 #@param {type:"number"}


# NOTE: the step budget and the learning-rate decay depend on the batch size, and this
# cell sweeps over several batch sizes, so both are resolved inside the loop below
# (there is no single BATCH_SIZE at this point; only BATCH_SIZES is configured here).

DATA_INFOS = [["N/A" for BATCH_SIZE in BATCH_SIZES]            ##create an empty 2D list to store all
              for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS]           ##the data info dictionaries

for M,MAX_SEQ_LENGTH in enumerate(MAX_SEQ_LENGTHS):
    DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

    # info.json only depends on the sequence length, so read it once per length and
    # close the file handle right away.
    with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
        data_info = json.load(info_file)

    for B,BATCH_SIZE in enumerate(BATCH_SIZES):
        tf.logging.info("\nINPUT MAX SEQ LENGTH:"+str(MAX_SEQ_LENGTH)+
              "\nTRAIN_BATCH_SIZE:"+str(BATCH_SIZE)+"\n\n\n")

        # An explicit PLANNED_TOTAL_STEPS wins; -1 means "derive the steps from
        # PLANNED_TOTAL_SEQUENCES_SEEN and THIS run's batch size". The configured value is
        # left untouched so the -1 sentinel is still seen by the next batch size.
        # int() because the "number" form fields may hold floats (e.g. 2e5).
        steps_for_run = int(PLANNED_TOTAL_STEPS if PLANNED_TOTAL_STEPS != -1 else PLANNED_TOTAL_SEQUENCES_SEEN//BATCH_SIZE)
        if steps_for_run <= 0:
            raise ValueError("The planned number of steps must be positive, got "+str(steps_for_run)+
                             " for batch size "+str(BATCH_SIZE))
        # Linear schedule: the learning rate changes by this amount every step.
        DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/steps_for_run

        MODEL = MODEL_ARCHITECTURE
        INIT_CHECKPOINT_DIR = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME
        GCS_OUTPUT_MODEL_DIR = BUCKET_PATH+"/"+OUTPUT_MODEL_DIR+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)

        GCS_LOGGING_DIR = BUCKET_PATH+"/"+LOGGING_DIR+"/"+RUN_NAME+"/bs_"+str(BATCH_SIZE)+"_sl_"+str(MAX_SEQ_LENGTH)

        CONFIG_FILE = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME+"/config.json"

        DATA_INFOS[M][B] = data_info

        # The external-data count is only present/needed in the external-data mode.
        EX_DATA_NUM = DATA_INFOS[M][B]["ex_data_num"] if USING_EX_DATA else 0

        training_loop(BATCH_SIZE,
                      RESUMING,
                      steps_for_run,
                      DECAY_PER_STEP,
                      MAX_SEQ_LENGTH,
                      MODEL_NAME,
                      MODEL,
                      INIT_CHECKPOINT_DIR,
                      GCS_OUTPUT_MODEL_DIR,
                      DATA_GCS_DIR,
                      USING_SHARDS,
                      START_SHARD,
                      USING_EX_DATA,
                      EX_DATA_NUM,
                      GCS_LOGGING_DIR,
                      CONFIG_FILE)

####One model

In [None]:
# Settings for a single finetuning run (Colab form); the run code after this form reads them as globals.
#@markdown ### IO config
#@markdown Folder in GCS where the pretrained models need to be loaded from:
INIT_MODEL_DIR = "pretrained_models" #@param {type:"string"}
#@markdown Name of the folder of the pretrained model to load from inside INIT_MODEL_DIR
MODEL_NAME="MutFormer12L" #@param {type:"string"}
#@markdown Model architecture to use (BertModel indicates the original BERT, BertModelModified indicates MutFormer's architecture)
MODEL_ARCHITECTURE = BertModelModified #@param
#@markdown Folder for where to save the finetuned model
OUTPUT_MODEL_DIR = "bert_model_mrpc_just_others_12L_try7" #@param {type:"string"}
#@markdown Which folder inside of LOGGING_DIR to store the logs in
RUN_NAME = "MRPC_just_others_12L_try7" #@param {type:"string"}
#@markdown \
#@markdown 
#@markdown 
#@markdown ### Training procedure config
#@markdown Batch size to use
BATCH_SIZE = 32 #@param {type:"integer"}
#@markdown Maximum sequence length to use
MAX_SEQ_LENGTH = 512 #@param {type:"integer"}
#@markdown Whether or not to resume training from a previous checkpoint; if no, always train from scratch
RESUMING = True #@param {type:"boolean"}
#@markdown Whether or not data was generated in shards (for really large databases)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown * If using shards, which shard index to start at (default 0 for first shard)
START_SHARD = 0 #@param {type:"integer"}
#@markdown Training uses a linear learning rate.
#@markdown * Start learning rate: training will start with this learning rate on the step that learning rate warmup is complete
INIT_LEARNING_RATE =  1e-5 #@param {type:"number"}
#@markdown * End learning rate: training will alter the learning rate every step linearly so that it finishes with this learning rate on the last step.
END_LEARNING_RATE =  1e-6#@param {type:"number"}
#@markdown How many steps during training to perform learning rate warmup for (start from learning rate 0 and increase to INIT_LEARNING_RATE): Set to 0 for no warmup.
NUM_WARMUP_STEPS = 10 #@param {type:"integer"}
#@markdown What weight decay value to use (MutFormer uses 0.01; a higher weight decay is more resistant to exploding gradients, but also limits the model's ability to learn)
WEIGHT_DECAY = 0.01 #@param {type:"number"}
#@markdown Save a checkpoint every this amount of steps:
SAVE_CHECKPOINTS_STEPS =  1000 #@param {type:"integer"}
#@markdown TPUEstimator will keep this number of checkpoints at a time; older checkpoints will all be deleted:
KEEP_N_CHECKPOINTS_AT_A_TIME =  10 #@param {type:"integer"}
#@markdown How many sequences should the model train on before stopping:
PLANNED_TOTAL_SEQUENCES_SEEN =  2e5 #@param {type:"number"}
#@markdown How many steps should the model train for before stopping (number of total sequences trained on will depend on the batch size used). NOTE: PLANNED_TOTAL_STEPS will override PLANNED_TOTAL_SEQUENCES_SEEN; if using PLANNED_TOTAL_SEQUENCES_SEEN, set PLANNED_TOTAL_STEPS to -1 (PLANNED TOTAL STEPS will be based on the train batch size used, which can be specified later)
PLANNED_TOTAL_STEPS = 10000 #@param {type:"number"}


# Resolve the step budget: an explicit PLANNED_TOTAL_STEPS wins; -1 means "derive it from
# PLANNED_TOTAL_SEQUENCES_SEEN". int() because the "number" form fields may hold floats
# (2e5//32 is 6250.0), while the value is used as a step count.
PLANNED_TOTAL_STEPS = int(PLANNED_TOTAL_STEPS if PLANNED_TOTAL_STEPS != -1 else PLANNED_TOTAL_SEQUENCES_SEEN//BATCH_SIZE)
if PLANNED_TOTAL_STEPS <= 0:
  raise ValueError("PLANNED_TOTAL_STEPS must resolve to a positive number of steps, got "+str(PLANNED_TOTAL_STEPS))
# Linear schedule: the learning rate changes by this (negative when decaying) amount every step.
DECAY_PER_STEP = (END_LEARNING_RATE-INIT_LEARNING_RATE)/PLANNED_TOTAL_STEPS


MODEL = MODEL_ARCHITECTURE
INIT_CHECKPOINT_DIR = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME
GCS_OUTPUT_MODEL_DIR = BUCKET_PATH+"/"+OUTPUT_MODEL_DIR
DATA_GCS_DIR = BUCKET_PATH+"/"+PROCESSED_DATA_DIR+"/"+str(MAX_SEQ_LENGTH)

GCS_LOGGING_DIR = BUCKET_PATH+"/"+LOGGING_DIR+"/"+RUN_NAME

CONFIG_FILE = BUCKET_PATH+"/"+INIT_MODEL_DIR+"/"+MODEL_NAME+"/config.json"

##get the data info dictionary (the file handle is closed as soon as it has been read)
with tf.gfile.Open(DATA_GCS_DIR+"/info.json") as info_file:
  DATA_INFO = json.load(info_file)
# The external-data count is only present/needed in the external-data mode.
EX_DATA_NUM = DATA_INFO["ex_data_num"] if USING_EX_DATA else 0

training_loop(BATCH_SIZE,
              RESUMING,
              PLANNED_TOTAL_STEPS,
              DECAY_PER_STEP,
              MAX_SEQ_LENGTH,
              MODEL_NAME,
              MODEL,
              INIT_CHECKPOINT_DIR,
              GCS_OUTPUT_MODEL_DIR,
              DATA_GCS_DIR,
              USING_SHARDS,
              START_SHARD,
              USING_EX_DATA,
              EX_DATA_NUM,
              GCS_LOGGING_DIR,
              CONFIG_FILE)