# Configure settings

In [None]:
#@markdown ## General Config
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
#@markdown Name of the data directory in GCS. When generating multiple sets of files (e.g. for different sequence lengths), xxx is the placeholder that is replaced by each identifier; if not generating multiple sets, just put the plain name of the directory here.
FINETUNING_DATA_DIR_format = "MRPC_adding_preds_only_others_xxx" #@param {type:"string"}
#@markdown Whether or not this script is being run in a GCP runtime (if more memory is required for large databases)
GCP_RUNTIME = False #@param {type:"boolean"}

#@markdown Which task to perform: options are "MRPC" for the paired sequence method, "MRPC_w_preds" for the paired sequence method with external data, "RE" for the single sequence method, or "NER" for single sequence per-residue prediction (if you add more modes, make sure to change the corresponding code segments)
MODE = "MRPC_w_preds" #@param {type:"string"}


# If using a GCP runtime to generate data (if the database is large and more memory is needed), run these commands before running this notebook

To ssh into the VM:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Make sure the port above matches the port below (in this case it's 8888)

```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser

(one command):sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
Then copy and paste the printed link containing "localhost: ..." into Colab's "Connect to a local runtime" option.


# Clone the repo

In [None]:
if GCP_RUNTIME:
  !sudo apt-get -y install git-all
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

# Authorize for GCS and Imports

In [None]:
# Colab-only setup: this branch is skipped when GCP_RUNTIME is True.
if not GCP_RUNTIME:
  # google.colab is imported inside the branch so the cell does not require
  # that package on a GCP runtime.
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

  # Colab line magic; must run before `import tensorflow` below.
  # NOTE(review): presumably selects the TF 1.x line to match the
  # tensorflow==1.14 pin used in the GCP setup instructions -- confirm.
  %tensorflow_version 1.x
import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil
import importlib

# Refresh the local "mutformer" package from the freshly cloned repo: remove
# any previous copy, then copy the model code directory in its place.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(os.path.join(REPO_DESTINATION_PATH, "mutformer_model_code"), "mutformer")

# Keep exactly one "mutformer" entry on sys.path (at the end), even if this
# cell is run several times.
if "mutformer" in sys.path:
  sys.path.remove("mutformer")
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic

##reload modules in case that's needed (e.g. the cell is rerun after the repo
##was re-cloned). This must happen BEFORE the `from ... import` lines below:
##reload() re-executes the module but does not rebind names that were already
##imported from it, so importing the classes first would leave them pointing
##at the pre-reload definitions.
modules2reload = [modeling,
                  optimization,
                  tokenization,
                  run_classifier,
                  run_ner_for_pathogenic]
for module in modules2reload:
    importlib.reload(module)

from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_classifier import MrpcProcessor,REProcessor,MrpcWithPredsProcessor  ##change this part if you add more modes--
from mutformer.run_ner_for_pathogenic import NERProcessor       ##--

# configure logging: route the 'tensorflow' logger through a single console
# handler that emits INFO and above with a timestamped format.

# build the formatter and the console handler first
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

# then reset the logger so that the handler above is the only one attached
log = logging.getLogger('tensorflow')
log.handlers = []
log.setLevel(logging.INFO)
log.addHandler(ch)

##Vocabulary for the model, one token per line: the five special tokens
##followed by the single-letter tokens (B and J are markers for the beginning
##and ending of a protein sequence)
vocab = "\n".join(
    ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    + list("LSBJEAPTGVKRDQINFHYCMW")
)

##Write the vocabulary to vocab.txt, terminating every token with a newline
with open("vocab.txt", "w") as fo:
  fo.writelines(token+"\n" for token in vocab.split("\n"))


##Maps each supported MODE to (processor class, script module that converts
##examples to tf_record features). Change this part if you added more modes.
_MODE_TABLE = {
    "MRPC": (MrpcProcessor, run_classifier),
    "MRPC_w_preds": (MrpcWithPredsProcessor, run_classifier),
    "RE": (REProcessor, run_classifier),
    "NER": (NERProcessor, run_ner_for_pathogenic),
}

if MODE not in _MODE_TABLE:
  # List every available mode (derived from the table so the message cannot
  # drift out of sync with the supported modes).
  raise ValueError(
      "The mode specified ({!r}) was not one of the available modes: {}.".format(
          MODE, sorted(_MODE_TABLE)))

_processor_class, script = _MODE_TABLE[MODE]
processor = _processor_class()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=False)

# Specify Data location/Mount Drive if needed

In [None]:
if not GCP_RUNTIME:
  from google.colab import drive,auth
import os
import shutil
#@markdown input finetuning data folder (can be a GCS path if needed for large databases; cannot be a drive path if using GCP_RUNTIME): for generating multiple sets of file i.e. for different sequence lengths, xxx is the placeholder for each identifier (if not generating multiple files, just put the plain path of the directory here without xxx)
data_folder_format = "/content/drive/My Drive/BERT finetuning/MRPC/w_added_only_others_modified_bert_mrpc_512" #@param {type: "string"}
if "/content/drive" in data_folder_format:
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)


# Data Generation

### General setup and definitions

In [None]:
#@markdown maximum batch size the training script can handle without OOM (must be divisible by NUM_TPU_CORES_WHEN_TESTING)
MAX_BATCH_SIZE =  1024 #@param {type:"integer"}
#@markdown if using PRECISE_TESTING, how many TPU cores will be used during testing (for colab runtimes, it's 8)
NUM_TPU_CORES_WHEN_TESTING = 8 #@param {type:"integer"}


# GCS URI of the bucket that all generated data directories are written under
BUCKET_PATH = "gs://%s" % (BUCKET_NAME,)

def _largest_multiple_at_most(total, base):
  """Return the largest multiple of `base` that is <= `total` (integer math)."""
  return (total // base) * base


def _iter_read_ranges(USING_SHARDS, SHARD_SIZE):
  """Yield (shard_index, read_range) pairs for reading a dataset.

  Without sharding a single (None, None) pair is yielded, meaning "read
  everything". With sharding an endless sequence of consecutive
  [start, start+SHARD_SIZE] ranges is yielded; the caller stops iterating
  once a range yields no examples.
  """
  if not USING_SHARDS:
    yield None, None
    return
  shard_index = 0
  while True:
    start = shard_index * SHARD_SIZE
    yield shard_index, [start, start + SHARD_SIZE]
    shard_index += 1


def _generate_split(get_examples, data_folder, base_file,
                    USING_SHARDS, SHARD_SIZE, write_examples):
  """Read one dataset split (shard by shard if requested) and write it out.

  `get_examples(data_folder, read_range=...)` supplies the examples and
  `write_examples(examples, out_file)` serializes them. When sharding, the
  shard index is appended to `base_file` as "_<index>".
  """
  for shard_index, read_range in _iter_read_ranges(USING_SHARDS, SHARD_SIZE):
    examples = get_examples(data_folder, read_range=read_range)
    if len(examples) == 0:
      break
    out_file = base_file
    if shard_index is not None:
      out_file += "_" + str(shard_index)
    write_examples(examples, out_file)


def generate_data(MAX_SEQ_LENGTH,
                  DATA_GCS_DIR,
                  ID,
                  PRECISE_TESTING,
                  USING_SHARDS,
                  SHARD_SIZE):
  """Convert the train, eval and test sets of one dataset into tf_record files.

  Relies on notebook globals set by earlier cells: `processor`, `script`,
  `label_list`, `tokenizer`, `data_folder_format` and `MAX_BATCH_SIZE`.

  Args:
    MAX_SEQ_LENGTH: maximum sequence length passed to the feature converter.
    DATA_GCS_DIR: output directory for the generated tf_record files.
    ID: identifier substituted (as a string) for "xxx" in `data_folder_format`
      to locate the input folder.
    PRECISE_TESTING: if True, the final chunk of the test set is split into a
      head whose size is a multiple of MAX_BATCH_SIZE ("test.tf_record") and
      the remaining examples ("test_trailing.tf_record"), so that no datapoint
      has to be dropped at prediction time.
    USING_SHARDS: if True, each split is read and written SHARD_SIZE examples
      at a time and the shard index is appended to the output file name.
    SHARD_SIZE: number of examples per shard when USING_SHARDS is True.

  Each split is generated on a best-effort basis: a failure in one split is
  reported (message plus traceback) and the remaining splits are still
  attempted.
  """
  import traceback

  data_folder_current= data_folder_format.replace("xxx",str(ID))

  def write_examples(examples, out_file):
    script.file_based_convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer, out_file)

  def write_test_examples(examples, out_file):
    ## if using precise testing, the data will be split into two sets:
    ## one set will be able to be predicted on the maximum possible batch
    ## size, while the other will be predicted on a batch size of one, to
    ## ensure the fastest prediction without leaving out any datapoints.
    ## Only the final chunk is split: without sharding the whole set is the
    ## final chunk (regardless of SHARD_SIZE); with sharding it is the shard
    ## that comes back shorter than SHARD_SIZE.
    is_final_chunk = (not USING_SHARDS) or len(examples) < SHARD_SIZE
    if PRECISE_TESTING and is_final_chunk:
      trailing_file = os.path.join(DATA_GCS_DIR, "test_trailing.tf_record")
      split = _largest_multiple_at_most(len(examples), MAX_BATCH_SIZE)
      write_examples(examples[:split], out_file)
      write_examples(examples[split:], trailing_file)
    else:
      write_examples(examples, out_file)

  # (set name, name used in the failure message, processor getter, output
  #  file name, writer)
  splits = [
      ("train", "training", "get_train_examples", "train.tf_record", write_examples),
      ("eval", "eval", "get_dev_examples", "eval.tf_record", write_examples),
      ("test", "testing", "get_test_examples", "test.tf_record", write_test_examples),
  ]

  for set_name, failure_name, getter_name, file_name, writer in splits:
    try:
      print("\nGenerating {} set...\n".format(set_name))
      _generate_split(getattr(processor, getter_name),
                      data_folder_current,
                      os.path.join(DATA_GCS_DIR, file_name),
                      USING_SHARDS,
                      SHARD_SIZE,
                      writer)
    except Exception as e:
      # Best effort: report the failure (with traceback, so the cause is
      # diagnosable) and continue with the next split.
      print("{} data generation failed. Error:".format(failure_name),e)
      traceback.print_exc()

### Varying sequence lengths

In [None]:
#@markdown list of maximum sequence lengths to generate data for
lengths = [64,128,256,512,1024] #@param
#@markdown whether or not to ensure all datapoints are predicted
PRECISE_TESTING = False #@param {type:"boolean"}
#@markdown whether or not to split the data processing into shards (for really large databases, since finetuning data typically isn't that large)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown if USING_SHARDS, what shard size to use (must be divisible by MAX_BATCH_SIZE)
SHARD_SIZE = 1024000 #@param {type:"integer"}

# USING_SHARDS and SHARD_SIZE are defined in this cell so that it runs on a
# fresh kernel without first executing one of the other generation cells.
for MAX_SEQ_LENGTH in lengths:
  print("Generating data for seq length:",MAX_SEQ_LENGTH)
  # Both the output directory and the input folder substitute the sequence
  # length for the "xxx" placeholder.
  DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, FINETUNING_DATA_DIR_format.replace("xxx",str(MAX_SEQ_LENGTH)))

  generate_data(MAX_SEQ_LENGTH,
                DATA_GCS_DIR,
                MAX_SEQ_LENGTH,
                PRECISE_TESTING,
                USING_SHARDS,
                SHARD_SIZE)
  

### Only one dataset

In [None]:
#@markdown maximum output data length (because using paired method, actual protein sequence length is half)
MAX_SEQ_LENGTH = 512 #@param {type:"integer"}
#@markdown whether or not to ensure all datapoints are predicted
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown whether or not to split the data processing into shards (for really large databases, since finetuning data typically isn't that large)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown if USING_SHARDS, what shard size to use (must be divisible by MAX_BATCH_SIZE)
SHARD_SIZE = 1024000 #@param {type:"integer"}

# Output directory: any "xxx" placeholder in the directory name is replaced by
# the sequence length.
DATA_GCS_DIR = "/".join((BUCKET_PATH,
                         FINETUNING_DATA_DIR_format.replace("xxx",str(MAX_SEQ_LENGTH))))

# The empty ID means any "xxx" placeholder in the input folder path is removed.
generate_data(MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
              DATA_GCS_DIR=DATA_GCS_DIR,
              ID="",
              PRECISE_TESTING=PRECISE_TESTING,
              USING_SHARDS=USING_SHARDS,
              SHARD_SIZE=SHARD_SIZE)


### Varying identifiers

In [None]:
#@markdown maximum batch size the training script can handle without OOM
MAX_BATCH_SIZE =  1024 #@param {type:"integer"}
#@markdown whether or not to ensure all datapoints are predicted
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown maximum output data length (because using paired method, actual protein sequence length is half)
MAX_SEQ_LENGTH = 512 #@param {type:"integer"}
#@markdown whether or not to split the data processing into shards (for really large databases, since finetuning data typically isn't that large)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown if USING_SHARDS, what shard size to use (must be divisible by MAX_BATCH_SIZE)
SHARD_SIZE = 1024000 #@param {type:"integer"}
#@markdown list of identifiers to generate data for
identifiers = ["a","b","c"] #@param

# USING_SHARDS and SHARD_SIZE are defined in this cell so that it runs on a
# fresh kernel without first executing one of the other generation cells.
for identifier in identifiers:
  # str() keeps the output directory consistent with generate_data, which also
  # stringifies the identifier, and allows non-string identifiers (e.g. ints).
  DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, FINETUNING_DATA_DIR_format.replace("xxx",str(identifier)))
  generate_data(MAX_SEQ_LENGTH,
                DATA_GCS_DIR,
                identifier,
                PRECISE_TESTING,
                USING_SHARDS,
                SHARD_SIZE)


