#Finetuning Data Generation Script

This notebook processes tsv data and uploads the processed data to GCS to be used for finetuning MutFormer.

# Configure settings/Mount Drive if needed

In [None]:
# Colab form cell: each "#@markdown" line is rendered as form text and each
# "#@param" annotation turns the assignment on that line into a form field.
# The values set here are read by the later cells of this notebook.
#@markdown ## General Config
#@markdown Whether or not this script is being run in a GCP runtime (if more memory is required for large databases)
GCP_RUNTIME = False #@param {type:"boolean"}
#@markdown Which mode to use (a different mode means a different finetuning task): options are:
#@markdown * "MRPC" - paired sequence method
#@markdown * "MRPC_w_ex_data" - paired sequence method with external data
#@markdown * "RE" - single sequence method
#@markdown * "NER" - single sequence per residue prediction
#@markdown 
#@markdown You can add more modes by creating a new processor and/or a new model_fn inside of the "mutformer_model_code" folder downloaded from github, then changing the corresponding code snippets in the code segment named "Authorize for GCS, Imports, and General Setup" (also edit the dropdown below).
MODE = "MRPC" #@param   ["MRPC_w_ex_data", "MRPC", "RE", "NER"]   {type:"string"} 
            ####      ^^^^^ dropdown list for all modes ^^^^^

#@markdown Name of the GCS bucket to use:
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
# Full GCS URI of the bucket; the output folder is created underneath this path.
BUCKET_PATH = "gs://"+BUCKET_NAME
#@markdown \
#@markdown 
#@markdown 
#@markdown ## IO Config
#@markdown Input finetuning data folder: data will be read from here to be processed and uploaded to GCS (can be a drive path, or a GCS path if needed for large databases; must be a GCS path if using GCP_RUNTIME):
#@markdown 
#@markdown * For processing multiple sets i.e. for multiple sequence lengths, simply store these sets into separate subfolders inside of the folder listed below, with each subfolder being named as specified in the following section.
#@markdown 
#@markdown * For processing a single set, this folder should directly contain one dataset. 
#@markdown
# Read by the data generation cells; a path containing "/content/drive" triggers the Drive mount below.
INPUT_DATA_DIR = "gs://theodore_jiang/finetune_updated_data/MRPC_wo_preds" #@param {type: "string"}


if not GCP_RUNTIME:                    ##if INPUT_DATA_DIR is a drive path,
  if "/content/drive" in INPUT_DATA_DIR:   ##mount google drive
    from google.colab import drive
    if GCP_RUNTIME:
      raise Exception("if GCP_RUNTIME, a GCS path must be used, since Google's cloud TPUs can only communicate with GCS and not drive")
    !fusermount -u /content/drive
    drive.flush_and_unmount()
    drive.mount('/content/drive', force_remount=True)


#@markdown Name of the folder in GCS to put processed data into: 
#@markdown * For generating multiple datasets i.e. for different sequence lengths, they will be written as individual subfolders inside of this folder.
OUTPUT_DATA_DIR = "MRPC_finetune_update_loaded" #@param {type:"string"}


## Parameters of the dataset being generated; this dictionary is written out
## next to each dataset (as info.json) and starts with just the selected mode.
DATA_INFO = dict(mode=MODE)


#### Vocabulary for the model (MutFormer uses the vocabulary below).
#### [PAD], [UNK], [CLS], [SEP], and [MASK] are necessary default tokens;
#### B and J mark the beginning and the end of a protein sequence,
#### respectively; the remaining tokens are all possible amino acids, ranked
#### approximately by frequency of occurrence in the human population.
_VOCAB_TOKENS = [
    "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
    "L", "S", "B", "J", "E", "A", "P", "T", "G", "V", "K",
    "R", "D", "Q", "I", "N", "F", "H", "Y", "C", "M", "W",
]
vocab = "\n".join(_VOCAB_TOKENS)

## Write one token per line (each line newline-terminated) to vocab.txt,
## which is the file later handed to the tokenizer.
with open("vocab.txt", "w") as fo:
  fo.write("".join(token + "\n" for token in vocab.split("\n")))


#If running on a GCP runtime, follow these instructions to set it up:

###1) Create a VM from the GCP website
###2) Open a command prompt on your computer and perform the following steps:
To ssh into the VM, run:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Note: Make sure the port above matches the port below (in this case it's 8888)
\
\
In the new command prompt that popped out, either run each of the commands below individually, or copy and paste the one liner below:
```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
One command:
```
sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
###3) In this notebook, click the "connect to local runtime" option under the connect button, then copy and paste the link printed in the command prompt that starts with "localhost: ..."

#Clone the MutFormer repo

In [None]:
if GCP_RUNTIME:
  !sudo apt-get -y install git-all
#@markdown Where to clone the repo into:
REPO_DESTINATION_PATH = "mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

Cloning into 'mutformer'...
remote: Enumerating objects: 614, done.[K
remote: Counting objects: 100% (415/415), done.[K
remote: Compressing objects: 100% (335/335), done.[K
remote: Total 614 (delta 299), reused 111 (delta 78), pack-reused 199[K
Receiving objects: 100% (614/614), 2.14 MiB | 7.06 MiB/s, done.
Resolving deltas: 100% (410/410), done.


#Authorize for GCS, Imports, and General Setup

In [None]:
## Colab-only setup: the google.colab package is imported only when not on a
## GCP runtime, so everything in this branch is skipped when GCP_RUNTIME is set.
if not GCP_RUNTIME:
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()   ## interactive login for the current Colab user
  print("Authorize done")

  ## Colab magic requesting TensorFlow 1.x; it comes before the
  ## `import tensorflow` that follows this block.
  %tensorflow_version 1.x
import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil
import importlib

## The folder name "mutformer" is reserved for the model code package copied
## below, so a repo that was cloned into "mutformer" is first moved aside to
## "mutformer_code" and REPO_DESTINATION_PATH is pointed at the copy.
if REPO_DESTINATION_PATH == "mutformer":
  if os.path.exists("mutformer_code"):
    shutil.rmtree("mutformer_code")
  shutil.copytree(REPO_DESTINATION_PATH,"mutformer_code")
  REPO_DESTINATION_PATH = "mutformer_code"

## Replace any existing "mutformer" folder with a fresh copy of the model code.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

## Make sure "mutformer" ends up as the last entry of sys.path, dropping one
## earlier occurrence if there is one.
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

#### <<<<< if you added more modes, change these imports to import the correct processors
#### <<<<< and correct training scripts (i.e. run_classifier and run_ner_for_pathogenic)
from mutformer import modeling, optimization, tokenization,run_classifier,run_ner_for_pathogenic

##reload modules so that you don't need to restart the runtime to reload modules in case that's needed
modules2reload = [modeling, 
                  optimization, 
                  tokenization,
                  run_classifier,
                  run_ner_for_pathogenic]
for module in modules2reload:
    importlib.reload(module)

##import the processor classes only AFTER the reload: names bound with
##`from ... import ...` are not updated by importlib.reload, so importing them
##earlier would leave these names pointing at the pre-reload class objects
from mutformer.run_classifier import MrpcProcessor,REProcessor,MrpcWithExDataProcessor
from mutformer.run_ner_for_pathogenic import NERProcessor

# configure logging: route the 'tensorflow' logger through a single stream
# handler that prints timestamped INFO-level messages

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

log = logging.getLogger('tensorflow')
log.handlers = []          # discard previously attached handlers (avoids duplicated output on re-run)
log.setLevel(logging.INFO)
log.addHandler(ch)


####       vvvvv if you added more modes, add them to this table to set the processors and training scripts correctly vvvvv
## Maps every supported MODE to:
##   (processor class, training script module, whether external data is used)
MODE_SETTINGS = {
    "MRPC":           (MrpcProcessor,           run_classifier,         False),
    "MRPC_w_ex_data": (MrpcWithExDataProcessor, run_classifier,         True),
    "RE":             (REProcessor,             run_classifier,         False),
    "NER":            (NERProcessor,            run_ner_for_pathogenic, False),
}
if MODE not in MODE_SETTINGS:
  ## The list of modes in the message is built from the table so that it
  ## always names every supported mode (including "MRPC_w_ex_data").
  raise Exception("The mode specified was not one of the available modes: [" +
                  ", ".join("\"" + mode_name + "\"" for mode_name in MODE_SETTINGS) + "].")
processor_class, script, USE_EX_DATA = MODE_SETTINGS[MODE]
processor = processor_class()   ## only the processor of the selected mode is instantiated
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=False)
                      ####       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Authorize for GCS:
Authorize done


# Data Generation

###General setup and definitions

In [None]:
#@markdown Maximum batch size the finetuning_benchmark script can handle without OOM (must be divisible by NUM_TPU_CORES_WHEN_TESTING):
MAX_BATCH_SIZE =  1024 #@param {type:"integer"}
#@markdown How many tpu cores will be used during evaluation and prediction (for colab runtimes, it's 8):
NUM_TPU_CORES_WHEN_TESTING = 8 #@param {type:"integer"}

## Enforce the constraints stated in the form text above, so that a bad
## configuration is reported here instead of surfacing later.
if MAX_BATCH_SIZE <= 0 or NUM_TPU_CORES_WHEN_TESTING <= 0:
  raise ValueError("MAX_BATCH_SIZE and NUM_TPU_CORES_WHEN_TESTING must both be positive integers")
if MAX_BATCH_SIZE % NUM_TPU_CORES_WHEN_TESTING != 0:
  raise ValueError("MAX_BATCH_SIZE ("+str(MAX_BATCH_SIZE)+") must be divisible by "
                   "NUM_TPU_CORES_WHEN_TESTING ("+str(NUM_TPU_CORES_WHEN_TESTING)+")")

def _count_ex_data(tsv_path):
  """Return the number of whitespace-separated items in the 4th tab-separated
  column of the first line of `tsv_path` (the external data of one datapoint)."""
  with tf.gfile.Open(tsv_path) as tsv_file:   ##closed on exit; only the first line is read
    first_line = tsv_file.readline()
  return len(first_line.rstrip("\n").split("\t")[3].split())


def _shard_read_ranges(USING_SHARDS, SHARD_SIZE):
  """Yield (shard_index, read_range) pairs for the processor calls.

  Without shards a single (None, None) pair is yielded. With shards an endless
  sequence (0, [0, SHARD_SIZE]), (1, [SHARD_SIZE, 2*SHARD_SIZE]), ... is
  yielded; the caller stops as soon as a range produces no examples."""
  if not USING_SHARDS:
    yield None, None
    return
  shard_index = 0
  while True:
    range_start = shard_index * SHARD_SIZE
    yield shard_index, [range_start, range_start + SHARD_SIZE]
    shard_index += 1


def _write_split(split_name,
                 get_examples,
                 MAX_SEQ_LENGTH,
                 data_folder_current,
                 DATA_GCS_DIR,
                 USING_SHARDS,
                 SHARD_SIZE,
                 split_trailing=False):
  """Convert one data split (train/eval/test) into tf_record file(s).

  Args:
    split_name: prefix of the output file ("<split_name>.tf_record").
    get_examples: processor method called as
      get_examples(data_folder_current, read_range=...) to obtain the examples.
    MAX_SEQ_LENGTH, data_folder_current, DATA_GCS_DIR, USING_SHARDS,
      SHARD_SIZE: same meaning as in generate_data.
    split_trailing: if True, the final chunk of data is divided into a head
      whose size is a multiple of MAX_BATCH_SIZE and a remainder written to
      "<split_name>_trailing.tf_record".
  """
  record_file_base = os.path.join(DATA_GCS_DIR, split_name+".tf_record")
  for shard_index, read_range in _shard_read_ranges(USING_SHARDS, SHARD_SIZE):
    examples = get_examples(data_folder_current,read_range=read_range)
    if len(examples) == 0:      ##nothing (left) to write
      break
    record_file = record_file_base
    if shard_index is not None:
      record_file += "_"+str(shard_index)

    ## Only the last chunk of data can have a size that is not a multiple of
    ## MAX_BATCH_SIZE: without shards that is the whole split, with shards it
    ## is the shard that came back smaller than SHARD_SIZE.
    is_last_chunk = (not USING_SHARDS) or len(examples) < SHARD_SIZE
    if split_trailing and is_last_chunk:
      trailing_file = os.path.join(DATA_GCS_DIR, split_name+"_trailing.tf_record")
      head_size = (len(examples) // MAX_BATCH_SIZE) * MAX_BATCH_SIZE
      script.file_based_convert_examples_to_features(
          examples[:head_size], label_list, MAX_SEQ_LENGTH, tokenizer, record_file)
      script.file_based_convert_examples_to_features(
          examples[head_size:], label_list, MAX_SEQ_LENGTH, tokenizer, trailing_file)
    else:
      script.file_based_convert_examples_to_features(
          examples, label_list, MAX_SEQ_LENGTH, tokenizer, record_file)


def generate_data(MAX_SEQ_LENGTH,
                  data_folder_current,
                  DATA_GCS_DIR,
                  PRECISE_TESTING,
                  USING_SHARDS,
                  SHARD_SIZE):  
  """Process one dataset folder and write info.json plus tf_record files.

  Each of the four stages (info json, train, eval, test) is best-effort: a
  failure is printed and the remaining stages are still attempted.

  Args:
    MAX_SEQ_LENGTH: sequence length passed to the feature conversion and
      stored in DATA_INFO["sequence_length"].
    data_folder_current: folder the processor reads the tsv data from.
    DATA_GCS_DIR: folder the info.json and tf_record files are written to.
    PRECISE_TESTING: if True, the test set is split into a part whose size is
      a multiple of MAX_BATCH_SIZE ("test.tf_record") and the remainder
      ("test_trailing.tf_record"), so that prediction can use the maximum
      batch size without leaving out any datapoints.
    USING_SHARDS: if True, each split is read and written in chunks of
      SHARD_SIZE datapoints, with "_<shard index>" appended to the file name.
    SHARD_SIZE: number of datapoints per shard (only used if USING_SHARDS).
  """

  try:
    print("\nUpdating and uploading data info json...\n")
    DATA_INFO["sequence_length"] = MAX_SEQ_LENGTH    ##update data info with sequence length

    if USE_EX_DATA:                         ##if using external data, update data info
      tsv_files = [file for file in tf.io.gfile.listdir(data_folder_current)   ##with the # of external
                   if file.endswith(".tsv")]                                   ##datapoints being used
      DATA_INFO["ex_data_num"] = _count_ex_data(data_folder_current+"/"+tsv_files[0])  ##just use the first tsv in the folder
    
    with tf.gfile.Open(DATA_GCS_DIR+"/info.json","w+") as out: ##writes out a dictionary containing
      json.dump(DATA_INFO,out,indent=2)                           ##the dataset's parameters
    print("Data info json uploaded successfully")
  except Exception as e:
    print("could not update and upload data info json. Error:",e)


  try:
    print("\nGenerating train set...\n")
    _write_split("train", processor.get_train_examples, MAX_SEQ_LENGTH,
                 data_folder_current, DATA_GCS_DIR, USING_SHARDS, SHARD_SIZE)
  except Exception as e:
    print("training data generation failed. Error:",e)

  try:
    print("\nGenerating eval set...\n")
    _write_split("eval", processor.get_dev_examples, MAX_SEQ_LENGTH,
                 data_folder_current, DATA_GCS_DIR, USING_SHARDS, SHARD_SIZE)
  except Exception as e:
    print("eval data generation failed. Error:",e)

  try:
    print("\nGenerating test set...\n")
    _write_split("test", processor.get_test_examples, MAX_SEQ_LENGTH,
                 data_folder_current, DATA_GCS_DIR, USING_SHARDS, SHARD_SIZE,
                 split_trailing=PRECISE_TESTING)
  except Exception as e:
    print("testing data generation failed. Error:",e)




###Generation ops

There are currently two data generation ops (more can be added):
1. Varying sequence lengths: multiple sets of different sequence lengths will be generated
  * Store multiple individual datasets as subfolders inside of Input finetuning data folder, with each folder named its corresponding sequence length.
2. Only one dataset: a single dataset with a specified set of parameters will be generated 
  * Directly store only the files train.tsv, dev.tsv, and test.tsv for one dataset inside Input finetuning data folder

####Varying sequence lengths

In [None]:
#@markdown List of maximum sequence lengths to generate data for
MAX_SEQ_LENGTHS = [1024,512,256,128,64] #@param
#@markdown Whether or not to ensure all datapoints are used during prediction by using an extra trailing test dataset so no datapoints will be skipped due to the batch size. (This option should be used unless an extra trailing test dataset is a large problem)
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown Whether or not to split the data processing into shards (for really large databases, since finetuning data typically isn't that large)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown If USING_SHARDS, what shard size to use (how many lines/datapoints should be in each shard) (MUST BE DIVISIBLE BY "MAX_BATCH_SIZE")
SHARD_SIZE = 1024000 #@param {type:"integer"}

## One dataset per sequence length: input is read from the subfolder named
## after the sequence length, and output goes to a subfolder of the same name.
for MAX_SEQ_LENGTH in MAX_SEQ_LENGTHS:
  print("\n\nGenerating data for seq length:",MAX_SEQ_LENGTH,"\n\n")
  seq_length_folder = str(MAX_SEQ_LENGTH)
  DATA_GCS_DIR = "/".join([BUCKET_PATH, OUTPUT_DATA_DIR, seq_length_folder])
  data_folder_current = "/".join([INPUT_DATA_DIR, seq_length_folder])

  generate_data(MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
                data_folder_current=data_folder_current,
                DATA_GCS_DIR=DATA_GCS_DIR,
                PRECISE_TESTING=PRECISE_TESTING,
                USING_SHARDS=USING_SHARDS,
                SHARD_SIZE=SHARD_SIZE)
  

###Only one dataset

In [None]:
#@markdown Maximum output data length (when using paired method, actual protein sequence length is about half of this value):
MAX_SEQ_LENGTH = 512 #@param {type:"integer"}
#@markdown Whether or not to ensure all datapoints are used during prediction by using an extra trailing test dataset so no datapoints will be skipped due to the batch size. (This option should be used most of the time unless an extra trailing test dataset is a large problem)
PRECISE_TESTING = True #@param {type:"boolean"}
#@markdown Whether or not to split the data processing into shards (for really large databases, since finetuning data typically isn't that large)
USING_SHARDS = False #@param {type:"boolean"}
#@markdown If USING_SHARDS, what shard size to use (how many lines/datapoints should be in each shard) (MUST BE DIVISIBLE BY "MAX_BATCH_SIZE")
SHARD_SIZE = 1024000 #@param {type:"integer"}

## Single dataset: the input folder is used directly, while the output still
## goes into a subfolder named after the sequence length.
data_folder_current = INPUT_DATA_DIR
DATA_GCS_DIR = "/".join([BUCKET_PATH, OUTPUT_DATA_DIR, str(MAX_SEQ_LENGTH)])

generate_data(MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
              data_folder_current=data_folder_current,
              DATA_GCS_DIR=DATA_GCS_DIR,
              PRECISE_TESTING=PRECISE_TESTING,
              USING_SHARDS=USING_SHARDS,
              SHARD_SIZE=SHARD_SIZE)
