Note: Run multiple copies of this notebook in multiple VMs to generate data in parallel for multiple models

**Note: TO ACCESS ANY BUCKET YOU HAVE PERMISSION TO VIEW: go to this address: https://console.cloud.google.com/storage/browser/(BUCKET_NAME)**



# Configure settings

In [None]:
## Notebook-wide settings, exposed as Colab form fields. Each "#@param" marker must stay on the
## same line as its assignment, and the order of the lines is the order of the rendered form.
#@markdown ###General Config
#@markdown #####Whether you are using a Google Cloud VM and Google Cloud Platform TPU; NOTE: make sure this value is the same in the training script when you run it:
USE_GCP_TPU = True #@param {type:"boolean"}
## passed to create_pretraining_data.py as --max_seq_length; also used when sharding GCS data
## to size the tail that is read back to recover a line cut at a shard boundary
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
#@markdown #####also make sure this value is the same in the training script:
## number of training-data copies (indexes 0..DATA_COPIES-1) the generation loop cycles through
DATA_COPIES = 20 #@param {type:"integer"}
## passed as --masked_lm_prob
MASKED_LM_PROB = 0.15 #@param
## passed as --max_predictions_per_seq
MAX_PREDICTIONS = 20 #@param {type:"integer"}
## passed as --do_lower_case
DO_LOWER_CASE = False #@param {type:"boolean"}
## NOTE(review): PROCESSES is not referenced by any cell visible in this notebook
PROCESSES = 2 #@param {type:"integer"}
## GCS bucket (without the gs:// prefix) that generated data and bookkeeping files are copied to
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
## appended to PRETRAINING_DIR (as PRETRAINING_DIR + "_" + MODEL_ID) to name the training-data folder
MODEL_ID = "modified_large" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data_1024" #@param {type:"string"}
TESTING_DIR = "testing_data_1024" #@param {type:"string"}
EVAL_DIR = "eval_data_1024" #@param {type:"string"}
#@markdown #####for miscellaneous temporary storage (make sure this value is the same in the training script)
TEMP_DIR = "modified_large_temp" #@param {type:"string"}
#@markdown whether or not this script is being run in a GCP runtime (if more memory is required for large databases)
## when True, the Colab-only steps (google.colab imports, authentication) are skipped
GCP_RUNTIME = False #@param {type:"boolean"}

## Token vocabulary, one token per line: the five special tokens followed by the
## single-letter residues (B and J mark the beginning and the end of a protein sequence)
vocab = \
'''[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
L
S
B
J
E
A
P
T
G
V
K
R
D
Q
I
N
F
H
Y
C
M
W'''


## Write the vocabulary to vocab.txt, every token terminated by a newline
with open("vocab.txt", "w") as vocab_file:
  vocab_file.writelines(entry+"\n" for entry in vocab.split("\n"))

# If using a GCP runtime to generate data (when the database is large and more memory is needed), run these commands before running this notebook

To ssh into the VM:

```
gcloud beta compute ssh --zone <COMPUTE ZONE> <VM NAME> --project <PROJECT NAME> -- -L 8888:localhost:8888
```

Make sure the port above matches the port below (in this case it's 8888)

```
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo apt-get install pkg-config
sudo apt-get install libhdf5-serial-dev
sudo apt-get install libffi6 libffi-dev
sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm
sudo -H pip3 install jupyter_http_over_ws
jupyter serverextension enable --py jupyter_http_over_ws
jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser

(one command):sudo apt-get update ; sudo apt-get -y install python3 python3-pip ; sudo apt-get install pkg-config ; sudo apt-get -y install libhdf5-serial-dev ; sudo apt-get install libffi6 libffi-dev; sudo -H pip3 install jupyter tensorflow==1.14 google-api-python-client tqdm ; sudo -H pip3 install jupyter_http_over_ws ; jupyter serverextension enable --py jupyter_http_over_ws ; jupyter notebook   --NotebookApp.allow_origin='https://colab.research.google.com'   --port=8888   --NotebookApp.port_retries=0   --no-browser
```
Then copy the printed link containing "localhost: ..." and paste it into Colab's "Connect to a local runtime" option


# Clone the repo

In [None]:
if GCP_RUNTIME:
  !sudo apt-get -y install git
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

# Imports/Authenticate for GCP

In [None]:
if not GCP_RUNTIME:
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

  %tensorflow_version 1.x

import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil
from google.colab import auth

## Refresh the local "mutformer" package from the cloned repo: remove a stale copy if one exists,
## then copy the model code into the working directory
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

## Move the "mutformer" entry to the end of sys.path (drop one earlier occurrence, then append)
if "mutformer" in sys.path:
  sys.path.remove("mutformer")
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_pretraining import input_fn_builder, model_fn_builder

# configure logging: timestamped INFO-level records from the 'tensorflow' logger go to a stream handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# build the handler first and give it the formatter and the INFO threshold
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

# then attach it to the 'tensorflow' logger, which is also set to INFO
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
log.addHandler(ch)


# Specify Data location/Mount Drive if needed

In [None]:
if not GCP_RUNTIME:
  from google.colab import drive,auth
import os
import shutil
#@markdown input data folder (can be GCS for large databases)
data_folder = "gs://theodore_jiang/gcs_pretraining_data" #@param {type: "string"}
if "/content/drive" in data_folder:
  !fusermount -u /content/drive
  drive.flush_and_unmount()
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH = "/content/drive/My Drive"

## Data preparation (make shards)

In [None]:
## Shard sizes. Which of the two values is used depends on whether data_folder is a gs:// location.
#@markdown if using data from Google Drive, size of a single chunk/shard of data (in terms of lines/datapoints)
chunk_size_gd = 256000 #@param {type:"number"}
#@markdown if using data from GCS, size of a single chunk/shard of data (in terms of bytes)
chunk_size_gcs =  50e6 #@param {type:"number"}
## 50e6 is a float literal; convert it to an int because it is used for byte-offset arithmetic
chunk_size_gcs = int(chunk_size_gcs)

if "gs://" in data_folder:
  def make_shards(dataset):
    print("Generating shards for "+dataset+":\n")
    if os .path.exists("./shards_"+dataset+""): ##data will be written as shards to prevent one single files from getting too large
      shutil.rmtree("./shards_"+dataset+"")
    cmd = "mkdir ./shards_"+dataset
    print("removing existing data if it exists...")
    cmd = "gsutil -m rm -r "+data_folder+"/shards_"+dataset+""
    !{cmd}
    start = 0
    end = chunk_size_gcs
    previous_truncated = ""
    i=0
    while True:
      print("Processing shard "+str(i))
      ##download the selected portion of the input file
      cmd = "gsutil cat -r "+str(start)+"-"+str(end)+" "+data_folder+"/"+dataset+".txt"+" | gsutil -q cp - ./shards_"+dataset+"/shard_tmp"
      !{cmd}
      ##get the line count
      cmd = "wc -l <./shards_"+dataset+"/shard_tmp"
      line_count = !{cmd}
      line_count = int(line_count[0])
      
      ##get the actual byte count
      cmd = "wc -c <./shards_"+dataset+"/shard_tmp"
      byte_count = !{cmd}
      byte_count = int(byte_count[0])
      if line_count == 0:
        print("finished after processing "+str(i)+" shards... appending the last truncated line to the end and continuing\n\n")
        i-=1
        break
      print("processing",line_count,"lines...")
      ##get the last few lines of the downloaded file
      cmd = "dd  if=./shards_"+dataset+"/shard_tmp ibs=1 skip="+str(byte_count-MAX_SEQ_LENGTH*2)+" count="+str(MAX_SEQ_LENGTH*2)+" status=none > previous_tcd.txt"
      !{cmd}
      ##truncate off the last line
      cmd = "sed -ni \'"+str(1)+","+str(line_count)+"p;"+str(line_count)+"q\' ./shards_"+dataset+"/shard_tmp"
      !{cmd}

      ##add the previously truncated line to the front of the file
      cmd = "sed -i \'1s/^/"+previous_truncated+" /\' ./shards_"+dataset+"/shard_tmp >garbage.txt"
      !{cmd}
      ##get the last line, which just got truncated, but will be added to the front of the next shard
      previous_truncated = open("previous_tcd.txt").read().split("\n")[-1]
      ##copy data to GCS
      print("Uploading to GCS...\n")
      cmd = "gsutil -q cp ./shards_"+dataset+"/shard_tmp "+data_folder+"/shards_"+dataset+"/shard_"+str(i)
      !{cmd}

      start+=chunk_size_gcs
      end+=chunk_size_gcs
      i+=1
    ##appending the last truncated line to the end of the last file
    cmd = "gsutil -q cp "+data_folder+"/shards_"+dataset+"/shard_"+str(i)+" ./shards_"+dataset+"/shard_tmp"
    !{cmd}
    with open("./shards_"+dataset+"/shard_tmp","a") as writer:
      writer.write(previous_truncated)
    cmd = "gsutil cp ./shards_"+dataset+"/shard_tmp "+data_folder+"/shards_"+dataset+"/shard_"+str(i)
    !{cmd}
    return data_folder+"/shards_"+dataset
  input_train_dir = make_shards("train")
  input_eval_dir = make_shards("eval")
  input_test_dir = make_shards("test")

else:
  DATA_FPATH_train = data_folder+"/train.txt" 
  DATA_FPATH_eval = data_folder+"/eval.txt"
  DATA_FPATH_test = data_folder+"/test.txt"

  !split -a 4 -l $chunk_size_gd -d $DATA_FPATH_train ./shards_train/shard_
  !ls ./shards_train/"
  input_train_dir = "./shards_train"

  if os .path.exists("./shards_eval"):
    shutil.rmtree("./shards_eval")
  !mkdir ./shards_eval
  !split -a 4 -l $chunk_size_gd -d $DATA_FPATH_eval ./shards_eval/shard_
  !ls ./shards_eval/
  input_eval_dir = "./shards_eval"

  if os .path.exists("./shards_test"):
    shutil.rmtree("./shards_test")
  !mkdir ./shards_test
  !split -a 4 -l $chunk_size_gd -d $DATA_FPATH_test ./shards_test/shard_
  !ls ./shards_test/
  input_test_dir = "./shards_test"

def generate_data(input_dir,dir):
  seed = random.randrange(sys.maxsize)
  input_files = ",".join([input_dir+"/"+file for file in tf.io.gfile.listdir(input_dir)])
  out_files = ",".join([dir+"/"+file+".tfrecord" for file in tf.io.gfile.listdir(input_dir)])
  print("input_files:",input_files,"output_files:",out_files)
  XARGS_CMD = ("python3 mutformer/create_pretraining_data.py "
              "--input_file={} "
              "--output_file={} "
              "--vocab_file=vocab.txt "
              "--do_lower_case={} "
              "--max_predictions_per_seq={} "
              "--max_seq_length={} "
              "--masked_lm_prob={} "
              "--random_seed={} "
              "--dupe_factor=1")

  XARGS_CMD = XARGS_CMD.format(input_files, out_files, 
                              DO_LOWER_CASE, 
                              MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB,seed)
  if os.path.exists(dir):
    shutil.rmtree(dir)
  os.mkdir(dir)
  
  !$XARGS_CMD
    

# Eval Data Generation

In [None]:
generate_data(input_eval_dir,EVAL_DIR)
cmd="gsutil -m cp -r "+EVAL_DIR +" gs://"+BUCKET_NAME
!{cmd}

# Testing Data Generation

In [None]:
generate_data(input_eval_dir,TESTING_DIR)
cmd="gsutil -m cp -r "+TESTING_DIR +" gs://"+BUCKET_NAME
!{cmd}

# Constant Parallel Training Data Generation (for dynamic masking)

In [None]:
import time

## Base names (the ".txt" extension is added when the paths are built) of the bookkeeping files
## that are copied between TEMP_DIR and the shared storage (GCS or Drive)
operating_files = ["available_indexes"]

def download_tmp_files(operating_files): ##for downloading tmp files from drive or GCS
  if not os.path.exists(TEMP_DIR):
    os.mkdir(TEMP_DIR)
  else:
    shutil.rmtree(TEMP_DIR)
    os.mkdir(TEMP_DIR)

  for op_file in operating_files:
    if USE_GCP_TPU: ##If using GCP TPU, drive isn't available, so we need to store temporary files in GCS
      cmd = "gsutil -m cp -r gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt "+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt",TEMP_DIR+"/"+op_file+".txt")

def upload_tmp_files(operating_files): ##for uploading tmp files to drive or GCS
  for op_file in operating_files:
    if USE_GCP_TPU: ##doing the same thing as above^^
      cmd = "gsutil -m cp -r "+TEMP_DIR+"/"+op_file+".txt gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(TEMP_DIR+"/"+op_file+".txt",DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt")

while True:
  for i in range(0,DATA_COPIES):
    run=MODEL_ID
    print("RUN:",run)
    if not os.path.exists(PRETRAINING_DIR+ "_"+ run):
      os.mkdir(PRETRAINING_DIR+ "_"+run)
    else:
      shutil.rmtree(PRETRAINING_DIR+ "_"+run)
      os.mkdir(PRETRAINING_DIR+ "_"+run)
      
    download_tmp_files(operating_files)

    print("processing data for epoch:",i)
    if not os.path.exists(TEMP_DIR+"/available_indexes.txt"): ##checking the tmp files to see which data bins have already been trained on and need to be replaced
      available_indexes = open(TEMP_DIR+"/available_indexes.txt","w+").read().split("\n")[:-1]
    else:
      available_indexes = open(TEMP_DIR+"/available_indexes.txt").read().split("\n")[:-1]
    print("available_indexes:",available_indexes)
    if str(i) not in available_indexes:
      directory = PRETRAINING_DIR+"_"+run+"/"+str(i)
      print("writing into dir:",directory)
      generate_data(input_train_dir,directory)
      cmd="gsutil -m cp -r "+PRETRAINING_DIR+"_"+run+ " gs://"+BUCKET_NAME
      !{cmd}
      open(TEMP_DIR+"/available_indexes.txt","a").write(str(i)+"\n")
      upload_tmp_files(operating_files)

  time.sleep(1200)