Note: Run multiple copies of this notebook in multiple VMs to generate data in parallel for multiple models

# Configure settings

In [18]:
#@markdown ###General Config
#@markdown #####Whether you are using a Google Cloud VM and Google Cloud Platform TPU; NOTE: make sure this value is the same in the training script when you run it:
# When True, the Colab auth step is skipped and the temporary bookkeeping
# files are exchanged through GCS (gsutil) instead of Google Drive.
USE_GCP_TPU = True #@param {type:"boolean"}
# Selects the sequences_<MAX_SEQ_LENGTH>*.txt input files, scales the shard
# size, and is passed to the data-creation script as --max_seq_length.
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
#@markdown #####also make sure this value is the same in the training script:
# Number of training-data copies; the generation loop walks indices
# 0 .. DATA_COPIES-1.
DATA_COPIES = 20 #@param {type:"integer"}
# Passed to the data-creation script as --masked_lm_prob.
MASKED_LM_PROB = 0.15 #@param
# Passed to the data-creation script as --max_predictions_per_seq.
MAX_PREDICTIONS = 20 #@param {type:"integer"}
# Passed to the data-creation script as --do_lower_case.
DO_LOWER_CASE = False #@param {type:"boolean"}
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed.
PROCESSES = 2 #@param {type:"integer"}
# GCS bucket that generated data (and, on GCP, temporary files) is copied to.
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
# Used as the suffix of the training-data directory: <PRETRAINING_DIR>_<MODEL_ID>.
MODEL_ID = "modified_large" #@param {type:"string"}
# Directory names the train / test / eval TFRecords are written under before
# being copied to the bucket.
PRETRAINING_DIR = "pretraining_data_1024" #@param {type:"string"}
TESTING_DIR = "testing_data_1024" #@param {type:"string"}
EVAL_DIR = "eval_data_1024" #@param {type:"string"}
# Local file the vocabulary is written to; passed on as --vocab_file.
VOC_FNAME = "vocab.txt" #@param {type:"string"}
#@markdown #####for miscellaneous temporary storage (make sure this value is the same in the training script)
# Directory (locally and under the bucket / Drive root) holding
# available_indexes.txt.
TEMP_DIR = "modified_large_temp" #@param {type:"string"}


In [19]:
from google.colab import drive
# Tear down any existing mount first (shell-level unmount, then the Colab
# helper) so the remount below starts from a clean state.
!fusermount -u /content/drive
drive.flush_and_unmount()
# force_remount=True is passed so an already-mounted Drive is mounted again.
drive.mount('/content/drive', force_remount=True)
# Root of the user's Drive files; later cells build Drive paths from it.
DRIVE_PATH = "/content/drive/My Drive"

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


# Clone the repo

In [21]:
#@markdown ######where to clone the repo into (only value that it can't be is "mutformer"):
REPO_DESTINATION_PATH = "code/mutformer" #@param {type:"string"}
import os,shutil
if not os.path.exists(REPO_DESTINATION_PATH):
  os.makedirs(REPO_DESTINATION_PATH)
else:
  shutil.rmtree(REPO_DESTINATION_PATH)
  os.makedirs(REPO_DESTINATION_PATH)
cmd = "git clone https://tianqitheodorejiang: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@github.com/WGLab/mutformer.git \"" + REPO_DESTINATION_PATH + "\""
!{cmd}

Cloning into 'code/mutformer'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 79 (delta 31), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (79/79), done.


# Imports

In [22]:
%tensorflow_version 1.x
import sys
import json
import random
import logging
import tensorflow as tf
import time
import os
import shutil

# Print a local-time stamp so the cell output records when this run started.
run_started = time.localtime(time.time())
print(time.strftime('%Y-%m-%d %H:%M:%S', run_started))

from glob import glob

# Interactive Colab authentication is only requested when not running on a
# GCP VM/TPU.
if not USE_GCP_TPU:
  from google.colab import auth
  print("Authorize for GCS:")
  auth.authenticate_user()
  print("Authorize done")

# Refresh the local "mutformer" package from the cloned repo: drop any stale
# copy, then copy the model code in again.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(REPO_DESTINATION_PATH+"/mutformer_model_code","mutformer")

# Move "mutformer" to the end of sys.path (removing one earlier entry if
# present) so re-running the cell does not keep growing the path.
try:
  sys.path.remove("mutformer")
except ValueError:
  pass
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_pretraining import input_fn_builder, model_fn_builder

# Configure the 'tensorflow' logger to emit INFO-level records with a
# timestamped format.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell used to attach one more StreamHandler each time, so
# every log line was printed once per run. Handlers installed by this cell are
# tagged by name and removed before a new one is attached; handlers added by
# anything else are left alone.
NOTEBOOK_HANDLER_NAME = "mutformer_notebook"
for stale_handler in [h for h in log.handlers if h.get_name() == NOTEBOOK_HANDLER_NAME]:
  log.removeHandler(stale_handler)

# Create the stream handler, attach the formatter, and register it.
ch = logging.StreamHandler()
ch.set_name(NOTEBOOK_HANDLER_NAME)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
log.addHandler(ch)


2021-09-04 03:47:23


# Copy data from drive

In [23]:
data_folder = "BERT pretraining/mutformer_pretraining_data" #@param {type: "string"}

# Source files on Drive. Paths are built from DRIVE_PATH (set when Drive is
# mounted) so the Drive root is defined in exactly one place.
drive_sequences_prefix = DRIVE_PATH+"/"+data_folder+"/sequences_"+str(MAX_SEQ_LENGTH)
data_path_train = drive_sequences_prefix+".txt"
data_path_eval = drive_sequences_prefix+"_eval.txt"
data_path_test = drive_sequences_prefix+"_test.txt"

# Local working copies used by the sharding step.
DATA_FPATH_train = "dataset_train.txt"
DATA_FPATH_eval = "dataset_eval.txt"
DATA_FPATH_test = "dataset_test.txt"

copy_jobs = [
  (data_path_train, DATA_FPATH_train),
  (data_path_eval, DATA_FPATH_eval),
  (data_path_test, DATA_FPATH_test),
]

# Check every source before touching local files, so a missing or misnamed
# Drive file cannot leave the working directory with some datasets deleted.
missing_sources = [src for src, _ in copy_jobs if not os.path.exists(src)]
if missing_sources:
  raise FileNotFoundError("Input file(s) not found on Drive: "+", ".join(missing_sources))

for src, dst in copy_jobs:
  # Remove any stale local copy, then fetch the current one from Drive.
  if os.path.exists(dst):
    os.remove(dst)
  shutil.copy(src, dst)

'dataset_test.txt'

## Data preparation

**TO ACCESS ANY BUCKET: go to this address: https://console.cloud.google.com/storage/browser/(BUCKET_NAME)**



In [None]:
DATA_FPATH_train = "dataset_train.txt"
DATA_FPATH_eval = "dataset_eval.txt"
DATA_FPATH_test = "dataset_test.txt"

if os .path.exists("./shards_train"):
  shutil.rmtree("./shards_train")
chunk_size = int(256000/(MAX_SEQ_LENGTH/100))
!mkdir ./shards_train
!split -a 4 -l $chunk_size -d $DATA_FPATH_train ./shards_train/shard_
!ls ./shards_train/

if os .path.exists("./shards_eval"):
  shutil.rmtree("./shards_eval")
!mkdir ./shards_eval
!split -a 4 -l $chunk_size -d $DATA_FPATH_eval ./shards_eval/shard_
!ls ./shards_eval/

if os .path.exists("./shards_test"):
  shutil.rmtree("./shards_test")
!mkdir ./shards_test
!split -a 4 -l $chunk_size -d $DATA_FPATH_test ./shards_test/shard_
!ls ./shards_test/

import json

# Vocabulary, one token per line: five special tokens followed by
# single-letter tokens.
vocab = \
'''[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
L
S
B
J
E
A
P
T
G
V
K
R
D
Q
I
N
F
H
Y
C
M
W'''

# Write every token on its own newline-terminated line.
with open(VOC_FNAME, "w") as fo:
  fo.writelines(token+"\n" for token in vocab.split("\n"))


def generate_data(input_dir,dir):
  seed = random.randrange(sys.maxsize)
  train_files = ",".join([input_dir+"/"+file for file in os.listdir(input_dir)])
  out_files = ",".join([dir+"/"+file+".tfrecord" for file in os.listdir(input_dir)])
  print(train_files,out_files)
  XARGS_CMD = ("python3 mutformer/create_pretraining_data.py "
              "--input_file={} "
              "--output_file={} "
              "--vocab_file={} "
              "--do_lower_case={} "
              "--max_predictions_per_seq={} "
              "--max_seq_length={} "
              "--masked_lm_prob={} "
              "--random_seed={} "
              "--dupe_factor=1")

  XARGS_CMD = XARGS_CMD.format(train_files, out_files, 
                              VOC_FNAME, DO_LOWER_CASE, 
                              MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB,seed)
  print(XARGS_CMD)
  if os.path.exists(dir):
    shutil.rmtree(dir)
  os.mkdir(dir)

  
  !$XARGS_CMD
    

shard_0000  shard_0002	shard_0004  shard_0006
shard_0001  shard_0003	shard_0005
shard_0000
shard_0000


# Eval Data Generation

In [None]:
generate_data("./shards_eval",EVAL_DIR)
cmd="gsutil -m cp -r "+EVAL_DIR +" gs://"+BUCKET_NAME
!{cmd}

./shards_eval/shard_0000 eval_data_1024/shard_0000.tfrecord
python3 mutformer/create_pretraining_data.py --input_file=./shards_eval/shard_0000 --output_file=eval_data_1024/shard_0000.tfrecord --vocab_file=vocab.txt --do_lower_case=False --max_predictions_per_seq=20 --max_seq_length=1024 --masked_lm_prob=0.15 --random_seed=937468900717726535 --dupe_factor=1


W0904 01:01:23.954904 140608366884736 module_wrapper.py:139] From mutformer/create_pretraining_data.py:378: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0904 01:01:23.955085 140608366884736 module_wrapper.py:139] From mutformer/create_pretraining_data.py:378: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0904 01:01:23.955235 140608366884736 module_wrapper.py:139] From /content/mutformer/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0904 01:01:23.955931 140608366884736 module

# Testing Data Generation

In [None]:
generate_data("./shards_test",TESTING_DIR)
cmd="gsutil -m cp -r "+TESTING_DIR +" gs://"+BUCKET_NAME
!{cmd}

./shards_test/shard_0000 testing_data_1024/shard_0000.tfrecord
python3 mutformer/create_pretraining_data.py --input_file=./shards_test/shard_0000 --output_file=testing_data_1024/shard_0000.tfrecord --vocab_file=vocab.txt --do_lower_case=False --max_predictions_per_seq=20 --max_seq_length=1024 --masked_lm_prob=0.15 --random_seed=8632996451205767373 --dupe_factor=1


W0904 01:01:38.154224 140559563118464 module_wrapper.py:139] From mutformer/create_pretraining_data.py:378: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0904 01:01:38.154468 140559563118464 module_wrapper.py:139] From mutformer/create_pretraining_data.py:378: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0904 01:01:38.154699 140559563118464 module_wrapper.py:139] From /content/mutformer/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0904 01:01:38.155388 140559563118464

# Constant Parallel Training Data Generation

In [None]:
import time

operating_files = ["available_indexes"]

def download_tmp_files(operating_files): ##for downloading tmp files from drive or GCS
  for op_file in operating_files:
    if USE_GCP_TPU: ##If using GCP TPU, drive isn't available, so we need to store temporary files in GCS
      cmd = "gsutil -m cp -r gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt "+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt",TEMP_DIR+"/"+op_file+".txt")

def upload_tmp_files(operating_files): ##for uploading tmp files to drive or GCS
  for op_file in operating_files:
    if USE_GCP_TPU: ##doing the same thing as above^^
      cmd = "gsutil -m cp -r "+TEMP_DIR+"/"+op_file+".txt gs://"+BUCKET_NAME+"/"+TEMP_DIR+"/"+op_file+".txt"
      !{cmd}
    else:
      shutil.copy(TEMP_DIR+"/"+op_file+".txt",DRIVE_PATH+"/"+TEMP_DIR+"/"+op_file+".txt")

while True:
  for i in range(0,DATA_COPIES):
    run=MODEL_ID
    print("RUN:",run)
    if not os.path.exists(TEMP_DIR):
      os.mkdir(TEMP_DIR)
    else:
      shutil.rmtree(TEMP_DIR)
      os.mkdir(TEMP_DIR)

    download_tmp_files(operating_files)

    print("processing data for epoch:",i)
    if not os.path.exists(TEMP_DIR+"/available_indexes.txt"):
      available_indexes = open(TEMP_DIR+"/available_indexes.txt","w+").read().split("\n")[:-1]
    else:
      available_indexes = open(TEMP_DIR+"/available_indexes.txt").read().split("\n")[:-1]
    print("available_indexes:",available_indexes)
    if str(i) not in available_indexes:
      directory = PRETRAINING_DIR+"_"+run+"/"+str(i)
      print("writing into dir:",directory)
      generate_data("./shards_train",directory)
      cmd="gsutil -m cp -r "+PRETRAINING_DIR+"_"+run+ " gs://"+BUCKET_NAME
      !{cmd}
      open(TEMP_DIR+"/available_indexes.txt","a").write(str(i)+"\n")
      upload_tmp_files(operating_files)

  time.sleep(1200)

RUN: modified_large
CommandException: No URLs matched: gs://theodore_jiang/modified_large_temp/available_indexes.txt
CommandException: 1 file/object could not be transferred.
processing data for epoch: 0
available_indexes: []
writing into dir: pretraining_data_1024_modified_large/0
./shards_train/shard_0000,./shards_train/shard_0003,./shards_train/shard_0006,./shards_train/shard_0002,./shards_train/shard_0005,./shards_train/shard_0004,./shards_train/shard_0001 pretraining_data_1024_modified_large/0/shard_0000.tfrecord,pretraining_data_1024_modified_large/0/shard_0003.tfrecord,pretraining_data_1024_modified_large/0/shard_0006.tfrecord,pretraining_data_1024_modified_large/0/shard_0002.tfrecord,pretraining_data_1024_modified_large/0/shard_0005.tfrecord,pretraining_data_1024_modified_large/0/shard_0004.tfrecord,pretraining_data_1024_modified_large/0/shard_0001.tfrecord
python3 mutformer/create_pretraining_data.py --input_file=./shards_train/shard_0000,./shards_train/shard_0003,./shards_tra