Note: To generate data for several models in parallel, run a separate copy of this notebook in each of several VMs.

# Configure settings

In [None]:
#@markdown ### General Config
# Sequence length: selects the sequences_<N>.txt input files and is passed as --max_seq_length.
MAX_SEQ_LENGTH =  1024#@param {type:"integer"}
# Passed to the data-generation script as --masked_lm_prob.
MASKED_LM_PROB = 0.15 #@param
# Passed as --max_predictions_per_seq.
# NOTE(review): 0.15 * 1024 is far above 20; if the script caps masked positions at
# this value (as stock BERT does), the effective masking rate is much lower than
# MASKED_LM_PROB -- confirm this is intended.
MAX_PREDICTIONS = 20 #@param {type:"integer"}
# Passed as --do_lower_case.
DO_LOWER_CASE = False #@param {type:"boolean"}
# Not referenced by any of the cells visible in this notebook section.
PROCESSES = 2 #@param {type:"integer"}
# GCS bucket that generated data is uploaded to (gs://<BUCKET_NAME>).
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
# Local directory the vocab file is written into.
MODEL_DIR = "bert_model_modified_large" #@param {type:"string"}
# Output directory names for the training / test / eval TFRecords.
PRETRAINING_DIR = "pretraining_data_1024" #@param {type:"string"}
TESTING_DIR = "testing_data_1024" #@param {type:"string"}
EVAL_DIR = "eval_data_1024" #@param {type:"string"}
# File name of the vocabulary inside MODEL_DIR.
VOC_FNAME = "vocab.txt" #@param {type:"string"}

In [None]:
from google.colab import drive
# Start from a clean state: drop any existing mount, then mount Drive again.
# When Drive is not mounted the first two steps only print a message (see the
# cell output) and the mount still proceeds.
!fusermount -u /content/drive
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)

fusermount: failed to unmount /content/drive: No such file or directory
Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


# Clone the repo

In [None]:
REPO_DESTINATION_PATH = "/content/drive/My Drive/mutformer" #@param {type:"string"}

# SECURITY: credentials must never be written into the notebook. The token is
# read from the GITHUB_TOKEN environment variable or prompted for at run time.
# Any token that was ever committed in an earlier version of this cell must be
# treated as leaked and revoked on GitHub.
import getpass
import os
import subprocess

REPO_HOST_PATH = "github.com/WGLab/mutformer.git"

if os.path.isdir(REPO_DESTINATION_PATH) and os.listdir(REPO_DESTINATION_PATH):
  # git refuses to clone into a non-empty directory, so skip instead of failing.
  print("Destination already exists and is not empty, skipping clone:", REPO_DESTINATION_PATH)
else:
  github_token = os.environ.get("GITHUB_TOKEN") or getpass.getpass(
      "GitHub token (leave blank for an anonymous clone): ")
  credentials = github_token + "@" if github_token else ""
  # Argument list instead of a shell string: the destination contains a space
  # and nothing here should be interpreted by a shell.
  clone = subprocess.run(
      ["git", "clone", "https://" + credentials + REPO_HOST_PATH, REPO_DESTINATION_PATH],
      stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
  clone_output = clone.stdout or ""
  if github_token:
    # Never echo the secret back into the saved cell output.
    clone_output = clone_output.replace(github_token, "***")
  print(clone_output)
  if clone.returncode != 0:
    # Deliberately not check=True: CalledProcessError would embed the URL
    # (including the token) in the traceback.
    raise RuntimeError("git clone failed with exit code {}".format(clone.returncode))
  if github_token:
    # Do not leave the token stored in .git/config on Drive.
    subprocess.run(
        ["git", "-C", REPO_DESTINATION_PATH, "remote", "set-url", "origin",
         "https://" + REPO_HOST_PATH],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)

fatal: destination path '/content/drive/My Drive/mutformer' already exists and is not an empty directory.


# Imports

In [None]:
%tensorflow_version 1.x
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import shutil
import time

# Record when this session started (strftime defaults to the current local time).
print(time.strftime('%Y-%m-%d %H:%M:%S'))

from glob import glob
from google.colab import auth, drive
from tensorflow.keras.utils import Progbar

# Refresh the local working copy of the model code from the cloned repo so the
# imports below always see the latest version on Drive.
try:
  # Follow the location chosen in the clone cell instead of a hard-coded path.
  repo_root = REPO_DESTINATION_PATH
except NameError:
  # Clone cell not run in this session: fall back to its default location.
  repo_root = "/content/drive/My Drive/mutformer"
model_code_src = os.path.join(repo_root, "mutformer_model_code")

# Always start from a fresh copy; copytree requires the target not to exist.
if os.path.exists("mutformer"):
  shutil.rmtree("mutformer")
shutil.copytree(model_code_src,"mutformer")

# Keep exactly one "mutformer" entry at the end of sys.path across re-runs.
if "mutformer" in sys.path:
  sys.path.remove("mutformer")
sys.path.append("mutformer")

from mutformer import modeling, optimization, tokenization
from mutformer.modeling import BertModel,BertModelModified
from mutformer.run_pretraining import input_fn_builder, model_fn_builder


# Authenticate the Colab user; the later gsutil uploads to the bucket run
# under these credentials.
print("Authorize for GCS:")
auth.authenticate_user()
print("Authorize done")
  
# configure logging
def attach_console_handler(logger, formatter, handler_name="mutformer_notebook_console"):
  """Attach one INFO-level console handler to `logger`, replacing any earlier copy.

  Re-running the cell used to stack an additional StreamHandler each time, so
  every log record was printed once per run of the cell. The handler is tagged
  with `handler_name` so a previous instance can be found and removed first.

  Args:
    logger: the logging.Logger to attach the handler to.
    formatter: the logging.Formatter applied to the handler.
    handler_name: tag used to recognise handlers installed by this function.

  Returns:
    The newly attached logging.StreamHandler.
  """
  for stale_handler in [h for h in logger.handlers
                        if getattr(h, "name", None) == handler_name]:
    logger.removeHandler(stale_handler)
  handler = logging.StreamHandler()
  handler.set_name(handler_name)
  handler.setLevel(logging.INFO)
  handler.setFormatter(formatter)
  logger.addHandler(handler)
  return handler

log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# create formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = attach_console_handler(log, formatter)


2021-08-31 22:35:04
Authorize for GCS:
Authorize done


# Copy data from drive

In [None]:
data_folder = "BERT pretraining/data" #@param {type: "string"}

# All three inputs share the same Drive prefix and differ only in their suffix.
sequences_prefix = "/content/drive/My Drive/"+data_folder+"/sequences_"+str(MAX_SEQ_LENGTH)
data_path_train = sequences_prefix+".txt"
data_path_eval = sequences_prefix+"_eval.txt"
data_path_test = sequences_prefix+"_test.txt"

DATA_FPATH_train = "dataset_train.txt"
DATA_FPATH_eval = "dataset_eval.txt"
DATA_FPATH_test = "dataset_test.txt"

# Clear out local copies left over from a previous run.
for stale_copy in (DATA_FPATH_train, DATA_FPATH_eval, DATA_FPATH_test):
  if os.path.exists(stale_copy):
    os.remove(stale_copy)

# Pull fresh copies from Drive into the working directory.
shutil.copy(data_path_train,DATA_FPATH_train)
shutil.copy(data_path_eval,DATA_FPATH_eval)
shutil.copy(data_path_test,DATA_FPATH_test)

'dataset_test.txt'

## Data preparation

**To access a bucket, open https://console.cloud.google.com/storage/browser/BUCKET_NAME in a browser, replacing `BUCKET_NAME` with the name of the bucket.**



In [None]:
DATA_FPATH_train = "dataset_train.txt"
DATA_FPATH_eval = "dataset_eval.txt"
DATA_FPATH_test = "dataset_test.txt"

if os .path.exists("./shards_train"):
  shutil.rmtree("./shards_train")
chunk_size = int(256000/(MAX_SEQ_LENGTH/100))
!mkdir ./shards_train
!split -a 4 -l $chunk_size -d $DATA_FPATH_train ./shards_train/shard_
!ls ./shards_train/

if os .path.exists("./shards_eval"):
  shutil.rmtree("./shards_eval")
!mkdir ./shards_eval
!split -a 4 -l $chunk_size -d $DATA_FPATH_eval ./shards_eval/shard_
!ls ./shards_eval/

if os .path.exists("./shards_test"):
  shutil.rmtree("./shards_test")
!mkdir ./shards_test
!split -a 4 -l $chunk_size -d $DATA_FPATH_test ./shards_test/shard_
!ls ./shards_test/

import json
# Vocabulary: the five special tokens followed by one single-letter token per line.
vocab = '''[PAD]
[UNK]
[CLS]
[SEP]
[MASK]
L
S
B
J
E
A
P
T
G
V
K
R
D
Q
I
N
F
H
Y
C
M
W'''

vocab_fpath = "{}/{}".format(MODEL_DIR, VOC_FNAME)

# The vocab file lives inside the model directory, so make sure that exists first.
if not os.path.exists(MODEL_DIR):
  os.mkdir(MODEL_DIR)

# One token per line, every line newline-terminated.
with open(vocab_fpath, "w") as vocab_file:
  vocab_file.writelines(token+"\n" for token in vocab.split("\n"))


def generate_data(input_dir,dir):
  seed = random.randrange(sys.maxsize)
  train_files = ",".join([input_dir+"/"+file for file in os.listdir(input_dir)])
  out_files = ",".join([dir+"/"+file+".tfrecord" for file in os.listdir(input_dir)])
  print(train_files,out_files)
  XARGS_CMD = ("python3 bert/create_pretraining_data.py "
              "--input_file={} "
              "--output_file={} "
              "--vocab_file={} "
              "--do_lower_case={} "
              "--max_predictions_per_seq={} "
              "--max_seq_length={} "
              "--masked_lm_prob={} "
              "--random_seed={} "
              "--dupe_factor=1")

  XARGS_CMD = XARGS_CMD.format(train_files, out_files, 
                              os.path.join(MODEL_DIR,VOC_FNAME), DO_LOWER_CASE, 
                              MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB,seed)
  print(XARGS_CMD)
  if os.path.exists(dir):
    shutil.rmtree(dir)
  os.mkdir(dir)

  
  !$XARGS_CMD
    

shard_0000  shard_0002	shard_0004  shard_0006
shard_0001  shard_0003	shard_0005
shard_0000
shard_0000


# Eval Data Generation

In [None]:
generate_data("./shards_eval",EVAL_DIR)
cmd="gsutil -m cp -r "+EVAL_DIR +" gs://"+BUCKET_NAME
!{cmd}

./shards_eval/shard_0000 eval_data_1024/shard_0000.tfrecord
python3 bert/create_pretraining_data.py --input_file=./shards_eval/shard_0000 --output_file=eval_data_1024/shard_0000.tfrecord --vocab_file=bert_model_modified_large/vocab.txt --do_lower_case=False --max_predictions_per_seq=20 --max_seq_length=1024 --masked_lm_prob=0.15 --random_seed=4092375970891346155 --dupe_factor=1


W0831 22:36:09.682124 140152098056064 module_wrapper.py:139] From bert/create_pretraining_data.py:378: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0831 22:36:09.682308 140152098056064 module_wrapper.py:139] From bert/create_pretraining_data.py:378: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0831 22:36:09.682471 140152098056064 module_wrapper.py:139] From /content/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0831 22:36:09.683685 140152098056064

# Testing Data Generation

In [None]:
generate_data("./shards_test",TESTING_DIR)
cmd="gsutil -m cp -r "+TESTING_DIR +" gs://"+BUCKET_NAME
!{cmd}

./shards_test/shard_0000 testing_data_1024/shard_0000.tfrecord
python3 bert/create_pretraining_data.py --input_file=./shards_test/shard_0000 --output_file=testing_data_1024/shard_0000.tfrecord --vocab_file=bert_model_modified_large/vocab.txt --do_lower_case=False --max_predictions_per_seq=20 --max_seq_length=1024 --masked_lm_prob=0.15 --random_seed=878342669880606997 --dupe_factor=1


W0831 22:36:24.619751 140200509106048 module_wrapper.py:139] From bert/create_pretraining_data.py:378: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0831 22:36:24.620009 140200509106048 module_wrapper.py:139] From bert/create_pretraining_data.py:378: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0831 22:36:24.620231 140200509106048 module_wrapper.py:139] From /content/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0831 22:36:24.620923 1402005091

# Constant Parallel Training Data Generation

In [None]:
import time
runs = ["modified_large"]

while True:
  for i in range(0,20): ##20 is the chosen bin size, it is the maximum number of copies of data that will be kept at any point in time
    for run in runs:
      print("RUN:",run)
      if not os.path.exists(PRETRAINING_DIR+ "_"+ run):
        os.mkdir(PRETRAINING_DIR+ "_"+run)
      else:
        shutil.rmtree(PRETRAINING_DIR+ "_"+run)
        os.mkdir(PRETRAINING_DIR+ "_"+run)

      print("processing data for epoch:",i)
      if not os.path.exists("/content/drive/My Drive/"+PRETRAINING_DIR+"_"+run+"_available_indexes.txt"):
        available_indexes = open("/content/drive/My Drive/"+PRETRAINING_DIR+"_"+run+"_available_indexes.txt","w+").read().split("\n")[:-1]
      else:
        available_indexes = open("/content/drive/My Drive/"+PRETRAINING_DIR+"_"+run+"_available_indexes.txt").read().split("\n")[:-1]
      print("available_indexes:",available_indexes)
      if str(i) not in available_indexes:
        directory = PRETRAINING_DIR+"_"+run+"/"+str(i)
        print("writing into dir:",directory)
        generate_data("./shards_train",directory)
        cmd="gsutil -m cp -r "+PRETRAINING_DIR+"_"+run+ " gs://"+BUCKET_NAME
        !{cmd}
        open("/content/drive/My Drive/"+PRETRAINING_DIR+"_"+run+"_available_indexes.txt","a").write(str(i)+"\n")
  time.sleep(1200)

RUN: modified_large
processing data for epoch: 0
available_indexes: ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '5', '6', '7', '8', '9', '3', '4', '2', '0', '1']
RUN: modified_large
processing data for epoch: 1
available_indexes: ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '5', '6', '7', '8', '9', '3', '4', '2', '0', '1']
RUN: modified_large
processing data for epoch: 2
available_indexes: ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '5', '6', '7', '8', '9', '3', '4', '2', '0', '1']
RUN: modified_large
processing data for epoch: 3
available_indexes: ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '5', '6', '7', '8', '9', '3', '4', '2', '0', '1']
RUN: modified_large
processing data for epoch: 4
available_indexes: ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '5', '6', '7', '8', '9', '3', '4', '2', '0', '1']
RUN: modified_large
processing data for epoch: 5
available_indexes: ['10', '11', '12', '13', '14', '15', 