In [1]:
%sh ls /dbfs/mnt
# Show what currently exists under /dbfs/mnt before the mount cell below runs.

In [2]:
# Mount drives
#
# Every container of the storage account is mounted at /mnt/<container>.
# The three mounts only differ by container name, so they are driven from a
# list instead of three copy-pasted calls.
storage_host = "ktbrdsdevstorage.blob.core.windows.net"
storage_containers = ["configandvocab", "modelweights", "pretrainingbasedata"]

# Read the storage account key once from the Databricks secret scope.
storage_account_key = dbutils.secrets.get(scope = "bert_pt_databricks_scope", key = "bertptkey")

# dbutils.fs.mount fails when the mount point is already in use, so collect the
# existing mount points first and skip them; this keeps the cell re-runnable.
existing_mount_points = {mount.mountPoint for mount in dbutils.fs.mounts()}

for container in storage_containers:
    mount_point = "/mnt/" + container
    if mount_point in existing_mount_points:
        print("{} is already mounted, skipping".format(mount_point))
        continue
    dbutils.fs.mount(
      source = "wasbs://{}@{}".format(container, storage_host),
      mount_point = mount_point,
      extra_configs = {"fs.azure.account.key." + storage_host: storage_account_key})

# List the mount directory from Python: a %sh magic is only valid as the first
# line of a cell, so it cannot follow the Python code above.
display(dbutils.fs.ls("/mnt"))

In [3]:
import os
import subprocess

# SECURITY: the GitHub password used to be hardcoded in the clone URL of this
# cell. That credential must be considered leaked and should be revoked.
# The token is now read from the Databricks secret scope.
# TODO confirm: a secret named "github_token" has to be created in the scope.
repo_dir = "/databricks/driver/bert_pt_azure"

# git clone refuses to write into an existing non-empty directory, so only
# clone when the checkout is not there yet (keeps the cell re-runnable).
if os.path.isdir(repo_dir):
    print("{} already exists, skipping clone".format(repo_dir))
else:
    github_token = dbutils.secrets.get(scope = "bert_pt_databricks_scope", key = "github_token")
    clone_url = "https://Usherwood:{}@github.com/Usherwood/bert_pt_azure.git".format(github_token)
    # Argument list (no shell) so the token is never interpreted by a shell.
    subprocess.check_call(["git", "clone", clone_url, repo_dir])

In [4]:
%sh ls -alh /databricks/driver/bert_pt_azure/
# Long listing (including hidden files) of the cloned repository directory.

In [5]:
import sys
import os

# Make the `bert` directory of the repository cloned on the driver importable.
# Presumably this is where modeling, tokenization, run_pretraining, etc. live
# (they are imported further down) - verify against the repository layout.
bert_source_dir = '/databricks/driver/bert_pt_azure/bert'

# Only append once so that re-running the cell does not pile up duplicates.
if bert_source_dir not in sys.path:
    sys.path.append(bert_source_dir)

In [6]:
%sh python --version
# Print the version of the `python` executable found on the shell PATH.

In [7]:
%sh pip uninstall -y tensorflow
# -y answers pip's confirmation prompt; a notebook shell cell has no
# interactive stdin, so without it the uninstall cannot complete.
# NOTE(review): the next cell imports tensorflow, so another TensorFlow build
# (presumably the GPU one) must be installed on the cluster - confirm.

In [8]:
import os
import sys
import numpy as np
import json
import nltk
import pandas as pd
import csv
import random
import logging
import tensorflow as tf
from collections import Counter
import pathlib
import pickle

import modeling, optimization, tokenization
from run_pretraining import input_fn_builder, model_fn_builder

from text_preprocessing import tokenizer_word
from language_model_processing import read_raw_data_preprocess_and_save, create_vocab_df
from bpe import create_token_vocabulary, get_stats, merge_vocab, Encoder

In [9]:
# Directory the vocabulary pickle is read from and the BERT config JSON is
# written to (path under /dbfs/mnt, see the `configandvocab` mount above).
language_maps_dir = "/dbfs/mnt/configandvocab"

def save_obj(obj, directory, name):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable Python object.
        directory: Target directory, either a ``str`` or a ``pathlib.Path``.
        name: File name without the ``.pkl`` extension.
    """
    # os.path.join accepts both str and Path, whereas the `/` operator only
    # works on Path objects and raised TypeError for the str paths used here.
    # It also mirrors how load_obj builds its path.
    target_path = os.path.join(directory, "{}.pkl".format(name))
    with open(target_path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, directory):
    """Return the object unpickled from ``<directory>/<name>.pkl``.

    Args:
        name: File name without the ``.pkl`` extension.
        directory: Directory that contains the pickle file.

    Returns:
        The deserialized Python object.
    """
    pickle_path = os.path.join(directory, name + '.pkl')
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded
      
      
# Read vocab_to_id.pkl from the language maps directory and show its size
# (the length is reused below as the model's vocab_size).
vocab_to_id = load_obj(name='vocab_to_id', directory=str(language_maps_dir))
len(vocab_to_id)

In [10]:
import modeling, optimization, tokenization

# Quick smoke test of the tokenizer: build it from the language maps
# directory, then print a sample sentence followed by its tokens.
bert_tokenizer = tokenization.FullTokenizer(language_maps_dir)
testcase = "Olá isso é mais uma BAGUNCA 😂😂😂"

print(testcase)
print(bert_tokenizer.tokenize(testcase))

In [11]:
import json

# Model hyper-parameters, serialized below as bert_config.json.
# vocab_size follows the size of the vocabulary mapping loaded earlier.
bert_base_config = dict(
    attention_probs_dropout_prob=0.1,
    directionality="bidi",
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=8,
    pooler_fc_size=768,
    pooler_num_attention_heads=12,
    pooler_num_fc_layers=3,
    pooler_size_per_head=128,
    pooler_type="first_token_transform",
    type_vocab_size=2,
    vocab_size=len(vocab_to_id),
)

# Persist the config next to the vocabulary so it can be read back from disk.
bert_config_path = os.path.join(language_maps_dir, 'bert_config.json')
with open(bert_config_path, 'w') as config_file:
    json.dump(bert_base_config, config_file)

print(bert_base_config)
####################################load_vocab

In [12]:
# Show the entries of the scratch model directory /dbfs/tmp/model.
# NOTE(review): training below writes to /dbfs/tmp/modelweights, not to this
# directory - confirm which one is intended.
os.listdir('/dbfs/tmp/model')

In [13]:
# One-off repair: give a leftover temporary `checkpoint.tmp...` file its final
# name `checkpoint`. The temporary file name is specific to a single past run,
# so the rename is guarded; otherwise this cell raises FileNotFoundError on
# any other run and stops a "Run All".
tmp_checkpoint_path = '/dbfs/tmp/model/checkpoint.tmp2feb8d7a932249e7ba1a11f96d3cb334'
if os.path.exists(tmp_checkpoint_path):
    os.rename(tmp_checkpoint_path, '/dbfs/tmp/model/checkpoint')
else:
    print('{} not found, nothing to rename'.format(tmp_checkpoint_path))

In [14]:
import shutil

# Start from an empty scratch directory: delete it if present, then recreate.
# WARNING: this irreversibly removes everything under /dbfs/tmp/model.
# The variable is no longer called `dir`, which shadowed the builtin dir()
# for every later cell of the notebook.
scratch_model_dir = '/dbfs/tmp/model'
if os.path.exists(scratch_model_dir):
    shutil.rmtree(scratch_model_dir)
# makedirs also creates missing parents (os.mkdir failed if /dbfs/tmp was absent).
os.makedirs(scratch_model_dir)

In [15]:
# ---- Hardware ---------------------------------------------------------------
USE_TPU = False

# ---- Input data pipeline ----------------------------------------------------
TRAIN_BATCH_SIZE = 64 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param

# ---- Training procedure -----------------------------------------------------
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 250 #@param {type:"integer"}

# ---- Locations --------------------------------------------------------------
# Checkpoints are written to / resumed from model_weights_dir; the training
# records are read from pretraining_data_dir.
model_weights_dir = '/dbfs/tmp/modelweights'
pretraining_data_dir = '/dbfs/mnt/pretrainingbasedata'

VOCAB_FILE = language_maps_dir + '/vocab_file.csv'
CONFIG_FILE = language_maps_dir + '/bert_config.json'

# ---- Objects derived from the settings above --------------------------------
# Path of the most recent checkpoint in model_weights_dir, if any.
INIT_CHECKPOINT = tf.train.latest_checkpoint(model_weights_dir)

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

# Every file in the data directory whose name ends in "tfrecord".
tfrecord_pattern = os.path.join(pretraining_data_dir,'*tfrecord')
input_files = tf.gfile.Glob(tfrecord_pattern)

In [16]:
import os
# Expose only the GPU with index 2 to CUDA for this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [17]:
from tensorflow.python.client import device_lib
# Print the devices reported by TensorFlow's device_lib for this machine.
local_devices = device_lib.list_local_devices()
print(local_devices)

In [18]:
import tensorflow as tf
# Report the GPU TensorFlow will use; an empty name means no GPU was found.
gpu_device_name = tf.test.gpu_device_name()
if not gpu_device_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_device_name))

In [19]:
# Model function for pre-training, built from the BERT config and, when one
# exists, the latest checkpoint found in model_weights_dir.
# use_one_hot_embeddings now follows USE_TPU instead of being hard-coded to
# True: upstream BERT ties the two together because one-hot embedding lookups
# are only faster on TPU, while tf.embedding_lookup is faster on CPU/GPU.
model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      learning_rate=LEARNING_RATE,
      num_train_steps=TRAIN_STEPS,
      num_warmup_steps=10,
      use_tpu=USE_TPU,
      use_one_hot_embeddings=USE_TPU)

# Checkpointing / logging policy: a checkpoint every SAVE_CHECKPOINTS_STEPS
# steps in model_weights_dir, keeping the 5 most recent plus one per hour.
run_config = tf.contrib.tpu.RunConfig(
    model_dir=model_weights_dir,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    keep_checkpoint_max=5,
    keep_checkpoint_every_n_hours=1,
    log_step_count_steps=100)

# TPUEstimator is constructed with use_tpu=USE_TPU (False in this notebook).
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

# Input function that reads the pre-training files in training mode.
train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=MAX_SEQ_LENGTH,
        max_predictions_per_seq=MAX_PREDICTIONS,
        is_training=True)

In [20]:
# Start pre-training: feed batches from train_input_fn and train up to
# TRAIN_STEPS steps (passed as max_steps). Long-running cell.
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)