In [1]:
import tensorflow as tf
from tensorflow.keras import layers

# Let TensorFlow grow its GPU memory allocation on demand instead of reserving
# the whole device up front. This is the TF 2.x counterpart of the TF 1.x
# `ConfigProto().gpu_options.allow_growth = True` session option, and it must
# run before the first op initialises the GPU.
# NOTE(review): intended to address cuDNN initialisation failures such as
# "Fail to find the dnn implementation" -- confirm on the target machine.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPU runtime is already initialised (e.g. on a cell
        # re-run without restarting the kernel); report it and carry on.
        print(err)

  from ._conv import register_converters as _register_converters


In [2]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.0.0'

In [3]:
# Small 3x3 random tensor; its `.device` is inspected below to see where
# TensorFlow places tensors by default.
x = tf.random.uniform([3, 3])

In [4]:
# Report whether TensorFlow can see a GPU and whether `x` was placed on it.
# `end=""` keeps each label and its value on the same line (the former
# trailing comma was Python 2 syntax and has no such effect in Python 3).
print("Is there a GPU available: ", end="")
print(tf.test.is_gpu_available())

print("Is the Tensor on GPU #0:  ", end="")
print(x.device.endswith('GPU:0'))

print("Device name: {}".format((x.device)))

Is there a GPU available: 
True
Is the Tensor on GPU #0:  
True
Device name: /job:localhost/replica:0/task:0/device:GPU:0


In [5]:
import os
import sys
import sys
import os

# Add the path to system, local or mounted S3 bucket, e.g. /dbfs/mnt/<path_to_bucket>
sys.path.append(os.path.join(os.getcwd(),"bert"))
import numpy as np
import json
import nltk
import pandas as pd
import csv
import random
import logging
from collections import Counter
import pathlib
import pickle

import modeling, optimization, tokenization
from run_pretraining import input_fn_builder, model_fn_builder

from text_preprocessing import tokenizer_word
from language_model_processing import read_raw_data_preprocess_and_save, create_vocab_df
from bpe import create_token_vocabulary, get_stats, merge_vocab, Encoder

In [6]:
# Directory (relative to the working directory) holding the pickled vocabulary
# map and the generated BERT config file.
language_maps_dir = "models/base/master/language_maps/"

def save_obj(obj, directory, name):
    """Pickle `obj` to `<directory>/<name>.pkl`.

    Args:
        obj: Any picklable Python object.
        directory: Target directory, as a `str` or any path-like object.
        name: File name without the `.pkl` extension.
    """
    # os.path.join accepts both plain strings and pathlib.Path objects, unlike
    # the `/` operator, which fails with TypeError on a str directory.
    path = os.path.join(directory, "{}.pkl".format(name))
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name, directory):
    """Return the object stored in `<directory>/<name>.pkl`.

    Args:
        name: File name without the `.pkl` extension.
        directory: Directory containing the pickle file.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    pickle_path = os.path.join(directory, name + '.pkl')
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
      
# Load the pickled `vocab_to_id` mapping and report how many entries it has.
vocab_to_id = load_obj('vocab_to_id', str(language_maps_dir))
print('Vocab Size:', len(vocab_to_id))

import json

# Model hyper-parameters written out as the BERT config. Everything is a fixed
# literal except `vocab_size`, which follows the vocabulary loaded above.
bert_base_config = {
  "attention_probs_dropout_prob": 0.1, 
  "directionality": "bidi", 
  "hidden_act": "gelu", 
  "hidden_dropout_prob": 0.1, 
  "hidden_size": 768, 
  "initializer_range": 0.02, 
  "intermediate_size": 3072, 
  "max_position_embeddings": 512, 
  "num_attention_heads": 12, 
  "num_hidden_layers": 12, 
  "pooler_fc_size": 768, 
  "pooler_num_attention_heads": 12, 
  "pooler_num_fc_layers": 3, 
  "pooler_size_per_head": 128, 
  "pooler_type": "first_token_transform", 
  "type_vocab_size": 2, 
  "vocab_size": len(vocab_to_id)
}

# Persist the config as JSON inside the language-maps directory.
with open(os.path.join(language_maps_dir, 'bert_config.json'), 'w') as f:
    json.dump(bert_base_config, f)
    
print(bert_base_config)
####################################load_vocab

Vocab Size: 31503
{'attention_probs_dropout_prob': 0.1, 'directionality': 'bidi', 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 512, 'num_attention_heads': 12, 'num_hidden_layers': 12, 'pooler_fc_size': 768, 'pooler_num_attention_heads': 12, 'pooler_num_fc_layers': 3, 'pooler_size_per_head': 128, 'pooler_type': 'first_token_transform', 'type_vocab_size': 2, 'vocab_size': 31503}


In [7]:
import modeling, optimization, tokenization

# Smoke-test the tokenizer on a sample sentence containing accented
# characters, upper-case text and emoji, printing the input and its tokens.
testcase = "Olá isso é mais uma BAGUNCA 😂😂😂"
# NOTE(review): the tokenizer is constructed from the language-maps directory;
# presumably this project's FullTokenizer resolves the vocabulary from it --
# confirm against the local `tokenization` module.
bert_tokenizer = tokenization.FullTokenizer(language_maps_dir)
print(testcase)
print(bert_tokenizer.tokenize(testcase))

Olá isso é mais uma BAGUNCA 😂😂😂
['olá', 'isso', 'é', 'mais', 'uma', 'bagun', 'ca', '😂', '😂', '😂']


In [8]:
# Locations of the model checkpoints and of the pre-training TFRecord files.
model_weights_dir = "models/base/master/model_weights/"
pretraining_data_dir = 'models/base/master/pretraining_base_data'

# Build file paths with os.path.join so that a trailing separator on
# `language_maps_dir` does not produce a doubled "//" in the result.
VOCAB_FILE = os.path.join(language_maps_dir, 'vocab_file.csv')
CONFIG_FILE = os.path.join(language_maps_dir, 'bert_config.json')

# Path of the most recent checkpoint found in `model_weights_dir`.
INIT_CHECKPOINT = tf.train.latest_checkpoint(model_weights_dir)

# Number of examples per mini-batch used throughout the notebook.
MINI_BATCH_SIZE = 32

bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
# Every file in the pre-training directory whose name ends in "tfrecord".
input_files = tf.io.gfile.glob(os.path.join(pretraining_data_dir,'*tfrecord'))

In [9]:
def tf_record_to_memory(input_files,
                        max_seq_length = 128,
                        max_predictions_per_seq = 20,
                        num_cpu_threads = 4,
                        batch_size = MINI_BATCH_SIZE):
    """Build a batched `tf.data` pipeline over pre-training TFRecord files.

    Args:
        input_files: TFRecord file path(s) passed to `tf.data.TFRecordDataset`.
        max_seq_length: Fixed length of the per-token features.
        max_predictions_per_seq: Fixed length of the masked-LM features.
        num_cpu_threads: Parallelism used when parsing records.
        batch_size: Examples per batch; incomplete final batches are dropped.

    Returns:
        A dataset whose elements are dicts of batched feature tensors keyed by
        feature name.
    """
    def _fixed(length, dtype):
        # Fixed-length feature of shape [length] with the given dtype.
        return tf.io.FixedLenFeature([length], dtype)

    feature_spec = {
        "input_ids": _fixed(max_seq_length, tf.int64),
        "input_mask": _fixed(max_seq_length, tf.int64),
        "segment_ids": _fixed(max_seq_length, tf.int64),
        "masked_lm_positions": _fixed(max_predictions_per_seq, tf.int64),
        "masked_lm_ids": _fixed(max_predictions_per_seq, tf.int64),
        "masked_lm_weights": _fixed(max_predictions_per_seq, tf.float32),
        "next_sentence_labels": _fixed(1, tf.int64),
    }

    def _parse_record(serialized):
        # Decode one serialized example into a dict of tensors.
        return tf.io.parse_single_example(serialized, feature_spec)

    records = tf.data.TFRecordDataset(input_files)
    parsed = records.map(_parse_record, num_parallel_calls=num_cpu_threads)
    return parsed.batch(batch_size, drop_remainder=True)

def get_single_sentence_training_data(i, s, max_seq_length=128):
    """Turn one example into an (input sequence, target token) pair.

    Keeps the entries of `i` where `s` is 0, drops the first and the last of
    the kept entries, and right-pads the remainder with zeros.

    Args:
        i: Token ids of one example (assumes a 1-D tensor -- TODO confirm).
        s: Segment ids aligned with `i` (assumes values in {0, 1} -- TODO confirm).
        max_seq_length: Length the returned sequence is padded to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        A tuple `(x, y)`: `x` is a 1-D tensor of length `max_seq_length` and
        `y` is the second-to-last kept token id.
    """
    # `1-s` is non-zero exactly where the segment id is 0.
    # NOTE(review): positions past the real tokens presumably carry segment
    # id 0 as well, so they would be kept here -- confirm against the data.
    input_ids = tf.boolean_mask(
        i,
        1-s,
        axis=None,
        name='boolean_mask')
    kept = input_ids.shape[0]
    # Drop the first and the last kept entry.
    x = tf.slice(input_ids, [1], [kept-2], name='x')
    # Pad back up: (kept - 2) + (max_seq_length - kept + 2) == max_seq_length.
    x = tf.pad(x, [[0, max_seq_length-kept+2]], "CONSTANT", constant_values=0)
    # NOTE(review): input_ids[-2] is also the last unpadded element of `x`,
    # so the target is part of the input -- confirm this is intended.
    y = input_ids[-2]

    return x, y

# Batched dataset over all pre-training files, using the default lengths and
# batch size of tf_record_to_memory.
parsed_dataset = tf_record_to_memory(input_files)

# Get single mini batch to test

In [10]:
# Build X / Y from a single mini-batch as a quick end-to-end check.
# `parsed_dataset` already yields whole batches, so one element is one
# mini-batch: take(1) states that directly instead of taking MINI_BATCH_SIZE
# batches and breaking out after the first.
for bn, batch in enumerate(parsed_dataset.take(1)):
    X = []
    Y = []
    ips = batch['input_ids']
    sms = batch['segment_ids']
    # Iterating a batched tensor yields one example (row) at a time.
    for i, s in zip(ips, sms):
        x, y = get_single_sentence_training_data(i, s)
        X.append(x)
        Y.append(y)
    X = tf.stack(X)
    Y = tf.stack(Y)

X.shape

TensorShape([32, 128])

In [11]:
# Sequence model: token ids -> 64-d embeddings -> LSTM -> distribution over
# the vocabulary.
model = tf.keras.Sequential([
    # One 64-dimensional embedding vector per vocabulary entry.
    layers.Embedding(input_dim=len(vocab_to_id), output_dim=64),
    # LSTM with 128 internal units.
    layers.LSTM(128),
    # One softmax output unit per vocabulary entry.
    layers.Dense(len(vocab_to_id), activation='softmax'),
])

model.summary()

# Targets are integer token ids, hence the sparse categorical loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          2016192   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 31503)             4063887   
Total params: 6,178,895
Trainable params: 6,178,895
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Forward pass on the single mini-batch built above, as a smoke test.
# NOTE(review): the recorded run of this cell failed inside the CudnnRNN op;
# this looks like a GPU/cuDNN environment problem and not a model-definition
# error -- verify GPU memory settings and the cuDNN installation.
model(X)

UnknownError: Fail to find the dnn implementation. [Op:CudnnRNN]

In [None]:
# `parsed_dataset` is already batched by tf_record_to_memory, so it must not be
# batched a second time: doing so adds an extra leading dimension and the
# per-example helper below would then receive whole batches.
# repeat() cycles through the data indefinitely; take(MINI_BATCH_SIZE) limits
# the loop to that many mini-batches.
for bn, batch in enumerate(parsed_dataset.repeat().take(MINI_BATCH_SIZE)):
    X = []
    Y = []
    ips = batch['input_ids']
    sms = batch['segment_ids']
    # Iterating a batched tensor yields one example (row) at a time.
    for i, s in zip(ips, sms):
        x, y = get_single_sentence_training_data(i, s)
        X.append(x)
        Y.append(y)
    X = tf.stack(X)
    Y = tf.stack(Y)
    