In [24]:
from transformers import BertTokenizer

# Load the tokenizer published under the 'bert-base-uncased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode two sentences together as a single pair; the result is padded or
# truncated so that every field holds exactly `max_length` entries.
text1 = 'I am learning AI'
text2 = 'It is interesting'
bert_input = tokenizer(
    text1,
    text2,
    padding='max_length',
    max_length=15,
    truncation=True,
)

# One output line per field, in this order:
#   input_ids      - vocabulary indices of the tokens
#   token_type_ids - which sentence of the pair a position belongs to
#   attention_mask - 1 for real tokens, 0 for padding
print(
    bert_input['input_ids'],
    bert_input['token_type_ids'],
    bert_input['attention_mask'],
    sep='\n',
)

[101, 1045, 2572, 4083, 9932, 102, 2009, 2003, 5875, 102, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [32]:
from transformers import BertTokenizer

# Load the tokenizer published under the 'bert-base-uncased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode a single sentence; the result is padded or truncated so that every
# field holds exactly `max_length` entries.
text = 'Learning AI is interesting'
bert_input = tokenizer(
    text,
    padding='max_length',
    max_length=10,
    truncation=True,
)

# One output line per field, in this order:
#   input_ids      - vocabulary indices of the tokens
#   token_type_ids - sequence membership (all zeros for a single sentence)
#   attention_mask - 1 for real tokens, 0 for padding
print(
    bert_input['input_ids'],
    bert_input['token_type_ids'],
    bert_input['attention_mask'],
    sep='\n',
)

[101, 4083, 9932, 2003, 5875, 102, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


In [None]:
def convert_text_to_feature(review, tokenizer, max_length):
    """Tokenize one piece of text into fixed-length model inputs.

    Args:
        review: The text to encode, as a single ``str``.
        tokenizer: A callable Hugging Face tokenizer (for example the
            ``BertTokenizer`` instance created earlier in this notebook).
        max_length: Exact length of every returned sequence; shorter inputs
            are padded and longer ones are truncated.

    Returns:
        The tokenizer's encoding, which contains ``input_ids``,
        ``token_type_ids`` and ``attention_mask``.
    """
    # Call the tokenizer directly (the same form used by the demo cells in
    # this notebook) instead of the deprecated ``encode_plus`` method.
    return tokenizer(
        review,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        # Requested explicitly: encode_text reads 'token_type_ids' from the
        # result, and not every tokenizer emits that field by default.
        return_token_type_ids=True,
        return_attention_mask=True,
    )

def map_feature_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the three encoded fields into a feature dict paired with its label.

    Args:
        input_ids: Token indices for one example.
        attention_masks: Attention mask for the same example.
        token_type_ids: Token type ids for the same example.
        label: Target value, returned unchanged.

    Returns:
        A ``(features, label)`` tuple where ``features`` has the keys
        ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    # Note the parameter is plural (`attention_masks`) while the emitted key
    # is the singular "attention_mask".
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

def encode_text(ds, tokenizer, max_length):
    """Tokenize every example of ``ds`` and rebuild it as a feature dataset.

    Args:
        ds: Iterable yielding ``(review, label)`` pairs whose items support
            ``[0]`` indexing followed by ``.numpy()``.
        tokenizer: Tokenizer forwarded to ``convert_text_to_feature``.
        max_length: Sequence length forwarded to ``convert_text_to_feature``.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``({"input_ids", "token_type_ids", "attention_mask"}, label)``
        as produced by ``map_feature_to_dict``.
    """
    ids_rows, segment_rows, mask_rows, label_rows = [], [], [], []

    for review, label in ds:
        # NOTE(review): only index 0 of `review` and `label` is read, so this
        # presumably expects elements with a leading axis of size 1 - confirm
        # against how `ds` is built.
        raw_text = review[0].numpy().decode()
        encoded = convert_text_to_feature(raw_text, tokenizer, max_length)

        ids_rows.append(encoded['input_ids'])
        segment_rows.append(encoded['token_type_ids'])
        mask_rows.append(encoded['attention_mask'])
        # Each label is wrapped in a one-element list.
        label_rows.append([label[0].numpy()])

    # The tuple order must match map_feature_to_dict's positional parameters:
    # (input_ids, attention_masks, token_type_ids, label).
    columns = (ids_rows, mask_rows, segment_rows, label_rows)
    return tf.data.Dataset.from_tensor_slices(columns).map(map_feature_to_dict)
                
                
                