In [6]:
# Toy English sentence. Punctuation is already separated by spaces, so a plain
# whitespace split yields one token per word or punctuation mark.
sentence = " ".join(
    ["hello", "everyone", ",", "today", "is", "a", "good", "day", "."]
)

In [7]:
# Toy token -> index vocabulary: the two sequence markers followed by every
# word and punctuation mark of the example sentence.
vocab = {
    '<SOS>': 0,
    '<EOS>': 1,  # key was 'EOS'; it has to match the '<EOS>' marker used on the sentence
    'hello': 2,
    'everyone': 3,
    'today': 4,
    'is': 5,
    'a': 6,
    'good': 7,
    'day': 8,
    ',': 9,
    '.': 10,
}

In [8]:
# Wrap the sentence in start/end-of-sequence markers, then split on whitespace.
# The guard keeps this cell idempotent: without it, every re-run of the cell
# would prepend/append one more pair of markers to `sentence`.
if not sentence.startswith('<SOS> '):
    sentence = '<SOS> ' + sentence + ' <EOS>'
words = sentence.split()
print(words)

['<SOS>', 'hello', 'everyone', ',', 'today', 'is', 'a', 'good', 'day', '.', '<EOS>']


In [9]:
# Display a shallow copy of the token list.
# NOTE(review): this is an identity copy; presumably a placeholder for mapping
# each word to its vocabulary index - confirm the intent.
list(words)

['<SOS>',
 'hello',
 'everyone',
 ',',
 'today',
 'is',
 'a',
 'good',
 'day',
 '.',
 '<EOS>']

In [11]:
# Load the tokenizer that ships with the bert-base-chinese checkpoint.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-chinese",   # model name or path
    force_download=False,  # whether to force a download
    cache_dir=None,        # cache location
)

In [29]:
# Sample Chinese text for the tokenizer demos: four short lines, the first two
# ending with a full-width comma.
sentences = [
    "你站在桥上看风景，",
    "看风景的人在楼上看你，",
    "明月装饰了你的窗子",
    "你装饰了别人的梦",
]

### Tokenizer

In [16]:
# Encode the first two lines as a single sentence pair. With
# return_tensors=None the result is a plain Python list of token ids, and
# truncation + padding='max_length' fix its length at max_length (25).
pair_encode_options = dict(
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=25,
    return_tensors=None,
)
output = tokenizer.encode(
    sentences[0],
    text_pair=sentences[1],
    **pair_encode_options,
)
print(output)

[101, 872, 4991, 1762, 3441, 677, 4692, 7599, 3250, 8024, 102, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 8024, 102, 0, 0]


In [23]:
# Turn the ids back into text, dropping the special tokens.
decoded_pair = tokenizer.decode(
    token_ids=output,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)
# Remove the spaces from the decoded string so the Chinese text reads
# continuously.
decoded_pair.replace(" ", "")

'你站在桥上看风景，看风景的人在楼上看你，'

In [28]:
# Encode the same sentence pair with encode_plus, which returns a dict-like
# object; the return_* flags request the extra fields alongside input_ids.
# NOTE: this rebinds `output`, which until now held the plain id list.
output = tokenizer.encode_plus(
    sentences[0],
    text_pair=sentences[1],
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=25,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
# One line per returned field.
for field_name, field_value in output.items():
    print(f'{field_name}: {field_value}')

input_ids: [101, 872, 4991, 1762, 3441, 677, 4692, 7599, 3250, 8024, 102, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 8024, 102, 0, 0]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
special_tokens_mask: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
length: 25


### 批量编码

In [36]:
# Sample lines for the batch demo.
sentences = [
    "你站在桥上看风景，",
    "看风景的人在楼上看你，",
    "明月装饰了你的窗子",
    "你装饰了别人的梦",
]

# Pair consecutive lines: (line 0, line 1) and (line 2, line 3).
text_pairs = list(zip(sentences[0::2], sentences[1::2]))

# Encode both sentence pairs in one call; every returned field holds one
# entry per pair. Each pair is padded/truncated to max_length (25).
outputs = tokenizer.batch_encode_plus(
    text_pairs,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=25,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)

# One line per returned field.
for field_name, field_value in outputs.items():
    print(f'{field_name}: {field_value}')

input_ids: [[101, 872, 4991, 1762, 3441, 677, 4692, 7599, 3250, 8024, 102, 4692, 7599, 3250, 4638, 782, 1762, 3517, 677, 4692, 872, 8024, 102, 0, 0], [101, 3209, 3299, 6163, 7652, 749, 872, 4638, 4970, 2094, 102, 872, 6163, 7652, 749, 1166, 782, 4638, 3457, 102, 0, 0, 0, 0, 0]]
token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]
special_tokens_mask: [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]
length: [23, 20]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]


### 获取字典

In [46]:
# Fetch the tokenizer's token -> id mapping and report how many entries it has.
# NOTE: this rebinds `vocab`, replacing the small hand-written dictionary
# defined near the top of the notebook.
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
print(vocab_size)
# The mapping itself is deliberately not printed; only its size is shown.

21128


### 新增词元编码

In [50]:
# Register three multi-character words as additional tokens, then register
# '[EOS]' as the tokenizer's end-of-sequence special token. The value of the
# last call is what the cell displays.
extra_words = ['明月', '装饰', '窗子']
tokenizer.add_tokens(extra_words)
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

1

In [51]:
# Print the id assigned to each newly added word.
# The vocabulary is fetched once up front: get_vocab() does not depend on the
# loop variable, so calling it on every iteration only repeated the same work.
current_vocab = tokenizer.get_vocab()
for word in ['明月', '装饰', '窗子']:
    print(current_vocab[word])

21128
21129
21130


In [54]:
# Encode a line that contains the newly added words and the '[EOS]' special
# token, then decode it again with special tokens skipped.
encode_text = tokenizer.encode_plus(
    '明月装饰了你的窗子[EOS]',
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=25,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)
encoded_ids = encode_text['input_ids']
decode_text = tokenizer.decode(encoded_ids, skip_special_tokens=True)
print(decode_text)

明月 装饰 了 你 的 窗子
