# A short note on how to use pre-trained models from Hugging Face

### Tokenizer

- The tokenizer can encode a single sentence or a whole batch of sentences in one call

In [12]:
from transformers import AutoTokenizer

In [4]:
# Load the tokenizer that belongs to the 'bert-base-chinese' checkpoint.
# On first use the required files are downloaded (see the progress bars below).
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

Downloading: 100%|██████████| 624/624 [00:00<00:00, 626kB/s]
Downloading: 100%|██████████| 110k/110k [00:00<00:00, 147kB/s] 
Downloading: 100%|██████████| 269k/269k [00:01<00:00, 265kB/s]


In [9]:
# Encode one sentence and show every field the tokenizer returns
# (input_ids, token_type_ids, attention_mask).
input_tokens = tokenizer("我就想回到雀豪就不打了")
for field_name, field_value in input_tokens.items():
    print(field_name, " : ", field_value)

input_ids  :  [101, 2769, 2218, 2682, 1726, 1168, 7411, 6498, 2218, 679, 2802, 749, 102]
token_type_ids  :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask  :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [10]:
# Turn the ids back into text; the result shows the [CLS] / [SEP] markers
# that were added around the sentence during encoding.
tokenizer.decode(input_tokens['input_ids'])

'[CLS] 我 就 想 回 到 雀 豪 就 不 打 了 [SEP]'

In [19]:
# Encode several sentences at once. padding=True pads to the longest
# sentence in the batch and return_tensors="pt" yields PyTorch tensors.
batch_sentences = [
    "只遇到了复读机和饭",
    "第二个好像是",
    "我才把连斩用了",
    "19层被团灭",
]
encoded_batch_inputs = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
for field_name, field_value in encoded_batch_inputs.items():
    print(field_name, " : ", field_value)

input_ids  :  tensor([[ 101, 1372, 6878, 1168,  749, 1908, 6438, 3322, 1469, 7649,  102],
        [ 101, 5018,  753,  702, 1962, 1008, 3221,  102,    0,    0,    0],
        [ 101, 2769, 2798, 2828, 6825, 3168, 4500,  749,  102,    0,    0],
        [ 101, 8131, 2231, 6158, 1730, 4127,  102,    0,    0,    0,    0]])
token_type_ids  :  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask  :  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])


In [20]:
# Encode a sentence pair: passing two texts makes the tokenizer join them
# into one sequence and mark the second segment in token_type_ids.
qes_input = "哪个是咕咕"
ans_input = "人类的本质"
encoded_qa = tokenizer(qes_input, ans_input)
for field_name, field_value in encoded_qa.items():
    print(field_name, " : ", field_value)

input_ids  :  [101, 1525, 702, 3221, 1475, 1475, 102, 782, 5102, 4638, 3315, 6574, 102]
token_type_ids  :  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
attention_mask  :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [22]:
# Decode the pair encoding; the two inputs come back separated by [SEP].
tokenizer.decode(encoded_qa['input_ids'])

'[CLS] 哪 个 是 咕 咕 [SEP] 人 类 的 本 质 [SEP]'

In [25]:
# Encode a batch of sentence pairs, padding every row to a fixed
# length of 20 tokens and returning PyTorch tensors.
qes_batch = [
    "哪个是咕咕",
    "她微博回应了写啥",
]
ans_batch = [
    "人类的本质",
    "就阿巴阿巴阿巴",
]
encoded_batch_qa = tokenizer(
    qes_batch,
    ans_batch,
    padding='max_length',
    max_length=20,
    truncation=True,
    return_tensors="pt",
)
for field_name, field_value in encoded_batch_qa.items():
    print(field_name, " : ", field_value)

input_ids  :  tensor([[ 101, 1525,  702, 3221, 1475, 1475,  102,  782, 5102, 4638, 3315, 6574,
          102,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 1961, 2544, 1300, 1726, 2418,  749, 1091, 1567,  102, 2218, 7350,
         2349, 7350, 2349, 7350, 2349,  102,    0,    0]])
token_type_ids  :  tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
attention_mask  :  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


### How to load a pre-trained model and use it

In [1]:
import torch
from transformers import BertForSequenceClassification, AdamW
from transformers import BertTokenizer
from torch.nn import functional as F 

In [None]:
# Load the pre-trained BERT weights into a sequence-classification model.
# NOTE(review): no num_labels is given; the 0/1 labels used later imply a
# two-class head — presumably the library default, confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
# Put the model in training mode before the optimization step below.
model.train()

In [None]:
# Build the optimizer with two parameter groups: biases and LayerNorm
# weights get no weight decay, every other parameter is decayed with 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
# transformers.AdamW is deprecated in favour of the PyTorch implementation,
# so use torch.optim.AdamW (torch is already imported above).
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-5)

In [2]:
# Load the BERT tokenizer for the same checkpoint the model was loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Toy training batch: six sentences with one binary class label each.
batch_sentences = [
    "只遇到了复读机和饭",
    "第二个好像是",
    "我才把连斩用了",
    "我们进门挂机1分钟然后直奔领奖台",
    "钓鱼执法是吧",
    "19层被团灭",
]
# Class-index targets for cross-entropy must be an integer (long) tensor;
# torch.Tensor([...]) would create float32 values and make the loss call fail.
labels = torch.tensor([1, 0, 0, 1, 0, 0], dtype=torch.long)

# Encode the training sentences as fixed-length (20 token) PyTorch tensors.
encodings = tokenizer(
    batch_sentences,
    padding='max_length',
    max_length=20,
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Run a single optimization step on the toy batch.
input_ids = encodings['input_ids']
input_mask = encodings['attention_mask']

# Gradients accumulate across backward() calls, so clear them first;
# otherwise re-running this cell mixes in gradients from earlier runs.
optimizer.zero_grad()

outputs = model(input_ids, attention_mask=input_mask)
# The loss is computed by hand from the logits because no labels were passed
# to the model. Class-index targets must be integer typed, hence .long().
loss = F.cross_entropy(outputs.logits, labels.long())
loss.backward()
optimizer.step()

In [None]:
# Save the fine-tuned model to disk, then load it back from that directory.
model.save_pretrained('./my_mrpc_model/')
# The directory was written by the PyTorch model above, so it is loaded as a
# regular PyTorch checkpoint; from_tf=True is only for TensorFlow checkpoints
# and would fail here.
pytorch_model = BertForSequenceClassification.from_pretrained('./my_mrpc_model/')