# **A toy example for HW7 Bert QA**

If you have any questions, feel free to email us at ntu-ml-2021spring-ta@googlegroups.com

# Install transformers
Documentation for the toolkit:　https://huggingface.co/transformers/

In [None]:
!pip install transformers==4.5.0

# Import Packages

In [1]:
import torch
from transformers import AdamW, BertTokenizerFast, BertForQuestionAnswering

# Load Model and Tokenizer
A list of avaliable pretrained models: https://huggingface.co/models

In [2]:
# model_name can be either: models in huggingface model hub or models saved using save_pretrained
model_name = 'bert-base-chinese'
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [3]:
chi_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
eng_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading: 100%|██████████| 213k/213k [00:08<00:00, 23.8kB/s]
Downloading: 100%|██████████| 436k/436k [00:28<00:00, 15.1kB/s]
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.2kB/s]


# Tokenize

In [4]:
chi_paragraph = '李宏毅幾班大金。2021 ML'
tokens = chi_tokenizer.tokenize(chi_paragraph)
print(tokens)
chi_tokenizer.convert_tokens_to_ids(tokens)

['李', '宏', '毅', '幾', '班', '大', '金', '。', '2021', '[UNK]']


[3330, 2131, 3675, 2407, 4408, 1920, 7032, 511, 9960, 100]

In [5]:
eng_paragraph = 'Lee Hung-yi which class Daikin.'
tokens = eng_tokenizer.tokenize(eng_paragraph)
print(tokens)
eng_tokenizer.convert_tokens_to_ids(tokens)

['Lee', 'Hung', '-', 'y', '##i', 'which', 'class', 'Dai', '##kin', '.']


[2499, 26157, 118, 194, 1182, 1134, 1705, 23084, 4314, 119]

# Encode vs Decode

In [6]:
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'
encoded = chi_tokenizer.encode(question, paragraph)
decoded = chi_tokenizer.decode(encoded)
print(encoded)
print(decoded)

[101, 3330, 2131, 3675, 2407, 4408, 136, 102, 3330, 2131, 3675, 2407, 4408, 1920, 7032, 511, 102]
[CLS] 李 宏 毅 幾 班? [SEP] 李 宏 毅 幾 班 大 金 。 [SEP]


# Model Inputs

In [7]:
inputs = chi_tokenizer(question, paragraph, return_tensors='pt') # set return type as PyTorch
# Indices of input sequence tokens in the vocabulary
print('Input ids:      ', inputs['input_ids'])
# Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
print('Token type ids: ', inputs['token_type_ids'])
# Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
print('Attention mask: ', inputs['attention_mask'])

Input ids:       tensor([[ 101, 3330, 2131, 3675, 2407, 4408,  136,  102, 3330, 2131, 3675, 2407,
         4408, 1920, 7032,  511,  102]])
Token type ids:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Attention mask:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Testing (Chinese)

In [8]:
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'
inputs = chi_tokenizer(question, paragraph, return_tensors='pt')

with torch.no_grad():
    output = model(**inputs)
# output = model(input_ids=inputs['input_ids'], token_type_ids=inputs['token_type_ids'], attention_mask=inputs['attention_mask'])

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

start = torch.argmax(output.start_logits)
end = torch.argmax(output.end_logits)
print("start position: ", start.item())
print("end position:   ", end.item())

predict_id = inputs['input_ids'][0][start : end + 1]
print("predict_id:     ", predict_id)

predict_answer = chi_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

start_logits: 
tensor([[-0.6262, -0.7415, -0.7111, -0.6126, -0.6379, -0.6177, -0.9293, -0.5159,
         -0.7238, -0.6336, -0.4981, -0.5196, -0.4478, -0.7371, -1.0419, -0.5336,
         -0.5159]])
end_logits: 
tensor([[-0.0684,  0.7831,  0.3180,  0.5149,  0.6406, -0.1122,  0.7964, -0.2628,
          0.1568,  0.3487,  0.5829,  0.6321, -0.0010,  0.0307,  0.2376,  0.4377,
         -0.2628]])
start position:  12
end position:    6
predict_id:      tensor([], dtype=torch.int64)
predict_answer:  


# Training (Chinese)
For Question Answering, loss is the sum of cross entropy between the model prediction and correct answer

In [9]:
output = model(**inputs, start_positions=torch.tensor([13]), end_positions=torch.tensor([14]))
print("loss: ", output.loss)

optimizer = AdamW(model.parameters(), lr=1e-4)
output.loss.backward()
optimizer.step()

loss:  tensor(2.9323, grad_fn=<DivBackward0>)


# Testing (English)

In [10]:
question = "Why does Jeanie like Tom?"
paragraph = "Jeanie likes Tom because Tom is good at deep learning."
inputs = eng_tokenizer(question, paragraph, return_tensors='pt')

with torch.no_grad():
    output = model(**inputs)
# output = model(input_ids=inputs['input_ids'], token_type_ids=inputs['token_type_ids'], attention_mask=inputs['attention_mask'])

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

start = torch.argmax(output.start_logits)
end = torch.argmax(output.end_logits)
print("start position: ", start.item())
print("end position:   ", end.item())

predict_id = inputs['input_ids'][0][start : end + 1]
print("predict_id:     ", predict_id)

predict_answer = eng_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

start_logits: 
tensor([[-0.8070, -0.4859, -0.9686, -0.7980, -0.8325, -0.4310, -0.2878, -0.9313,
         -0.7036, -0.5790, -0.7574, -0.3716, -0.3959, -0.5793, -0.7177, -0.7736,
         -0.6535, -0.4270, -0.2888, -0.3633, -0.8827, -0.7036]])
end_logits: 
tensor([[-0.0328,  0.5448,  0.0546,  0.2758,  0.6124,  0.3178,  0.8662,  0.5064,
          0.0151,  0.2450,  0.4484,  0.2418,  0.6935,  0.2410,  0.7174,  0.6489,
          0.5701,  0.7711,  0.2106,  0.4943,  0.2235,  0.0151]])
start position:  6
end position:    6
predict_id:      tensor([2545])
predict_answer:  Tom


# Training (English)
For Question Answering, loss is the sum of cross entropy between the model prediction and correct answer

In [11]:
output = model(**inputs, start_positions=torch.tensor([14]), end_positions=torch.tensor([19]))
print("loss: ", output.loss)

optimizer = AdamW(model.parameters(), lr=1e-4)
output.loss.backward()
optimizer.step()

loss:  tensor(3.1155, grad_fn=<DivBackward0>)
