# **A toy example for HW7 Bert QA**

If you have any questions, feel free to email us at ntu-ml-2021spring-ta@googlegroups.com

# Install transformers
Documentation for the toolkit: https://huggingface.co/transformers/

In [1]:
!pip install transformers==4.5.0

Looking in indexes: https://pypi.doubanio.com/simple/


# Import Packages

In [2]:
import torch
from transformers import AdamW, BertTokenizerFast, BertForQuestionAnswering

# Load Model and Tokenizer
A list of available pretrained models: https://huggingface.co/models

In [3]:
# `model_name` may be either a model id from the Hugging Face model hub or the
# path of a directory that was written with `save_pretrained`.
model_name = 'bert-base-chinese'
# Builds a BERT model with a question-answering head from the checkpoint. The
# warning printed below this cell reports that some weights of
# BertForQuestionAnswering were not initialized from the checkpoint.
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [4]:
# One fast tokenizer per language, each loaded with the vocabulary of its own checkpoint.
chi_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
# NOTE(review): `model` is loaded from 'bert-base-chinese', while this tokenizer
# uses the 'bert-base-cased' vocabulary — confirm the ids it produces are
# meaningful for (and in range of) `model` before feeding them in.
eng_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize

In [5]:
# Sample text mixing Chinese characters, digits and Latin letters.
chi_paragraph = '李宏毅幾班大金。2021 ML'
# Split the raw string into vocabulary tokens (no special tokens are added here).
tokens = chi_tokenizer.tokenize(chi_paragraph)
# In the recorded output each Chinese character is its own token and 'ML' becomes '[UNK]'.
print(tokens)
# Last expression of the cell: the vocabulary ids of the tokens are displayed as the cell result.
chi_tokenizer.convert_tokens_to_ids(tokens)

['李', '宏', '毅', '幾', '班', '大', '金', '。', '2021', '[UNK]']


[3330, 2131, 3675, 2407, 4408, 1920, 7032, 511, 9960, 100]

In [6]:
# English counterpart of the Chinese sample sentence.
eng_paragraph = 'Lee Hung-yi which class Daikin.'
# Split the raw string into vocabulary tokens (no special tokens are added here).
tokens = eng_tokenizer.tokenize(eng_paragraph)
# In the recorded output, words are split into sub-word pieces; continuation
# pieces carry a '##' prefix (e.g. 'Dai', '##kin').
print(tokens)
# Last expression of the cell: the vocabulary ids of the tokens are displayed as the cell result.
eng_tokenizer.convert_tokens_to_ids(tokens)

['Lee', 'Hung', '-', 'y', '##i', 'which', 'class', 'Dai', '##kin', '.']


[2499, 26157, 118, 194, 1182, 1134, 1705, 23084, 4314, 119]

# Encode vs Decode

In [7]:
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'
# Encode the (question, paragraph) pair into one id sequence. In the recorded
# output it starts with 101 ([CLS]) and 102 ([SEP]) closes each of the two segments.
encoded = chi_tokenizer.encode(question, paragraph)
# Map the ids back to text; the special tokens stay visible in the decoded string.
decoded = chi_tokenizer.decode(encoded)
print(encoded)
print(decoded)

[101, 3330, 2131, 3675, 2407, 4408, 136, 102, 3330, 2131, 3675, 2407, 4408, 1920, 7032, 511, 102]
[CLS] 李 宏 毅 幾 班? [SEP] 李 宏 毅 幾 班 大 金 。 [SEP]


# Model Inputs

In [8]:
# Calling the tokenizer on the (question, paragraph) pair returns the fields
# printed below; return_tensors='pt' sets the return type to PyTorch tensors.
inputs = chi_tokenizer(question, paragraph, return_tensors='pt')

model_input_fields = (
    # Indices of input sequence tokens in the vocabulary
    ('Input ids:      ', 'input_ids'),
    # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
    ('Token type ids: ', 'token_type_ids'),
    # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
    ('Attention mask: ', 'attention_mask'),
)
for label, key in model_input_fields:
    print(label, inputs[key])

Input ids:       tensor([[ 101, 3330, 2131, 3675, 2407, 4408,  136,  102, 3330, 2131, 3675, 2407,
         4408, 1920, 7032,  511,  102]])
Token type ids:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Attention mask:  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Testing (Chinese)

In [14]:
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'
inputs = chi_tokenizer(question, paragraph, return_tensors='pt')

# Inference only, so the forward pass runs without gradient tracking.
# `**inputs` is shorthand for passing input_ids=..., token_type_ids=... and
# attention_mask=... explicitly.
with torch.no_grad():
    output = model(**inputs)

for title, logits in (("start_logits: ", output.start_logits),
                      ("end_logits: ", output.end_logits)):
    print(title)
    print(logits)

# The highest-scoring token index is taken as each end of the answer span.
start = output.start_logits.argmax()
end = output.end_logits.argmax()
print("start position: ", start.item())
print("end position:   ", end.item())

# The predicted end index is inclusive, hence the +1 when slicing.
answer_span = slice(start, end + 1)
predict_id = inputs['input_ids'][0][answer_span]
print("predict_id:     ", predict_id)

predict_answer = chi_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

start_logits: 
tensor([[-0.4426, -1.2055, -0.9898, -0.8487,  0.0632, -0.5500, -0.3370, -0.7374,
         -1.0589, -1.1773, -0.8922,  0.0775, -0.1733,  1.7117, -0.5467, -0.2955,
         -0.7374]])
end_logits: 
tensor([[-0.7634, -1.4479, -1.3643, -1.3158, -0.7851, -0.3256, -0.4606, -2.0774,
         -1.2740, -1.5332, -1.6222, -0.7891, -0.5642,  0.1027,  2.3907, -0.5236,
         -2.0774]])
start position:  13
end position:    14
predict_id:      tensor([1920, 7032])
predict_answer:  大 金


# Training (Chinese)
For question answering, the loss is the average of two cross-entropy terms: one between the predicted and the correct start position, and one between the predicted and the correct end position.

In [10]:
# NOTE: from_pretrained() returns the model in evaluation mode; a real training
# loop would call model.train() first (and model.eval() again before testing).
optimizer = AdamW(model.parameters(), lr=1e-4)

# backward() adds into the existing .grad of every parameter, so clear the
# gradients first. Without this, re-running the cell (or running another
# training cell) would step on the sum of stale and fresh gradients.
optimizer.zero_grad()

# Passing the correct start/end token indices makes the model return a loss.
# 13 and 14 are the positions of '大' and '金' in the Chinese `inputs`
# prepared in the testing cell.
output = model(**inputs, start_positions=torch.tensor([13]), end_positions=torch.tensor([14]))
print("loss: ", output.loss)

# One optimization step: back-propagate the loss, then update the weights.
output.loss.backward()
optimizer.step()

loss:  tensor(2.7629, grad_fn=<DivBackward0>)


# Testing (English)

In [36]:
question = "Why does Jeanie like Tom?"
paragraph = "Jeanie likes Tom because Tom is good at deep learning."
inputs = eng_tokenizer(question, paragraph, return_tensors='pt')

# `model` is whatever `model_name` loaded, whereas these ids come from the
# English tokenizer's vocabulary. An id outside the model's embedding table
# would otherwise fail deep inside the forward pass with an opaque index
# error, so check the range up front and say what to do about it.
# NOTE(review): even when the ids are in range, a checkpoint whose vocabulary
# differs from eng_tokenizer's maps them to unrelated embeddings — set
# `model_name` to a matching English checkpoint before relying on the answer.
largest_id = inputs['input_ids'].max().item()
if largest_id >= model.config.vocab_size:
    raise ValueError(
        f"token id {largest_id} is outside the model vocabulary "
        f"(size {model.config.vocab_size}); load a checkpoint that matches eng_tokenizer"
    )

# Inference only, so the forward pass runs without gradient tracking.
# `**inputs` is shorthand for passing input_ids=..., token_type_ids=... and
# attention_mask=... explicitly.
with torch.no_grad():
    output = model(**inputs)

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

# The highest-scoring token index is taken as each end of the answer span.
start = torch.argmax(output.start_logits)
end = torch.argmax(output.end_logits)
print("start position: ", start.item())
print("end position:   ", end.item())

# The predicted end index is inclusive, hence the +1 when slicing.
predict_id = inputs['input_ids'][0][start : end + 1]
print("predict_id:     ", predict_id)

predict_answer = eng_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

start_logits: 
tensor([[ 0.3904,  0.7138,  0.2942, -0.3142, -0.2484,  0.2123,  0.8086,  0.3086,
         -0.2774,  0.0656, -0.0531,  0.1067,  1.4656,  0.5646,  2.3540,  0.6178,
          0.1550, -0.2313,  0.2991, -0.4581, -0.4918, -0.2774]])
end_logits: 
tensor([[-1.5364, -1.3534, -1.6117, -2.0081, -1.7822, -1.8132, -1.0893, -1.2300,
         -2.0936, -1.7374, -1.5145, -1.4858, -1.4943, -1.4478, -1.1967, -0.8906,
         -0.6637, -0.0219,  1.5808,  2.4958, -0.9445, -2.0936]])
start position:  14
end position:    19
predict_id:      tensor([2545, 1110, 1363, 1120, 1996, 3776])
predict_answer:  Tom is good at deep learning


# Training (English)
For question answering, the loss is the average of two cross-entropy terms: one between the predicted and the correct start position, and one between the predicted and the correct end position.

In [12]:
# NOTE: from_pretrained() returns the model in evaluation mode; a real training
# loop would call model.train() first (and model.eval() again before testing).
optimizer = AdamW(model.parameters(), lr=1e-4)

# backward() adds into the existing .grad of every parameter, so clear the
# gradients first. Without this, gradients left over from an earlier training
# cell (or an earlier run of this one) would be summed into this step.
optimizer.zero_grad()

# Passing the correct start/end token indices makes the model return a loss.
# 14 and 19 delimit "Tom is good at deep learning" in the English `inputs`
# prepared in the testing cell.
output = model(**inputs, start_positions=torch.tensor([14]), end_positions=torch.tensor([19]))
print("loss: ", output.loss)

# One optimization step: back-propagate the loss, then update the weights.
output.loss.backward()
optimizer.step()

loss:  tensor(2.9497, grad_fn=<DivBackward0>)
