In [1]:
from transformers import BertTokenizer, BertForQuestionAnswering

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader, TensorDataset

import math
import string

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [3]:
from datasets import load_dataset

# 下载并加载SQuAD数据集
squad_dataset = load_dataset('squad')

train_squad = squad_dataset['train']
test_squad = squad_dataset['validation']

# 取出一部分数据
train_temp = train_squad[:1000]
test_temp = test_squad[:100]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenizer中标点符号算token，但是空格不算

train_encoded = tokenizer(text=train_squad['context'][:1000],
                          text_pair=train_squad['question'][:1000],
                          truncation=True, return_tensors='pt',
                          max_length=512,pad_to_max_length=True)

test_encoded = tokenizer(text = test_squad['context'][:100],
                         text_pair = test_squad['question'][:100],
                         truncation=True, return_tensors='pt',
                         max_length=512,pad_to_max_length=True)



In [6]:
def count_words(text):

    # 使用split()函数将文本分割成单词列表
    words = text.split()

    # 返回单词的数量
    return len(words)

def count_words_and_punctuations(idx,text):

    # 截取到指定索引位置的文本
    sub_text = text[:idx]

    # 使用split()函数将文本分割成单词列表
    words = sub_text.split()

    # 计算单词数量
    word_count = len(words)

    # 计算标点符号数量
    punctuation_count = sum(1 for char in sub_text if char in string.punctuation)

    return word_count+punctuation_count

In [9]:
train_start = [count_words_and_punctuations(train_temp['answers'][i]['answer_start'][0],
                      train_temp['context'][i]) for i in range(1000)]
train_end = [count_words_and_punctuations(train_temp['answers'][i]['answer_start'][0] + count_words(train_temp['answers'][i]['text'][0]),
                      train_temp['context'][i]) for i in range(1000)]

test_start = [count_words_and_punctuations(test_temp['answers'][i]['answer_start'][0],
                      test_temp['context'][i]) for i in range(100)]
test_end = [count_words_and_punctuations(test_temp['answers'][i]['answer_start'][0] + count_words(test_temp['answers'][i]['text'][0]),
                    test_temp['context'][i]) for i in range(100)]

## Dataset Dataloader

In [10]:
train_data = TensorDataset(train_encoded['input_ids'],
                     train_encoded['attention_mask'],
                     train_encoded['token_type_ids'],
                     torch.tensor(train_start),
                     torch.tensor(train_end))

test_data = TensorDataset(test_encoded['input_ids'],
                    test_encoded['attention_mask'],
                    test_encoded['token_type_ids'],
                    torch.tensor(test_start),
                    torch.tensor(test_end))

train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=True)

## Train

In [11]:
from tqdm.auto import tqdm

model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').cuda()
optim = torch.optim.Adam(model.parameters(), lr=0.0001)

for ep in tqdm(range(10)):
    total_loss = 0

    for b in tqdm(train_dataloader):
        # input_ids, attention_mask, token_type_ids, start, end
        iids, am, tids, s, e = b

        out = model(input_ids=iids.cuda(),
                    attention_mask=am.cuda(),
                    token_type_ids=tids.cuda(),
                    start_positions=s.cuda(),
                    end_positions = e.cuda())

        optim.zero_grad()
        out.loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()

        total_loss += out.loss.item()
    print(f'Epoch {ep+1}/{10}, Loss: {total_loss}')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 1/10, Loss: 611.3763415813446


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 2/10, Loss: 442.5435149669647


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 3/10, Loss: 279.03627145290375


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 4/10, Loss: 180.1372662782669


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 5/10, Loss: 120.60911577939987


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 6/10, Loss: 100.71551643684506


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 7/10, Loss: 80.9587772404775


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 8/10, Loss: 69.25622797105461


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 9/10, Loss: 54.28551309136674


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 10/10, Loss: 56.025272326078266


## Inference

In [15]:
# 问题和上下文
model.cpu()
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

# tokenize 问题和上下文
encoded_dict = tokenizer.encode_plus(text=text, text_pair=question,
                    truncation=True, return_tensors='pt',
                    max_length=512, pad_to_max_length=True)

with torch.no_grad():
    outputs = model(**encoded_dict)

# 预测答案的起始和结束位置
start_index = outputs.start_logits.argmax()
end_index = outputs.end_logits.argmax()

print(start_index, end_index)

# 使用tokenizer解码，以获取原始文本中的答案
predict_answer_tokens = encoded_dict.input_ids[0, start_index:end_index+1]
tokenizer.decode(predict_answer_tokens)



tensor(0) tensor(1)


'[CLS] jim'