In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import json
from pathlib import Path
from transformers import DistilBertTokenizerFast
import torch
from transformers import DistilBertForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import AdamW
import numpy as np

In [3]:
!mkdir squad

In [4]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2023-10-22 14:03:53--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2023-10-22 14:03:55 (161 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2023-10-22 14:03:55--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2023-10-22 14:03:55 (68.9 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



In [5]:
def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers


train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

**Tokenize and positional encoding**

In [6]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx - 1:end_idx - 1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1  # When the gold label is off by one character
        elif context[start_idx - 2:end_idx - 2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2  # When the gold label is off by two characters


add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

**Dataloader**

In [7]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

**Model - Distil BERT**

In [8]:
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [9]:
optim = AdamW(model.parameters(), lr=5e-5)



**Train**

In [10]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
total_step = len(train_loader)

In [18]:
EPOCH = 5
for epoch in range(EPOCH):
    for i,batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        if i % 200 == 0:
          print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                  .format(epoch, EPOCH, i, total_step, loss.item(), np.exp(loss.item())))

Epoch [0/5], Step [0/2895], Loss: 0.7113, Perplexity: 2.0367
Epoch [0/5], Step [200/2895], Loss: 0.3378, Perplexity: 1.4018
Epoch [0/5], Step [400/2895], Loss: 0.2058, Perplexity: 1.2285
Epoch [0/5], Step [600/2895], Loss: 0.6182, Perplexity: 1.8555
Epoch [0/5], Step [800/2895], Loss: 0.8222, Perplexity: 2.2754
Epoch [0/5], Step [1000/2895], Loss: 0.3513, Perplexity: 1.4210
Epoch [0/5], Step [1200/2895], Loss: 0.3956, Perplexity: 1.4853
Epoch [0/5], Step [1400/2895], Loss: 0.5230, Perplexity: 1.6871
Epoch [0/5], Step [1600/2895], Loss: 0.7108, Perplexity: 2.0355
Epoch [0/5], Step [1800/2895], Loss: 0.8917, Perplexity: 2.4392
Epoch [0/5], Step [2000/2895], Loss: 0.5101, Perplexity: 1.6654
Epoch [0/5], Step [2200/2895], Loss: 0.7632, Perplexity: 2.1451
Epoch [0/5], Step [2400/2895], Loss: 0.2355, Perplexity: 1.2656
Epoch [0/5], Step [2600/2895], Loss: 0.5974, Perplexity: 1.8174
Epoch [0/5], Step [2800/2895], Loss: 0.7580, Perplexity: 2.1340
Epoch [1/5], Step [0/2895], Loss: 0.1148, Perpl

KeyboardInterrupt: ignored

**Test**

In [23]:
text = "Asia is the largest and most populated continent. It has nearly one-third of the world’s total land area and " \
       "is home to more than half of Earth’s people. It also has impressive geographical features. It has Earth's " \
       "highest point is Mount Everest and lowest point is the Dead Sea. Asia also includes some of the world’s " \
       "wettest, driest, hottest, and coldest places. The continent was the home of the great early civilizations of " \
       "Mesopotamia and the Indus River valley. The world’s major religions Buddhism, Christianity, Hinduism, Islam, " \
       "and Judaism—all began in Asia as well. Today, though many people are farmers and live in small villages, " \
       "Asia also has enormous cities, including some of the world’s largest: Beijing, China; Tokyo, Japan; Seoul, " \
       "South Korea; and Delhi, India. "

ques = "what is the earths highest point?"

encodings = tokenizer.encode_plus(ques, text)

inputIds, attentionMask = encodings["input_ids"], encodings["attention_mask"]

start_scores = model(input_ids=torch.tensor([inputIds]).to(device))[0]
end_scores = model(input_ids=torch.tensor([inputIds]).to(device))[1]

tokens = inputIds[torch.argmax(start_scores): torch.argmax(end_scores) + 1]

answerTokens = tokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=True)

ans = tokenizer.convert_tokens_to_string(answerTokens)

print(ans)

what is the earths highest point? asia is the largest and most populated continent. it has nearly one - third of the world ’ s total land area and is home to more than half of earth ’ s people. it also has impressive geographical features. it has earth ' s highest point is mount everest
