![pipeline](image.png)


Pipeline: combine context + question -> model -> predict two positions, the start and the end of the answer span inside the context.

### 1. Data

In [15]:
# Toy extractive-QA corpus. Each record holds a context passage, a question
# about it, and the answer, which appears verbatim inside the context.
# Keys are kept in context / question / answer order.
qa_dataset = [
    dict(
        context="My name is AIVN and I am from Vietnam.",
        question="What is my name?",
        answer="AIVN",
    ),
    dict(
        context="I love painting and my favorite artist is Vincent Van Gogh.",
        question="What is my favorite activity?",
        answer="painting",
    ),
    dict(
        context="I am studying computer science at the University of Tokyo.",
        question="What am I studying?",
        answer="computer science",
    ),
    dict(
        context='My favorite book is "To Kill a Mockingbird" by Harper Lee.',
        question="What is my favorite book?",
        answer='"To Kill a Mockingbird"',
    ),
]

### 2. Vectorization

In [18]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


# Shared tokenizer used by every cell that turns raw text into tokens.
tokenier = get_tokenizer("basic_english")


def yield_tokens(examples: list):
    """Yield one token list per example, for vocabulary building.

    Each example's context and question are joined with a " <sep> " marker
    and tokenized together, so every word of both fields is seen.

    Args:
        examples: Records with "context" and "question" string fields.

    Yields:
        The token list produced by ``tokenier`` for each example.
    """
    for record in examples:
        joined = " <sep> ".join((record["context"], record["question"]))
        yield tokenier(joined)


# Build the token -> id mapping from every context/question in the corpus.
# The special tokens are listed explicitly so they are part of the vocabulary.
vocab = build_vocab_from_iterator(
    yield_tokens(qa_dataset),
    specials=["<unk>", "<pad>", "<bos>", "<eos>", "<sep>"],
)

# Any token that is not in the vocabulary is looked up as <unk>.
vocab.set_default_index(vocab["<unk>"])

# Last expression of the cell: displays the token -> id mapping.
vocab.get_stoi()

{'vincent': 40,
 'vietnam': 39,
 'university': 37,
 'to': 35,
 'the': 34,
 'painting': 32,
 'of': 31,
 'mockingbird': 30,
 'am': 12,
 'what': 11,
 '<sep>': 4,
 '<bos>': 2,
 'science': 33,
 '?': 8,
 'my': 6,
 'is': 5,
 'at': 21,
 'gogh': 25,
 'love': 29,
 'lee': 28,
 '.': 7,
 '<eos>': 3,
 '<pad>': 1,
 'computer': 23,
 'artist': 20,
 'favorite': 9,
 'harper': 26,
 '<unk>': 0,
 'and': 13,
 'studying': 16,
 'i': 10,
 'aivn': 19,
 'van': 38,
 'book': 14,
 'tokyo': 36,
 'name': 15,
 'kill': 27,
 'by': 22,
 'a': 17,
 'from': 24,
 'activity': 18}

In [19]:
# Id of the "<pad>" special token; pad_and_truncate uses it to right-pad
# sequences that are shorter than the target length.
PAD_IDX = vocab["<pad>"]

def pad_and_truncate(input_ids: list[int], max_seq_len: int, pad_idx: "int | None" = None):
    """Return a new list of token ids that is exactly ``max_seq_len`` long.

    Sequences longer than ``max_seq_len`` are cut on the right; shorter ones
    are padded on the right. The input list is never modified.

    Args:
        input_ids: Token ids to normalize.
        max_seq_len: Target length; must not be negative.
        pad_idx: Id used for padding. When omitted, the module-level
            ``PAD_IDX`` is used.

    Returns:
        A list containing ``max_seq_len`` ids.

    Raises:
        ValueError: If ``max_seq_len`` is negative. A negative value would
            otherwise slice from the right and silently return a sequence of
            the wrong length.
    """
    if max_seq_len < 0:
        raise ValueError(f"max_seq_len must be non-negative, got {max_seq_len}")

    missing = max_seq_len - len(input_ids)
    if missing <= 0:
        # Already long enough: keep only the first max_seq_len ids.
        return input_ids[:max_seq_len]

    # Resolve the default only when padding is actually needed.
    fill = PAD_IDX if pad_idx is None else pad_idx
    return input_ids + [fill] * missing


# Fixed length that every model input is padded or truncated to.
MAX_SEQ_LEN = 22

# Quick check of the text -> ids -> padded ids path on a short sentence.
text = "I love AIVN"
tokenized_text = tokenier(text)
tokens = [vocab[piece] for piece in tokenized_text]
print(tokens)  # raw ids, one per token

tokens = pad_and_truncate(tokens, MAX_SEQ_LEN)
print(tokens)  # same ids, right-padded up to MAX_SEQ_LEN

[10, 29, 19]
[10, 29, 19, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [26]:
import torch


def vectorize(question: str, context: str, answer: str):
    """Encode one QA example as model input ids plus answer-span labels.

    The model input is ``question <sep> context``: tokenized, mapped through
    ``vocab`` and padded/truncated to ``MAX_SEQ_LEN``. The answer span is
    located by matching the *whole* answer token sequence inside the context
    part only (everything after the first ``<sep>``). Matching just the first
    answer token over the entire input could return a position inside the
    question, or the start of a partial match.

    Args:
        question: Question text.
        context: Passage that contains the answer.
        answer: Answer text; its tokens must occur contiguously in ``context``.

    Returns:
        A tuple ``(input_ids, st_pos, end_pos)`` of long tensors: the padded
        input ids, and the inclusive start / end token indices of the answer
        within ``input_ids``.

    Raises:
        ValueError: If the answer has no tokens, if the separator was cut off
            by truncation, or if the answer does not occur in the (possibly
            truncated) context.
    """
    input_text = question + " <sep> " + context
    input_ids = [vocab[token] for token in tokenier(input_text)]
    input_ids = pad_and_truncate(input_ids=input_ids, max_seq_len=MAX_SEQ_LEN)

    answer_ids = [vocab[token] for token in tokenier(answer)]
    if not answer_ids:
        raise ValueError("answer produced no tokens")

    sep_id = vocab["<sep>"]
    if sep_id not in input_ids:
        raise ValueError("question is too long: <sep> was truncated away")
    # The context starts right after the first separator.
    context_start = input_ids.index(sep_id) + 1

    span_len = len(answer_ids)
    st_pos = next(
        (
            start
            for start in range(context_start, len(input_ids) - span_len + 1)
            if input_ids[start:start + span_len] == answer_ids
        ),
        None,
    )
    if st_pos is None:
        raise ValueError(
            f"answer {answer!r} not found in the first {MAX_SEQ_LEN} input tokens"
        )
    end_pos = st_pos + span_len - 1  # inclusive end index

    input_ids = torch.tensor(input_ids, dtype=torch.long)
    st_pos = torch.tensor(st_pos, dtype=torch.long)
    end_pos = torch.tensor(end_pos, dtype=torch.long)
    return input_ids, st_pos, end_pos

# Vectorize the first example and show the ids together with the span labels.
first_example = qa_dataset[0]
input_ids, st_pos, end_pos = vectorize(
    question=first_example["question"],
    context=first_example["context"],
    answer=first_example["answer"],
)
print(input_ids, st_pos, end_pos, sep="\n")

tensor([11,  5,  6, 15,  8,  4,  6, 15,  5, 19, 13, 10, 12, 24, 39,  7,  1,  1,
         1,  1,  1,  1])
tensor(9)
tensor(9)


In [43]:
# Reverse lookup (id -> token), built by flipping the vocabulary mapping.
stoi = vocab.get_stoi()
id2token = dict(zip(stoi.values(), stoi.keys()))

# Decode the most recently vectorized input back to text, padding included.
for token_id in input_ids.numpy():
    word = id2token[token_id]
    print(word, end=' ')

what is my name ? <sep> my name is aivn and i am from vietnam . <pad> <pad> <pad> <pad> <pad> <pad> 

### 3. Create dataset

In [44]:

from torch.utils.data import Dataset

class QADataset(Dataset):
    """Map-style dataset that vectorizes QA records on access.

    Each record is a dict with "question", "context" and "answer" fields.
    Indexing returns whatever ``vectorize`` produces for that record:
    ``(input_ids, st_pos, end_pos)``.
    """

    def __init__(self, data: list[dict]):
        # Raw records are stored as given; encoding happens in __getitem__.
        self.data = data

    def __len__(self):
        """Return the number of QA records."""
        return len(self.data)

    def __getitem__(self, index):
        """Vectorize and return the record at ``index``."""
        record = self.data[index]
        input_ids, st_pos, end_pos = vectorize(
            question=record['question'],
            context=record['context'],
            answer=record['answer'],
        )
        return input_ids, st_pos, end_pos


In [56]:
from torch.utils.data import DataLoader


# Wrap the toy corpus and serve it one example at a time in random order.
train_dataset = QADataset(qa_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Inspect one pass over the data: each batch holds the input ids and the
# start / end labels, with a leading batch dimension of 1.
for input_ids, st_pos, end_pos in train_dataloader:
    print(input_ids, st_pos, end_pos, sep="\n")
    print("=" * 100)

tensor([[11,  5,  6,  9, 14,  8,  4,  6,  9, 14,  5, 35, 27, 17, 30, 22, 26, 28,
          7,  1,  1,  1]])
tensor([11])
tensor([14])
tensor([[11, 12, 10, 16,  8,  4, 10, 12, 16, 23, 33, 21, 34, 37, 31, 36,  7,  1,
          1,  1,  1,  1]])
tensor([9])
tensor([10])
tensor([[11,  5,  6, 15,  8,  4,  6, 15,  5, 19, 13, 10, 12, 24, 39,  7,  1,  1,
          1,  1,  1,  1]])
tensor([9])
tensor([9])
tensor([[11,  5,  6,  9, 18,  8,  4, 10, 29, 32, 13,  6,  9, 20,  5, 40, 38, 25,
          7,  1,  1,  1]])
tensor([9])
tensor([9])


### 4. Model

In [54]:
import torch.nn as nn 

class QAModel(nn.Module):
    """Bidirectional-LSTM span extractor.

    Token ids are embedded and run through a multi-layer bidirectional LSTM.
    Two independent linear heads then score every position, one as the answer
    start and one as the answer end.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        n_classes: Output size of each head per token. With 1, the trailing
            dimension is squeezed away and each head returns one logit per
            position.
    """

    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 hidden_size: int,
                 n_layers: int,
                 n_classes: int):
        super().__init__()
        self.embed_model = nn.Embedding(vocab_size, embed_dim)
        self.model = nn.LSTM(
            embed_dim,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward states are concatenated, hence 2 * hidden_size.
        lstm_out_dim = 2 * hidden_size
        self.st_linear = nn.Linear(lstm_out_dim, n_classes)
        self.end_linear = nn.Linear(lstm_out_dim, n_classes)

    def forward(self, input_text):
        """Score every input position as answer start and answer end.

        Args:
            input_text: Token ids of shape [N, max_seq_len].

        Returns:
            ``(st_logits, end_logits)``. Each is [N, max_seq_len] when
            ``n_classes == 1``; otherwise the squeeze is a no-op and each is
            [N, max_seq_len, n_classes].
        """
        embedded = self.embed_model(input_text)  # [N, max_seq_len, embed_dim]

        # Only the per-token outputs are needed; the final (hidden, cell)
        # states are discarded.
        token_states, _ = self.model(embedded)  # [N, max_seq_len, hidden_size * 2]

        st_logits = self.st_linear(token_states).squeeze(-1)
        end_logits = self.end_linear(token_states).squeeze(-1)
        return st_logits, end_logits
    


In [55]:
# Model hyperparameters.
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS = 2
N_CLASSES = 1  # one logit per token per head; QAModel squeezes this dim away

model = QAModel(vocab_size=VOCAB_SIZE,
                embed_dim=EMBEDDING_DIM,
                hidden_size=HIDDEN_SIZE,
                n_layers=N_LAYERS,
                n_classes=N_CLASSES)

# Smoke test on random token ids. torch.randint's `high` bound is exclusive,
# so high=1 would only ever produce zeros; sample from the whole vocabulary
# so that the embedding lookup is exercised with varied ids.
input_text = torch.randint(low=0, high=VOCAB_SIZE, size=(1, MAX_SEQ_LEN))

model.eval()
with torch.no_grad():
    st_logits, end_logits = model(input_text)
    # Both heads should return one logit per position: [1, MAX_SEQ_LEN].
    print(st_logits.shape)
    print(end_logits.shape)


torch.Size([1, 22])
torch.Size([1, 22])


### 5. Training

In [57]:
# Optimization setup.
LR = 1e-3
EPOCHS = 20
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# Each sequence position is treated as a class: the logits are
# [N, max_seq_len] and the targets are the start / end indices.
criterion = nn.CrossEntropyLoss()

model.train()
for _ in range(EPOCHS):
    for input_ids, st_pos, end_pos in train_dataloader:
        st_pos_logits, end_pos_logits = model(input_ids)

        # Average the start-position and end-position losses.
        st_loss = criterion(st_pos_logits, st_pos)
        end_loss = criterion(end_pos_logits, end_pos)
        loss = (st_loss + end_loss) / 2

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(loss.item())
        
        

3.082472324371338
3.1034016609191895
3.062800407409668
3.010685920715332
2.948223114013672
2.8977251052856445
2.8654823303222656
2.7436347007751465
2.7099392414093018
2.582576274871826
2.45509672164917
2.4918534755706787
2.2118425369262695
2.0925960540771484
1.881577968597412
1.9772119522094727
1.8046197891235352
1.4406275749206543
1.2628209590911865
1.3313201665878296
1.1346583366394043
1.0563921928405762
0.9186200499534607
1.1796045303344727
0.6512924432754517
0.5703752040863037
0.48697802424430847
0.9063836336135864
0.3084314465522766
0.7481429576873779
0.3281988203525543
0.2846468985080719
0.4591107964515686
0.1299721747636795
0.10012119263410568
0.21611736714839935
0.06005662679672241
0.18729552626609802
0.10631665587425232
0.03636067733168602
0.12047520279884338
0.022033020853996277
0.07097271084785461
0.016704833135008812
0.03928197920322418
0.015969855710864067
0.01482012216001749
0.06606951355934143
0.026252303272485733
0.010089986026287079
0.025559009984135628
0.0058606984093

In [None]:
# Inference on a single example: predict the answer span and decode it.
model.eval()
with torch.no_grad():
    sample = qa_dataset[0]
    # Read the fields by key instead of relying on dict value order.
    context = sample["context"]
    question = sample["question"]
    answer = sample["answer"]

    # vectorize returns (input_ids, st_pos, end_pos); only the ids are fed
    # to the model, the gold positions are not needed for prediction.
    input_ids, _, _ = vectorize(question=question, context=context, answer=answer)
    input_ids = input_ids.unsqueeze(0)  # add batch dimension

    st_logits, end_logits = model(input_ids)

    # The predicted span boundaries are the highest-scoring positions of
    # each head (argmax over the sequence dimension of the logits).
    st_pos = torch.argmax(st_logits, dim=1).item()
    end_pos = torch.argmax(end_logits, dim=1).item()

    # Decode the inclusive [st_pos, end_pos] span back to text. If the model
    # predicts an end before the start, the slice is empty and so is the
    # prediction.
    predicted_ids = input_ids[0, st_pos:end_pos + 1].tolist()
    prediction = " ".join(id2token[token_id] for token_id in predicted_ids)

    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Prediction: {prediction}")
    