![pipeline](image.png)


combine embedding của question và context -> model -> phân loại trong các câu trả lời 

### 1. Data

In [1]:
qa_dataset = [
    {
        'context': 'My name is AIVN and I am from Vietnam.',
        'question': 'What is my name?',
        'answer': 'AIVN'
    },
    {
        'context': 'I love painting and my favorite artist is Vincent Van Gogh.',
        'question': 'What is my favorite activity?',
        'answer': 'painting'
    },
    {
        'context': 'I am studying computer science at the University of Tokyo.',
        'question': 'What am I studying?',
        'answer': 'computer science'
    }
]

### 2. Vectorization

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


tokenier = get_tokenizer("basic_english")

def yield_tokens(examples: list):
    for item in examples:
        yield tokenier(item["context"] + " " + item["question"])


vocab = build_vocab_from_iterator(
    iterator=yield_tokens(qa_dataset), 
    specials=["<unk>", "<pad>", "<bos>", "<eos>", "<sep>"]
)
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()



{'vincent': 32,
 'vietnam': 31,
 'van': 30,
 'university': 29,
 'what': 11,
 '<sep>': 4,
 '<bos>': 2,
 'of': 24,
 'am': 10,
 'my': 7,
 'is': 6,
 'at': 19,
 '.': 8,
 'gogh': 22,
 '<eos>': 3,
 '<pad>': 1,
 'computer': 20,
 'painting': 25,
 'and': 12,
 '<unk>': 0,
 'artist': 18,
 'favorite': 13,
 'studying': 15,
 'i': 5,
 'aivn': 17,
 'tokyo': 28,
 'name': 14,
 'activity': 16,
 'from': 21,
 'love': 23,
 '?': 9,
 'science': 26,
 'the': 27}

In [3]:
classes = set([items["answer"] for items in qa_dataset])
id2label = {idx : label for idx, label in enumerate(classes)}
label2id = {label: id for id, label in id2label.items()}
id2label, label2id

({0: 'painting', 1: 'AIVN', 2: 'computer science'},
 {'painting': 0, 'AIVN': 1, 'computer science': 2})

In [4]:
PAD_IDX = vocab["<pad>"]

def pad_and_truncate(input_ids: list[int], max_seq_len: int):
    if len(input_ids) > max_seq_len:
        input_ids = input_ids[:max_seq_len]
    else:
        input_ids = input_ids + [PAD_IDX] * (max_seq_len - len(input_ids))
    return input_ids
MAX_SEQ_LEN = 5
text = "I love AIVN"
tokenized_text = tokenier(text)
tokens = [vocab[token] for token in tokenized_text]
print(tokens)
tokens = pad_and_truncate(input_ids=tokens, max_seq_len=MAX_SEQ_LEN)
print(tokens)

[5, 23, 17]
[5, 23, 17, 1, 1]


In [5]:
import torch

MAX_SEQ_LEN = 10
MAX_CONTEXT_LEN = 15


def vectorize(question: str, context: str):
    input_ques_ids = [vocab[token] for token in tokenier(question)]
    input_context_ids = [vocab[token] for token in tokenier(context)]

    input_ques_ids = pad_and_truncate(input_ids=input_ques_ids, max_seq_len=MAX_SEQ_LEN)
    input_context_ids = pad_and_truncate(input_ids=input_context_ids, max_seq_len=MAX_CONTEXT_LEN)

    return (
        torch.tensor(input_ques_ids, dtype=torch.long), 
        torch.tensor(input_context_ids, dtype=torch.long)
    )
input_ques_ids, input_context_ids = vectorize(
    question=qa_dataset[0]['question'],
    context=qa_dataset[0]['context']
)

print(input_ques_ids)
print(input_context_ids)

tensor([11,  6,  7, 14,  9,  1,  1,  1,  1,  1])
tensor([ 7, 14,  6, 17, 12,  5, 10, 21, 31,  8,  1,  1,  1,  1,  1])


### 3. Create datasets

In [6]:

from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, data: list[dict]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data[index]
        question_text = item['question']
        context_text = item['context']

        input_ques_ids, input_context_ids = vectorize(
            question=question_text,
            context=context_text
        )

        answer_text = item['answer']
        answer_id =torch.tensor(label2id[answer_text], dtype=torch.long)

        return input_ques_ids, input_context_ids, answer_id

In [7]:
from torch.utils.data import DataLoader


train_dataset = QADataset(data=qa_dataset)
train_dataloader = DataLoader(dataset=train_dataset, 
                              batch_size=1, 
                              shuffle=True)

for batch in train_dataloader:
    input_ques_ids, input_context_ids, answer_id = batch
    print(input_ques_ids)
    print(input_context_ids)
    print(answer_id)
    print("=" * 100)

tensor([[11, 10,  5, 15,  9,  1,  1,  1,  1,  1]])
tensor([[ 5, 10, 15, 20, 26, 19, 27, 29, 24, 28,  8,  1,  1,  1,  1]])
tensor([2])
tensor([[11,  6,  7, 13, 16,  9,  1,  1,  1,  1]])
tensor([[ 5, 23, 25, 12,  7, 13, 18,  6, 32, 30, 22,  8,  1,  1,  1]])
tensor([0])
tensor([[11,  6,  7, 14,  9,  1,  1,  1,  1,  1]])
tensor([[ 7, 14,  6, 17, 12,  5, 10, 21, 31,  8,  1,  1,  1,  1,  1]])
tensor([1])


### 4. Model

In [57]:
import torch.nn as nn 

class QAModel(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, hidden_size: int, n_layers: int, n_classes: int):
        super().__init__()
        self.embed_model = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim
        )

        self.model = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size, 
            batch_first=True, 
            bidirectional=True,
            num_layers=n_layers,
        )

        self.classifier = nn.Linear(in_features=hidden_size*2,
                                    out_features=n_classes)
    
    def forward(self, question, context): 
        # question: [N, seq_ques], context: [N, seq_context]
        question_embed = self.embed_model(question) # [N, seq_ques, embed_dim]
        context_embed = self.embed_model(context) # [N, seq_context, embed_dim]

        combined = torch.cat(
            tensors=(question_embed, context_embed), dim=1
        ) # [N, seq_ques + seq_context, embed_dim]
        lstm_output, (hidden_lstm, cell_lstm) = self.model(combined)
        # lstm_output: [N, seq_ques + seq_context, hidden_size * 2]
        # hidden_lstm: [num_layers, N, hidden_size]
        # cell_lstm: [num_layers, N, hidden_size]
        lstm_output = lstm_output[:, -1, :]
        
        out = self.classifier(lstm_output)
        return out
    


In [58]:
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS  = 2
N_CLASSES = len(classes)

model =QAModel(vocab_size=VOCAB_SIZE, 
               embed_dim=EMBEDDING_DIM,
               hidden_size=HIDDEN_SIZE,
               n_layers=N_LAYERS,
               n_classes= N_CLASSES)

input_ques =torch.randint(low=0, high=1, size=(1, MAX_SEQ_LEN))
input_context = torch.randint(low=0, high=1, size=(1, MAX_CONTEXT_LEN))

model.eval()
with torch.no_grad():
    logits = model(input_ques, input_context)
    print(logits.shape)

torch.Size([1, 3])


### 5. Training model

In [59]:
LR = 1e-3
EPOCHS = 20
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

criterion = nn.CrossEntropyLoss()

model.train()
for _ in range(EPOCHS):
    for idx, (input_ques_ids, input_context_ids, answer_id) in enumerate(train_dataloader):
        optimizer.zero_grad()

        outputs= model(input_ques_ids, input_context_ids)
        loss = criterion(outputs, answer_id)
        loss.backward()
        optimizer.step()

        print(loss.item())
        
        

1.14091157913208
1.1665637493133545
1.238621711730957
0.9888670444488525
1.2195442914962769
1.1216247081756592
1.1719011068344116
1.1135239601135254
1.0356508493423462
1.0421650409698486
1.0850640535354614
1.1281003952026367
1.0375242233276367
1.054024577140808
1.1255751848220825
1.021980881690979
1.0402880907058716
1.1116539239883423
1.0889383554458618
0.9556953310966492
1.050256371498108
0.9735899567604065
0.8948920369148254
1.044875144958496
1.028849482536316
0.7558454275131226
0.7407063245773315
0.9343756437301636
0.6113422513008118
0.4608971178531647
0.8252317309379578
0.30520883202552795
0.3908999264240265
0.3339079022407532
0.142010897397995
0.5078392624855042
0.40806397795677185
0.14799711108207703
0.052975017577409744
0.04200468957424164
0.08921055495738983
0.2290087640285492
0.04931109771132469
0.018480265513062477
0.07119274139404297
0.054729729890823364
0.0223334189504385
0.009882924146950245
0.027761150151491165
0.007564708590507507
0.01261010579764843
0.01116174552589655


In [60]:
model.eval()
with torch.no_grad():
    sample = qa_dataset[0]
    context, question, answer = sample.values()
    question_ids, context_ids = vectorize(question=question, context=context)
    question_ids = question_ids.unsqueeze(0) # add batch dimention
    context_ids = context_ids.unsqueeze(0)

    outputs = model(question_ids, context_ids)

    _, predictions = torch.max(outputs.data, 1)
    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Prediction: {id2label[predictions.numpy()[0]]}")
    

Context: My name is AIVN and I am from Vietnam.
Question: What is my name?
Prediction: AIVN
