# BERT

In [2]:
# import necessary library
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import wikipediaapi 
import re
import time
import datasets


  from .autonotebook import tqdm as notebook_tqdm


# Task1

## 1. Data

In [3]:
# en for English
wiki = wikipediaapi.Wikipedia('A4 (st124482@ait.asia)','en')
# load Liverpool F.C. content
raw_text = wiki.page('Liverpool F.C.')
raw_text = raw_text.text
# page.text

## 2. Preprocessing

### Tokenization and numericalization

In [4]:
# clean data
raw_text = raw_text.replace('\n\n', ' ')
raw_text = raw_text.replace('\n', ' ')
raw_text = raw_text.replace('.', '. ')
raw_text = raw_text.replace('.  ', '.')

# delete some text that it not be a sentence
# find the index of "Minor titles"
start_index = raw_text.find("Minor titles")

# If "Minor titles" is found, slice the string to remove the content from that point onward
if start_index != -1:
    cleaned_text = raw_text[:start_index]

    # Print the cleaned text
    print(cleaned_text)
else:
    print("String 'Minor titles' not found.")

Liverpool Football Club is a professional football club based in Liverpool, England.The club competes in the Premier League, the top tier of English football.Founded in 1892, the club joined the Football League the following year and has played its home games at Anfield since its formation. Domestically, the club has won nineteen league titles, eight FA Cups, a record ten League Cups, one Football League Super Cup, one Sheriff of London Charity Shield and sixteen FA Community Shields.In international competitions, the club has won six European Cups, three UEFA Cups, four UEFA Super Cups—all English records—and one FIFA Club World Cup.The club established itself as a major force in domestic and European football in the 1970s and 1980s, when Bill Shankly, Bob Paisley, Joe Fagan and Kenny Dalglish, led the club to a combined 11 League titles and four European Cups.Liverpool won two further European Cups in 2005 and 2019 under the management of Rafael Benítez and Jürgen Klopp, respectively

In [5]:
import spacy

# split sentence from raw text
nlp = spacy.load("en_core_web_sm")
doc = nlp(cleaned_text)
sentences = list(doc.sents)

# lower case, and clean all the symbols
text = [x.text.lower() for x in sentences]
# clean the data
text = [re.sub("f. c.", 'fc ', x) for x in text]
text = [re.sub("[,!.?\\-;]", '', x) for x in text]

In [6]:
# making vocab list
word_list = list(set(" ".join(text).split()))
word2id   = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

In [7]:
for i, w in enumerate(word_list):
    word2id[w] = i + 4 #reserve the first 0-3 for CLS, PAD
    id2word    = {i:w for i, w  in enumerate(word2id)}
    vocab_size = len(word2id)

# convert the word to numeric of each sentence
token_list = list()
for sentence in text:
    arr = [word2id[word] for word in sentence.split()]
    token_list.append(arr)

In [8]:
id2word[1270]

'£218'

In [9]:
len(sentences)

213

## 3. Data loader

In [10]:
# assign some hyperparameter
batch_size = 6
max_mask   = 5 
max_len    = 200 

In [11]:
# create batch that has half for positive and another half for negative
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2: # if positive is half, negative is also half
        
        #randomly choose two sentence
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b            = token_list[tokens_a_index], token_list[tokens_b_index]
        
        #1. token embedding - add CLS and SEP on starting and ending of sentence respectively
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
        
        #2. segment embedding - which sentence is 0 (first sentence) and 1 (second sentence)
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        #3 masking
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else: 
                pass
            
        #4. pad the sentence to the max length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        #6. check whether is positive or negative
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # True = it is the next sentence
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1
        
    return batch
        

In [12]:
# check the make_batch to ensure that it work correctly
batch = make_batch()

In [13]:
len(batch)

6

In [14]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [15]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 200]),
 torch.Size([6, 200]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 0, 0, 1, 1, 1]))

## 4. Model


## 4.1 Embedding



In [16]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)      # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        #x, seg: (bs, len)
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (len,) -> (bs, len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

## 4.2 Attention mask

In [17]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

### Testing the attention mask

In [18]:
print(get_attn_pad_mask(input_ids, input_ids).shape)

torch.Size([6, 200, 200])


## 4.3 Encoder

In [19]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn       = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

Let's define the scaled dot attention, to be used inside the multihead attention

In [20]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn 

Let's define the parameters first

In [21]:
n_layers = 6    # number of Encoder of Encoder Layer
n_heads  = 8    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

Here is the Multiheadattention.

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]


Here is the PoswiseFeedForwardNet.

In [23]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(F.gelu(self.fc1(x)))


## 4.4 Putting them together

In [24]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        
        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_nsp

## 5. Training

In [24]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
num_epoch = 500
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

best_loss = float('inf')

start_time = time.time()
for epoch in range(num_epoch):
    optimizer.zero_grad()
    logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)    

    #1. mlm loss
    #logits_lm.transpose: (bs, vocab_size, max_mask) vs. masked_tokens: (bs, max_mask)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    #2. nsp loss
    #logits_nsp: (bs, 2) vs. isNext: (bs, )
    loss_nsp = criterion(logits_nsp, isNext) # for sentence classification
    
    #3. combine loss
    loss = loss_lm + loss_nsp
    if loss < best_loss:
        best_loss = loss
        torch.save(model.state_dict(), 'models/best-bert-model.pt')

    if epoch % 100 == 0:
        print('Epoch:', '%02d' % (epoch), 'loss =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')

Epoch: 00 loss = 99.312897
Epoch: 100 loss = 5.052397
Epoch: 200 loss = 4.202962
Epoch: 300 loss = 4.552831
Epoch: 400 loss = 3.918600
Time: 12m 21s


## 6. Inference

Since our dataset is very small, it won't work very well, but just for the sake of demonstration.

In [None]:
# Predict mask tokens ans isNext
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[2]))
print([id2word[w.item()] for w in input_ids[0] if id2word[w.item()] != '[PAD]'])

logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)

#predict masked tokens
#max the probability along the vocab dim (2), [1] is the indices of the max, and [0] is the first value
logits_lm = logits_lm.data.max(2)[1][0].data.numpy() 
#note that zero is padding we add to the masked_tokens
print('masked tokens (words) : ',[id2word[pos.item()] for pos in masked_tokens[0]])
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0]])
print('predict masked tokens (words) : ',[id2word[pos.item()] for pos in logits_lm])
print('predict masked tokens list : ', [pos for pos in logits_lm])

#predict nsp
logits_nsp = logits_nsp.data.max(1)[1][0].data.numpy()
print(logits_nsp)
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_nsp else False)

['[CLS]', 'liverpool', 'featured', 'in', 'the', '2001', 'film', 'the', '51st', 'state', '[MASK]', 'which', 'exhitman', 'felix', 'desouza', '(robert', 'carlyle)', 'is', 'a', 'keen', 'supporter', 'of', 'the', 'team', '[MASK]', 'the', 'last', 'scene', 'takes', 'place', 'at', '[MASK]', 'match', 'between', 'liverpool', 'and', 'manchester', 'united', '[SEP]', 'liverpool', 'suffered', 'its', 'second', 'cup', 'final', 'defeat', 'finances', '1950', 'playing', 'against', '[MASK]', '[SEP]']
masked tokens (words) :  ['and', 'in', 'in', 'a', 'arsenal']
masked tokens list :  [669, 775, 775, 908, 1145]
predict masked tokens (words) :  ['[PAD]', '[PAD]', 'of', '[PAD]', 'of']
predict masked tokens list :  [0, 0, 393, 0, 393]
0
isNext :  False
predict isNext :  False


# Task2

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [26]:
# load the MNLI dataset
import datasets

mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features

{'premise': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [27]:
# list of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [28]:
# remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [29]:
# list of datasets to ensure that 'idx' column is removed
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [30]:
# list all label that have in the dataset
np.unique(mnli['train']['label'])

array([0, 1, 2])

In [31]:
# create dataset dictionary with sample data (since my computer cannot run all dataset)
from datasets import DatasetDict

raw_dataset = DatasetDict({
    'train': mnli['train'].shuffle(seed=55).select(list(range(3000))),
    'test': mnli['test_mismatched'].shuffle(seed=55).select(list(range(500))),
    'validation': mnli['validation_mismatched'].shuffle(seed=55).select(list(range(1000)))
})

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

### Preprocessing

In [32]:

def preprocess_function(examples):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise=[]
    lst_attention_hypothesis=[]
    labels = examples['label']
    max_seq_length = 200
    seed(55) 
    for i in range(len(examples['premise'])):

        # convert the word to numeric
        tokens_premise, tokens_hypothesis            = [word2id[word] if word in word_list else len(word_list) for word in examples['premise'][i].split()], \
                                                    [word2id[word] if word in word_list else len(word_list) for word in examples['hypothesis'][i].split()]
        
        #1. token embedding - add CLS and SEP on beginning and ending of premise and hypothesis
        input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
        input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]
      
        #2. segment embedding - there one sentence so I decide to segment it as all 0
        segment_ids = [0] * max_seq_length
        #3 masking
        n_pred_premise = min(max_mask, max(1, int(round(len(input_ids_premise) * 0.15))))

        #get all the pos excluding CLS and SEP
        candidates_masked_pos_premise = [i for i, token in enumerate(input_ids_premise) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos_premise)
        masked_tokens_premise, masked_pos_premise = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos_premise[:n_pred_premise]:
            masked_pos_premise.append(pos)
            masked_tokens_premise.append(input_ids_premise[pos])
           
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids_premise[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids_premise[pos] = word2id['[MASK]']
            else: 
                pass

        n_pred_hypothesis = min(max_mask, max(1, int(round(len(input_ids_hypothesis) * 0.15))))
        #get all the pos excluding CLS and SEP
        candidates_masked_pos_hypothesis = [i for i, token in enumerate(input_ids_hypothesis) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
        shuffle(candidates_masked_pos_hypothesis)
        masked_tokens_hypothesis, masked_pos_hypothesis = [], [] #compare the output with masked_tokens
        #simply loop and mask accordingly
        for pos in candidates_masked_pos_hypothesis[:n_pred_hypothesis]:
            masked_pos_hypothesis.append(pos)
            masked_tokens_hypothesis.append(input_ids_hypothesis[pos])
            if random() < 0.1:  #10% replace with random token
                index = randint(0, vocab_size - 1)
                input_ids_hypothesis[pos] = word2id[id2word[index]]
            elif random() < 0.8:  #80 replace with [MASK]
                input_ids_hypothesis[pos] = word2id['[MASK]']
            else: 
                pass
        
        #4. pad the sentence to the max length
        n_pad_premise = max_seq_length - len(input_ids_premise)
        input_ids_premise.extend([0] * n_pad_premise)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred_premise:
            n_pad_premise = max_mask - n_pred_premise
            masked_tokens_premise.extend([0] * n_pad_premise)
            masked_pos_premise.extend([0] * n_pad_premise)
            attention_premise = [1]*n_pred_premise+[0]*(n_pad_premise)
            
        #4. pad the sentence to the max length
        n_pad_hypothesis = max_seq_length - len(input_ids_hypothesis)
        input_ids_hypothesis.extend([0] * n_pad_hypothesis)
        
        #5. pad the mask tokens to the max length
        if max_mask > n_pred_hypothesis:
            n_pad_hypothesis = max_mask - n_pred_hypothesis
            masked_tokens_hypothesis.extend([0] * n_pad_hypothesis)
            masked_pos_hypothesis.extend([0] * n_pad_hypothesis)
            attention_hypothesis = [1]*n_pred_hypothesis+[0]*(n_pad_hypothesis)
        
        # add the value to own list
        lst_input_ids_premise.append(input_ids_premise)
        lst_input_ids_hypothesis.append(input_ids_hypothesis)
        lst_segment_ids.append(segment_ids)
        lst_masked_tokens_premise.append(masked_tokens_premise)
        lst_masked_pos_premise.append(masked_pos_premise)
        lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
        lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
        lst_attention_premise.append(attention_premise)
        lst_attention_hypothesis.append(attention_hypothesis)

    # return as a dictionary
    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask":lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
        "labels" : labels,
    }

# map raw dataset with preprocess_function to create new data dict
tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map: 100%|██████████| 3000/3000 [00:01<00:00, 2814.75 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 2819.06 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3128.18 examples/s]


In [33]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_pos_mask', 'hypothesis_input_ids', 'hypothesis_pos_mask', 'segment_ids', 'attention_premise', 'attention_hypothesis', 'labels'],
        num_rows: 1000
    })
})

### Data loader

In [34]:
from torch.utils.data import DataLoader

# create the dataloader
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [35]:
# print the shape of each key 
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_pos_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_pos_mask'].shape)
    print(batch['segment_ids'].shape)
    print(batch['attention_premise'].shape)
    print(batch['attention_hypothesis'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 200])
torch.Size([32, 5])
torch.Size([32, 200])
torch.Size([32, 5])
torch.Size([32, 200])
torch.Size([32, 5])
torch.Size([32, 5])
torch.Size([32])


### Model

In [36]:
# load model from task1
model1 = BERT()
model1.load_state_dict(torch.load('models/best-bert-model.pt'))

<All keys matched successfully>

In [39]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

### Loss Function

In [40]:
# the function is for Classification Objective
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

# the function is for Regression Objective
def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [43]:
# classifier_head has shape (vocab_size*3,3)
classifier_head = torch.nn.Linear(1546*3, 3).to(device)

optimizer = torch.optim.Adam(model1.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [44]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



### Training the model1

In [None]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
start_time = time.time()
for epoch in range(num_epoch):
    model1.train()  
    classifier_head.train()
    best_loss = float('inf')
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        pos_mask_a = batch['premise_pos_mask'].to(device)
        pos_mask_b = batch['hypothesis_pos_mask'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        attention_a = batch['attention_premise'].to(device)
        attention_b = batch['attention_hypothesis'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u, _ = model1(inputs_ids_a, segment_ids, pos_mask_a)  
        v, _ = model1(inputs_ids_b, segment_ids, pos_mask_b)  
    

        u_last_hidden_state = u # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        if loss < best_loss:
            best_loss = loss
            torch.save(model1.state_dict(), 'models/trained-model1.pt')
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')

100%|██████████| 94/94 [15:52<00:00, 10.13s/it]


Epoch: 1 | loss = 3.849288


100%|██████████| 94/94 [16:41<00:00, 10.66s/it]


Epoch: 2 | loss = 3.948837


100%|██████████| 94/94 [16:35<00:00, 10.59s/it]


Epoch: 3 | loss = 2.719974


100%|██████████| 94/94 [16:30<00:00, 10.53s/it]


Epoch: 4 | loss = 2.783654


100%|██████████| 94/94 [16:08<00:00, 10.30s/it]

Epoch: 5 | loss = 2.899947
Time: 81m 48s





# Task 3

In [37]:
# function for calculate the total parameters
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    print(f'______\n{sum(params):>6}')

In [38]:
# create function for compute the loss of model1 from task1 and task2
def calculate_loss_model1(model, classifier, criterion, eval_dataloader):
    model.eval()
    classifier.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):

            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            label = batch['labels'].to(device)

            # extract token embeddings from BERT at last_hidden_state
            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  # all token embeddings A = batch_size, seq_len, hidden_dim
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b)  # all token embeddings B = batch_size, seq_len, hidden_dim

            # get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v, attention_b) # batch_size, hidden_dim

            # build the |u-v| tensor
            uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
            uv_abs = torch.abs(uv) # batch_size,hidden_dim
            
            # concatenate u, v, |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
            
            # process concatenated tensor through classifier_head
            x = classifier(x) #batch_size, classifer
            
            # calculate the 'softmax-loss' between predicted and true label
            loss = criterion(x, label)

            total_loss += loss
    
    average_loss = total_loss/len(eval_dataloader)
    print(f"Average Loss: {average_loss:.4f}")

In [39]:
# create function for compute the cosine similarity of model1 from task1 and task2
def calculate_cosine_sim_model1(model, classifier,eval_dataloader):
    model.eval()
    classifier.eval()
    total_similarity = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            # prepare batches and more all to the active device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            pos_mask_a = batch['premise_pos_mask'].to(device)
            pos_mask_b = batch['hypothesis_pos_mask'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            attention_a = batch['attention_premise'].to(device)
            attention_b = batch['attention_hypothesis'].to(device)
            label = batch['labels'].to(device)

            # extract token embeddings from BERT at last_hidden_state

            u, _ = model(inputs_ids_a, segment_ids, pos_mask_a)  
            v, _ = model(inputs_ids_b, segment_ids, pos_mask_b) 
            # get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

            similarity_score = cosine_similarity(u_mean_pool.reshape(1, -1), v_mean_pool.reshape(1, -1))[0, 0]
            total_similarity += similarity_score
        
    average_similarity = total_similarity / len(eval_dataloader)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

In [40]:
# tokenize the sentence of model 1
def tokenize_sentence_model1(sentence_a, sentence_b):
    lst_input_ids_premise = []
    lst_input_ids_hypothesis = []
    lst_masked_tokens_premise = []
    lst_masked_pos_premise = []
    lst_masked_tokens_hypothesis = []
    lst_masked_pos_hypothesis = []
    lst_segment_ids = []
    lst_attention_premise=[]
    lst_attention_hypothesis=[]
    max_seq_length = 200
    seed(55) 

    tokens_premise, tokens_hypothesis            = [word2id[word] if word in word_list else len(word_list) for word in sentence_a.split()], \
                                                    [word2id[word] if word in word_list else len(word_list) for word in sentence_b.split()]
    
    input_ids_premise = [word2id['[CLS]']] + tokens_premise + [word2id['[SEP]']]
    input_ids_hypothesis = [word2id['[CLS]']] + tokens_hypothesis + [word2id['[SEP]']]
    
    #2. segment embedding 
    segment_ids = [0] * max_seq_length
     #3 masking
    n_pred_premise = min(max_mask, max(1, int(round(len(input_ids_premise) * 0.15))))

    #get all the pos excluding CLS and SEP
    candidates_masked_pos_premise = [i for i, token in enumerate(input_ids_premise) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
    shuffle(candidates_masked_pos_premise)
    masked_tokens_premise, masked_pos_premise = [], [] #compare the output with masked_tokens
    #simply loop and mask accordingly
    for pos in candidates_masked_pos_premise[:n_pred_premise]:
        masked_pos_premise.append(pos)
        masked_tokens_premise.append(input_ids_premise[pos])
           
        if random() < 0.1:  #10% replace with random token
            index = randint(0, vocab_size - 1)
            input_ids_premise[pos] = word2id[id2word[index]]
        elif random() < 0.8:  #80 replace with [MASK]
            input_ids_premise[pos] = word2id['[MASK]']
        else: 
            pass

    n_pred_hypothesis = min(max_mask, max(1, int(round(len(input_ids_hypothesis) * 0.15))))
    #get all the pos excluding CLS and SEP
    candidates_masked_pos_hypothesis = [i for i, token in enumerate(input_ids_hypothesis) if token != word2id['[CLS]'] 
                                 and token != word2id['[SEP]']]
    shuffle(candidates_masked_pos_hypothesis)
    masked_tokens_hypothesis, masked_pos_hypothesis = [], [] #compare the output with masked_tokens
    #simply loop and mask accordingly
    for pos in candidates_masked_pos_hypothesis[:n_pred_hypothesis]:
        masked_pos_hypothesis.append(pos)
        masked_tokens_hypothesis.append(input_ids_hypothesis[pos])
        if random() < 0.1:  #10% replace with random token
            index = randint(0, vocab_size - 1)
            input_ids_hypothesis[pos] = word2id[id2word[index]]
        elif random() < 0.8:  #80 replace with [MASK]
            input_ids_hypothesis[pos] = word2id['[MASK]']
        else: 
            pass

    #4. pad the sentence to the max length
    n_pad_premise = max_seq_length - len(input_ids_premise)
    input_ids_premise.extend([0] * n_pad_premise)
        
    #5. pad the mask tokens to the max length
    if max_mask > n_pred_premise:
        n_pad_premise = max_mask - n_pred_premise
        masked_tokens_premise.extend([0] * n_pad_premise)
        masked_pos_premise.extend([0] * n_pad_premise)
        attention_premise = [1]*n_pred_premise+[0]*(n_pad_premise)
            
    #4. pad the sentence to the max length
    n_pad_hypothesis = max_seq_length - len(input_ids_hypothesis)
    input_ids_hypothesis.extend([0] * n_pad_hypothesis)
        
    #5. pad the mask tokens to the max length
    if max_mask > n_pred_hypothesis:
        n_pad_hypothesis = max_mask - n_pred_hypothesis
        masked_tokens_hypothesis.extend([0] * n_pad_hypothesis)
        masked_pos_hypothesis.extend([0] * n_pad_hypothesis)
        attention_hypothesis = [1]*n_pred_hypothesis+[0]*(n_pad_hypothesis)

    lst_input_ids_premise.append(input_ids_premise)
    lst_input_ids_hypothesis.append(input_ids_hypothesis)
    lst_segment_ids.append(segment_ids)
    lst_masked_tokens_premise.append(masked_tokens_premise)
    lst_masked_pos_premise.append(masked_pos_premise)
    lst_masked_tokens_hypothesis.append(masked_tokens_hypothesis)
    lst_masked_pos_hypothesis.append(masked_pos_hypothesis)
    lst_attention_premise.append(attention_premise)
    lst_attention_hypothesis.append(attention_hypothesis)

    return {
        "premise_input_ids": lst_input_ids_premise,
        "premise_pos_mask":lst_masked_pos_premise,
        "hypothesis_input_ids": lst_input_ids_hypothesis,
        "hypothesis_pos_mask": lst_masked_pos_hypothesis,
        "segment_ids": lst_segment_ids,
        "attention_premise": lst_attention_premise,
        "attention_hypothesis": lst_attention_hypothesis,
        
    }

In [41]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# create function for compute consine similarity of unseen 2 sentence 
def calculate_similarity_model1(model, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs = tokenize_sentence_model1(sentence_a, sentence_b)
    
    # Move input IDs and attention masks to the active device
    inputs_ids_a = torch.tensor(inputs['premise_input_ids'])
    pos_mask_a = torch.tensor(inputs['premise_pos_mask'])
    attention_a = torch.tensor(inputs['attention_premise'])
    inputs_ids_b = torch.tensor(inputs['hypothesis_input_ids'])
    pos_mask_b = torch.tensor(inputs['hypothesis_pos_mask'])
    attention_b = torch.tensor(inputs['attention_hypothesis'])
    segment = torch.tensor(inputs['segment_ids'])

    # Extract token embeddings from BERT
    u,_ = model(inputs_ids_a, segment, pos_mask_a)  
    v,_ = model(inputs_ids_b, segment, pos_mask_b) 

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score   

## Model 1

### Evaluate model1 before training with MNLI dataset

In [42]:
# load model1 before retrain in task2 
model1 = BERT()
model1.load_state_dict(torch.load('models/best-bert-model.pt'))

<All keys matched successfully>

In [43]:
count_parameters(model1)

______
37951500


In [50]:
calculate_cosine_sim_model1(model1,classifier_head,eval_dataloader)

Average Cosine Similarity: 0.9999


In [51]:
calculate_loss_model1(model1,classifier_head,criterion,eval_dataloader)

Average Loss: 16.4644


In [52]:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity_model1(model1, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9999


### Evaluate model1 after training with MNLI dataset

In [53]:
# Instantiate the BERT model
saved_model1 = BERT()
saved_model1.load_state_dict(torch.load('models/trained-model1.pt'))

<All keys matched successfully>

In [54]:
calculate_cosine_sim_model1(saved_model1,classifier_head,eval_dataloader)

Average Cosine Similarity: 0.9999


In [55]:
calculate_loss_model1(saved_model1,classifier_head,criterion,eval_dataloader)

Average Loss: 16.4727


In [56]:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity_model1(saved_model1, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 1.0000


## Create function to evaluate model2 and model3 with validation dataset

In [154]:
# create function to compute cosine similarity of model2 and model3
def calculate_cosine_sim2(model, classifier_head, eval_dataloader):
    model.eval()
    classifier_head.eval()
    total_similarity = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            # prepare batches and more all to the active device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            attention_a = batch['premise_attention_mask'].to(device)
            attention_b = batch['hypothesis_attention_mask'].to(device)
            label = batch['labels'].to(device)
            
            # extract token embeddings from BERT at last_hidden_state
            u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
            v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

            # get the mean pooled vectors
            u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

            similarity_score = cosine_similarity(u_mean_pool.reshape(1, -1), v_mean_pool.reshape(1, -1))[0, 0]
            total_similarity += similarity_score
        
    average_similarity = total_similarity / len(eval_dataloader)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

In [143]:
# create function to compute loss of model2 and model3
def calculate_loss2(model, classifier_head,criterion, eval_dataloader):
    model.eval()
    classifier_head.eval()
    total_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):
            # prepare batches and more all to the active device
            inputs_ids_a = batch['premise_input_ids'].to(device)
            inputs_ids_b = batch['hypothesis_input_ids'].to(device)
            attention_a = batch['premise_attention_mask'].to(device)
            attention_b = batch['hypothesis_attention_mask'].to(device)
            label = batch['labels'].to(device)
        
            # extract token embeddings from BERT at last_hidden_state
            u = model(inputs_ids_a, attention_mask=attention_a)  # all token embeddings A = batch_size, seq_len, hidden_dim
            v = model(inputs_ids_b, attention_mask=attention_b) # all token embeddings B = batch_size, seq_len, hidden_dim 

            u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
            v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

            # get the mean pooled vectors
            u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
            v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
            # build the |u-v| tensor
            uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
            uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
            # concatenate u, v, |u-v|
            x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
            # process concatenated tensor through classifier_head
            x = classifier_head(x) #batch_size, classifer
        
            # calculate the 'softmax-loss' between predicted and true label
            loss = criterion(x, label)
            total_loss += loss
            
    average_loss = total_loss / len(eval_dataloader)
    print(f"Average Loss: {average_loss:.4f}")

In [144]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# create function to compute cosine similarity on 2 unseen sentence on model2 and model3
def calculate_similarity2(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer(sentence_a, return_tensors='pt', truncation=True, padding=True).to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', truncation=True, padding=True).to(device)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids']
    attention_a = inputs_a['attention_mask']
    inputs_ids_b = inputs_b['input_ids']
    attention_b = inputs_b['attention_mask']

    # Extract token embeddings from BERT
    u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
    v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

## Model 2

### Preprocessing

In [45]:
# load the pretain tokenizer
from transformers import BertTokenizer

tokenizer_model2 = BertTokenizer.from_pretrained('bert-base-uncased')

In [46]:
# tokenize the data
def preprocess_function2(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer_model2(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer_model2(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets2 = raw_dataset.map(
    preprocess_function2,
    batched=True,
)

tokenized_datasets2 = tokenized_datasets2.remove_columns(['premise','hypothesis','label'])
tokenized_datasets2.set_format("torch")

In [47]:
tokenized_datasets2

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

### Data loader

In [48]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 32
train_dataloader2 = DataLoader(
    tokenized_datasets2['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader2 = DataLoader(
    tokenized_datasets2['validation'], 
    batch_size=batch_size
)
test_dataloader2 = DataLoader(
    tokenized_datasets2['test'], 
    batch_size=batch_size
)

In [49]:
for batch in train_dataloader2:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


### Model

In [50]:
# start from a pretrained bert-base-uncased model
from transformers import BertTokenizer, BertModel
model2 = BertModel.from_pretrained('bert-base-uncased')
model2.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [51]:
count_parameters(model2)

______
109482240


In [151]:
classifier_head2 = torch.nn.Linear(768*3, 3).to(device)

optimizer2 = torch.optim.Adam(model2.parameters(), lr=2e-5)
optimizer_classifier2 = torch.optim.Adam(classifier_head2.parameters(), lr=2e-5)

criterion2 = nn.CrossEntropyLoss()

In [152]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer2, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier2, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



### Evaluate model2 before training with MNLI dataset

In [155]:
calculate_cosine_sim2(model2,classifier_head2,eval_dataloader2)

Average Cosine Similarity: 0.7733


In [122]:
calculate_loss2(model2,classifier_head2,criterion2,eval_dataloader2)

Average Loss: 1.1354


In [123]:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity2(model2, tokenizer_model2, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.8057


### Training the model2 with MNLI dataset

In [None]:
from tqdm.auto import tqdm

num_epoch = 5
start_time = time.time()
best_loss = float('inf')
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model2.train()  
    classifier_head2.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader2, leave=True)):
        # zero all gradients on each new step
        optimizer2.zero_grad()
        optimizer_classifier2.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model2(inputs_ids_a, attention_mask=attention_a)  
        v = model2(inputs_ids_b, attention_mask=attention_b)  

        
        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head2(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion2(x, label)

        if loss < best_loss:
            best_loss = loss
            torch.save(model2.state_dict(), 'models/trained-model2.pt')
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer2.step()
        optimizer_classifier2.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')

100%|██████████| 94/94 [18:33<00:00, 11.85s/it]


Epoch: 1 | loss = 1.037066


100%|██████████| 94/94 [19:05<00:00, 12.18s/it]


Epoch: 2 | loss = 0.674230


100%|██████████| 94/94 [18:03<00:00, 11.53s/it]


Epoch: 3 | loss = 0.533247


100%|██████████| 94/94 [18:08<00:00, 11.58s/it]


Epoch: 4 | loss = 0.268627


100%|██████████| 94/94 [18:03<00:00, 11.53s/it]

Epoch: 5 | loss = 0.085280
Time: 91m 54s





### Evaluate model2 after training with MNLI dataset

In [None]:
saved_model2 = BertModel.from_pretrained('bert-base-uncased')

# Load the state dictionary of your trained model into the new model
saved_model2.load_state_dict(torch.load('models/trained-model2.pt'))

<All keys matched successfully>

In [156]:
calculate_cosine_sim2(saved_model2,classifier_head2,eval_dataloader2)

Average Cosine Similarity: 0.4133


In [127]:
calculate_loss2(saved_model2,classifier_head2,criterion2,eval_dataloader2)

Average Loss: 1.1322


In [128]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity2(saved_model2, tokenizer_model2, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: -0.0900


## Model 3

### Preprocessing

In [52]:
# load the pretain tokenizer
from transformers import RobertaTokenizer, RobertaModel

tokenizer_model3 = RobertaTokenizer.from_pretrained('roberta-base')

In [53]:
# tokenize the data
def preprocess_function3(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer_model3(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer_model3(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets3 = raw_dataset.map(
    preprocess_function3,
    batched=True,
)

tokenized_datasets3 = tokenized_datasets3.remove_columns(['premise','hypothesis','label'])
tokenized_datasets3.set_format("torch")

In [54]:
tokenized_datasets3

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

### Data loader

In [55]:
from torch.utils.data import DataLoader

# create dataloader
batch_size = 32
train_dataloader3 = DataLoader(
    tokenized_datasets3['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader3 = DataLoader(
    tokenized_datasets3['validation'], 
    batch_size=batch_size
)
test_dataloader3 = DataLoader(
    tokenized_datasets3['test'], 
    batch_size=batch_size
)

In [56]:
for batch in train_dataloader3:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


### Model

In [57]:
# load the roberta pretrained model
model3 = RobertaModel.from_pretrained('roberta-base')
model3.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [58]:
count_parameters(model3)

______
124645632


In [163]:
classifier_head3 = torch.nn.Linear(768*3, 3).to(device)

optimizer3 = torch.optim.Adam(model3.parameters(), lr=2e-5)
optimizer_classifier3 = torch.optim.Adam(classifier_head3.parameters(), lr=2e-5)

criterion3 = nn.CrossEntropyLoss()

In [164]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



### Evaluate model3 before training with MNLI dataset

In [165]:
calculate_cosine_sim2(model3,classifier_head3,eval_dataloader3)

Average Cosine Similarity: 0.9767


In [166]:
calculate_loss2(model3,classifier_head3,criterion3,eval_dataloader2)

Average Loss: 1.1290


In [167]:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity2(model3, tokenizer_model3, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9762


### Training model3 with MNLI dataset

In [None]:
from tqdm.auto import tqdm

num_epoch = 5
start_time = time.time()
best_loss = float('inf')
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model3.train()  
    classifier_head3.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader3, leave=True)):
        # zero all gradients on each new step
        optimizer3.zero_grad()
        optimizer_classifier3.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model3(inputs_ids_a, attention_mask=attention_a)  
        v = model3(inputs_ids_b, attention_mask=attention_b)  

        
        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head3(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion3(x, label)

        if loss < best_loss:
            best_loss = loss
            torch.save(model3.state_dict(), 'models/trained-model3.pt')
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer3.step()
        optimizer_classifier3.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print(f'Time: {epoch_mins}m {epoch_secs}s')

100%|██████████| 94/94 [18:21<00:00, 11.72s/it]


Epoch: 1 | loss = 1.027755


100%|██████████| 94/94 [18:03<00:00, 11.53s/it]


Epoch: 2 | loss = 0.798118


100%|██████████| 94/94 [18:03<00:00, 11.52s/it]


Epoch: 3 | loss = 0.935967


100%|██████████| 94/94 [18:01<00:00, 11.51s/it]


Epoch: 4 | loss = 0.643016


100%|██████████| 94/94 [18:04<00:00, 11.54s/it]

Epoch: 5 | loss = 0.119918
Time: 90m 35s





### Evaluate model3 after training with MNLI dataset

In [168]:
# Instantiate a new BERT model
saved_model3 = RobertaModel.from_pretrained('roberta-base')

# Load the state dictionary of your trained model into the new model
saved_model3.load_state_dict(torch.load('models/trained-model3.pt'))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [169]:
calculate_cosine_sim2(saved_model3,classifier_head3,eval_dataloader3)

Average Cosine Similarity: 0.4778


In [170]:
calculate_loss2(saved_model3,classifier_head3,criterion3,eval_dataloader2)

Average Loss: 1.2201


In [171]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity2(saved_model3, tokenizer_model3, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: -0.0682


| Model | trainable parameter | Average Cosine Similarity (before)| Average Loss (before)| Cosine Similarity with one specific pair sentence (before) | Average Cosine Similarity (after)| Average Loss (after)| Cosine Similarity with one specific pair sentence (after) | Training Time (train with MNLI dataset)
|:------------------------|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|
| Model1 (my model) |    37,951,500   |    0.9999     |  16.4644 | 0.9999 |  0.9999 | 16.4727 | 1.0000 | 81m 48s|
| Model2 (bert-base-uncased) |   109,482,240    |    0.7733 |  1.1354 | 0.8057 | 0.4133 | 1.1322 | -0.0900 | 91m 54s|  
| Model3 (roberta-base)     |      124,645,632  |    0.9762    |  1.1290  |   0.9762    | 0.4778 | 1.2201 | -0.0682 | 90m 35s|

From result, you can see that when model train with 3,000 sample MNLI dataset, the loss of model is higher expert model2 that increase a lillle. I observed that model with high number trainable parameter is better than one with small number of trainable parameter in loss.  


For hyperparameter, I think for model1, I can increase number of Encoder of Encoder Layer and number of heads in Multi-Head Attention to make model more complex and learn better.

For limitation, my computer cannot train with huge dataset/text (). Moreover, I have not enough time to train more epoch (my GPU is not work). it may cause my model to look so bad. In the future, I can improve this 3 model by training more epoch and train with huge dataset.