# 0.Install and import necessary libaries

In [None]:
!pip install transformers sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
from datasets import load_dataset
from datasets import Dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# 1.Load and preprocess Dataset

bài này sử dụng bộ dữ liệu natural language inference (NLI), lấy từ HuggingFace.  Bộ dữ liệu gồm 3 cột tên `premise`, `hypothesis` and `label`.

Cột `label` gồm 3 giá trị: **neutral, contradiction** và **entailment** thể hiện nhãn đánh giá mức độ tương tự về ngữ nghĩa giữa 2 câu `premise` và `hypothesis`. Cứ ứng với cùng một câu `premise` sẽ có 3 câu `hypothesis` tương ứng với các `label` khác nhau.

Trong bài này, ta muốn sử dụng kĩ thuật **Contrastive Learning** để finetune mô hình cho task Sentence Embedding.

In [None]:
dataset = load_dataset("snli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

In [None]:
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})


Dataset có cấu trúc tương tự dict (dictionary-like structrue) thường được sử dụng trong Hugging Face, gồm các dataset nhỏ là test, train và validation với số sample dữ liệu tương ứng là 10,000 dòng, 550,152 dòng và 10,000 dòng.

In [None]:
pd.DataFrame(dataset['train'][:9])


Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
5,Children smiling and waving at camera,The kids are frowning,2
6,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,2
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,0
8,A boy is jumping on skateboard in the middle o...,The boy is wearing safety equipment.,1


Để sử dụng Contrastrive Learning, ta xây dựng lại bộ dữ liệu từ bộ "snli" thành bộ dữ liệu chỉ còn gồm 2 nhãn:

- **Positive**: sử dụng lại các sample có label `entailment` từ bộ snli.
- **Hard negative**: giữ lại `premise`, random choice 1 câu bất kì trong bộ dữ liệu, ghép thành 1 cặp.

In [None]:
# Hàm tạo cặp câu negative
def generate_negative_pairs(df):
  result = []
  for i in range(len(df)//3):
    p = df['premise'][3*i]
    valid_h = df[(df['premise']!=p) | (~df['label'].isin([0,1]))]['hypothesis'].to_list()
    h = random.choice(valid_h)
    result.append({'premise':p, 'hypothesis':h, 'entailment':float(-1)})
  return pd.DataFrame(result)
df=pd.DataFrame(dataset['train'][:9])
generate_negative_pairs(df)


Unnamed: 0,premise,hypothesis,entailment
0,A person on a horse jumps over a broken down a...,The boy skates down the sidewalk.,-1.0
1,Children smiling and waving at camera,The boy does a skateboarding trick.,-1.0
2,A boy is jumping on skateboard in the middle o...,"A person is outdoors, on a horse.",-1.0


In [None]:
df=pd.DataFrame(dataset['train'][:12])
df

Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
5,Children smiling and waving at camera,The kids are frowning,2
6,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,2
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,0
8,A boy is jumping on skateboard in the middle o...,The boy is wearing safety equipment.,1
9,An older man sits with his orange juice at a s...,An older man drinks his juice as he waits for ...,1


In [None]:
# 1 cách khác tạo hàm tạo cặp câu negative
def generate_negative_pairs(df):
  result = []
  for i in range(len(df)//3):
    p = df['premise'][3*i]
    valid_idx = list(range(3*i))+list(range(3*i+3,len(df)))
    # print(valid_idx)
    h = df['hypothesis'][random.choice(valid_idx)]
    result.append({'premise':p, 'hypothesis':h, 'entailment':float(-1)})
  return pd.DataFrame(result)
df=pd.DataFrame(dataset['train'][:12])
generate_negative_pairs(df)


Unnamed: 0,premise,hypothesis,entailment
0,A person on a horse jumps over a broken down a...,An elderly man sits in a small shop.,-1.0
1,Children smiling and waving at camera,"A person is at a diner, ordering an omelette.",-1.0
2,A boy is jumping on skateboard in the middle o...,They are smiling at their parents,-1.0
3,An older man sits with his orange juice at a s...,There are children present,-1.0


In [None]:
# preprocess dữ liệu
def wrangle(dataset):
    df = pd.DataFrame(dataset)
    # create pos pairs
    df_pos = df[df['label']==1].drop(columns=["label"])
    df_pos['entailment'] = 1.0
    # create neg pairs
    df_neg = generate_negative_pairs(df)
    # final dataset
    df_result = pd.concat([df_pos, df_neg], axis=0, ignore_index=True)
    dataset_dict = df_result.to_dict(orient='list')
    return df_result, Dataset.from_dict(dataset_dict)


In [None]:
df_test, dict_test = wrangle(dataset['test'])

In [None]:
df_val, dict_val = wrangle(dataset['validation'])

In [None]:
# do hạn chế về nguồn lực tính toán, giảm bộ dữ liệu lại thành 200,000 samples thay vì khoảng 500,000 samples
df_train, dict_train = wrangle(dataset['train'][:200000])

In [None]:
df_val

Unnamed: 0,premise,hypothesis,entailment
0,Two women are embracing while holding to go pa...,The sisters are hugging goodbye while holding ...,1.0
1,"Two young children in blue jerseys, one with t...",Two kids at a ballgame wash their hands.,1.0
2,A man selling donuts to a customer during a wo...,A man selling donuts to a customer during a wo...,1.0
3,Two young boys of opposing teams play football...,boys scoring a touchdown,1.0
4,A man in a blue shirt standing in front of a g...,A man is repainting a garage,1.0
...,...,...,...
6563,Two police officers are sitting on motorcycles...,A man is giving a woman cooking lessons.,-1.0
6564,A man is performing an aerial skateboard trick...,A little girl in red plays tennis.,-1.0
6565,A man is performing an aerial skateboard trick...,A person is riding a horse.,-1.0
6566,A uniformed competitor in motocross has crosse...,The girl is outside.,-1.0


In [None]:
# # Do bộ dữ liệu khá nặng, mỗi lần đều chạy lại bước này sẽ chậm, đoạn code dưới đây là để lưu dataset
# # sau khi clean này thành file csv và load lại sử dụng ở những lần sau
# df_train.to_csv('df_train.csv', index=False)
# df_test.to_csv('df_test.csv', index=False)
# df_val.to_csv('df_val.csv', index=False)

# from datasets import Dataset
# df_train = pd.read_csv('df_train.csv')
# dict_train = Dataset.from_dict(df_train)
# df_test = pd.read_csv('df_test.csv')
# dict_test = Dataset.from_dict(df_test)
# df_val = pd.read_csv('df_val.csv')
# dict_val = Dataset.from_dict(df_val)

In [None]:
dict_test[1]

{'premise': 'A woman with a green headscarf, blue shirt and a very big grin.',
 'hypothesis': 'The woman is young.',
 'entailment': 1.0}

# 2.Define the dataset loader class

In [None]:
# instantiate the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df_train[:2]

Unnamed: 0,premise,hypothesis,entailment
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1.0
1,Children smiling and waving at camera,They are smiling at their parents,1.0


In [None]:
dict_train[:2]

{'premise': ['A person on a horse jumps over a broken down airplane.',
  'Children smiling and waving at camera'],
 'hypothesis': ['A person is training his horse for a competition.',
  'They are smiling at their parents'],
 'entailment': [1.0, 1.0]}

In [None]:
len(df_train)

133114

In [None]:
# dataset loader
class SNLIDataset(torch.utils.data.Dataset):
  def __init__(self, dataset):
    self.normalized_similarity_scores = [i['entailment'] for i in dataset]
    self.first_sentences = [i['premise'] for i in dataset] # list các câu premise
    self.second_sentences = [i['hypothesis'] for i in dataset] # list các câu hypothesis
    self.concatenated_sentences = [[str(x), str(y)] for x, y in zip(self.first_sentences, self.second_sentences)] # list các cặp câu [premise, hypothesis]
  def concat_sen(self):
    """
    Returns concatenated sentences (pairs of premise and hypothesis) stored in the dataset.

    Returns:
      list: List of concatenated sentences (pairs of premise and hypothesis).

    Note:
      This method returns a list of concatenated sentences (pairs of premise and hypothesis) stored in the dataset. It can be used to access the concatenated sentences for further processing or analysis.
    """
    return self.concatenated_sentences
  def __len__(self):
    """
    Returns the number of samples in the dataset loader.

    Returns:
      int: The number of samples in the dataset loader.

    Note:
      This method calculates and returns the total number of samples in the dataset loader. It provides the length of the dataset, which is useful for various operations such as iterating over the dataset or determining the size of the dataset.
    """
    return len(self.normalized_similarity_scores)
  def get_batch_labels(self, idx):
    """
    Retrieves the normalized similarity score between the premise and hypothesis sentences at the specified index.

    Parameters:
      idx (int): Index of the sample.

    Returns:
      torch.Tensor: Normalized similarity score (1 or 0) between the premise and hypothesis sentences.

    Note:
      This function returns the normalized similarity score (entailment score) between the premise and hypothesis sentences at the given index `idx`. The score indicates the degree of semantic entailment between the two sentences, with 1 representing full entailment and 0 representing no entailment.
    """
    return torch.tensor(self.normalized_similarity_scores[idx])

  def get_batch_texts(self, idx):
    """tokenize cặp câu đã được concatenated tại chỉ mục [idx]

    Parameters:
      idx (int): Index of the sample.

    Returns:
      dict: Tokenized input texts including input_ids and attention_mask.

    Note:
      This function utilizes the pre-initialized tokenizer to tokenize the concatenated sentences. The tokenization includes padding to a maximum length of 128 tokens and truncation if the input exceeds the maximum length. The returned dictionary contains 'input_ids' and 'attention_mask' for the tokenized input texts.

    """
    return tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=128, truncation=True, return_tensors="pt") # sử dụng Tokenizer của model bert-base-uncase đã load ở trên

  def __getitem__(self, idx):
    """
    Retrieves tokenized input texts and corresponding labels for the sample at the specified index.

    Parameters:
      idx (int): Index of the sample.

    Returns:
      tuple: A tuple containing tokenized input texts and corresponding labels.

    Note:
      This method retrieves tokenized input texts (premise and hypothesis sentences) and their corresponding labels for the sample at the given index `idx`. It tokenizes the input texts using the `get_batch_texts` method, which applies the configured tokenizer with padding and truncation. Additionally, it retrieves the corresponding label (normalized similarity score) using the `get_batch_labels` method. The returned tuple contains the tokenized input texts and the corresponding label.
    """
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)
    return batch_texts, batch_y


def collate_fn(texts):
  """
  Collates tokenized input texts into a format suitable for model input.

  Parameters:
    texts (list): List of tokenized input texts.

  Returns:
    list: List of dictionaries containing input_ids and attention_mask for each text.

  Note:
    This function is used as a collation function for batching in DataLoader. It takes a list of tokenized input texts and collates them into a format suitable for model input. Specifically, it extracts the 'input_ids' and 'attention_mask' from each tokenized text and organizes them into dictionaries. Each dictionary contains the 'input_ids' and 'attention_mask' for a single text sample. The collated list of dictionaries is then used as input to the model.
  """
  input_ids = texts['input_ids']
  attention_masks = texts['attention_mask']
  features = [{'input_ids': input_id, 'attention_mask': attention_mask}
                for input_id, attention_mask in zip(input_ids, attention_masks)]
  return features

# 3.Define model class

In [None]:
class BertForSNLI(torch.nn.Module):
    # Pytorch module for task sentence pair classification
    def __init__(self):
        super(BertForSNLI, self).__init__()
        #initialize BERT model from Transformer library, max sequence length=128
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        #initialize a mean Pooling layer using Pooling class from sentence_transformers library.
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension()) #self.bert.get_word_embedding_dimension() = 768
        #initialize a SentenceTransformer model by combining the BERT model and the pooling layer
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        output = self.sts_bert(input_data)['sentence_embedding'] #output là sentence embedding của 1 câu input
        return output

In [None]:
train_ds.__getitem__(1)[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
m = models.Transformer('bert-base-uncased', max_seq_length=128)
r = m(train_ds.__getitem__(1)[0])
r

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'input_ids': tensor([[  101,  1037,  2711,  2006,  1037,  3586, 14523,  2058,  1037,  3714,
          2091, 13297,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [None]:
r['token_embeddings'].shape

torch.Size([2, 128, 768])

In [None]:
# Instantiate the model and move it to GPU
model = BertForSNLI()
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSNLI(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

In [None]:
# Kiểm tra các layer của model
for name, param in model.named_parameters():
    print(f'{name}: {param.requires_grad}')

bert.auto_model.embeddings.word_embeddings.weight: True
bert.auto_model.embeddings.position_embeddings.weight: True
bert.auto_model.embeddings.token_type_embeddings.weight: True
bert.auto_model.embeddings.LayerNorm.weight: True
bert.auto_model.embeddings.LayerNorm.bias: True
bert.auto_model.encoder.layer.0.attention.self.query.weight: True
bert.auto_model.encoder.layer.0.attention.self.query.bias: True
bert.auto_model.encoder.layer.0.attention.self.key.weight: True
bert.auto_model.encoder.layer.0.attention.self.key.bias: True
bert.auto_model.encoder.layer.0.attention.self.value.weight: True
bert.auto_model.encoder.layer.0.attention.self.value.bias: True
bert.auto_model.encoder.layer.0.attention.output.dense.weight: True
bert.auto_model.encoder.layer.0.attention.output.dense.bias: True
bert.auto_model.encoder.layer.0.attention.output.LayerNorm.weight: True
bert.auto_model.encoder.layer.0.attention.output.LayerNorm.bias: True
bert.auto_model.encoder.layer.0.intermediate.dense.weight: Tru

In [None]:
# unfreeze layer cuối của BERT model
total_layers = len(model.bert.auto_model.encoder.layer)
last_layer_index = total_layers - 1
k = 1  # You can adjust k based on your preference

# Freeze layers up to last_layer_index - k
for layer_index in range(last_layer_index - k + 1):
    for param in model.bert.auto_model.encoder.layer[layer_index].parameters():
        param.requires_grad = False

# Unfreeze the last k layers
for layer_index in range(last_layer_index - k + 1, last_layer_index + 1):
    for param in model.bert.auto_model.encoder.layer[layer_index].parameters():
        param.requires_grad = True


In [None]:
for name, param in model.named_parameters():
    print(f'{name}: {param.requires_grad}')

bert.auto_model.embeddings.word_embeddings.weight: True
bert.auto_model.embeddings.position_embeddings.weight: True
bert.auto_model.embeddings.token_type_embeddings.weight: True
bert.auto_model.embeddings.LayerNorm.weight: True
bert.auto_model.embeddings.LayerNorm.bias: True
bert.auto_model.encoder.layer.0.attention.self.query.weight: False
bert.auto_model.encoder.layer.0.attention.self.query.bias: False
bert.auto_model.encoder.layer.0.attention.self.key.weight: False
bert.auto_model.encoder.layer.0.attention.self.key.bias: False
bert.auto_model.encoder.layer.0.attention.self.value.weight: False
bert.auto_model.encoder.layer.0.attention.self.value.bias: False
bert.auto_model.encoder.layer.0.attention.output.dense.weight: False
bert.auto_model.encoder.layer.0.attention.output.dense.bias: False
bert.auto_model.encoder.layer.0.attention.output.LayerNorm.weight: False
bert.auto_model.encoder.layer.0.attention.output.LayerNorm.bias: False
bert.auto_model.encoder.layer.0.intermediate.dense.w

# 4.Define the cosine similarity loss function

In [None]:
class CosineSimilarityLoss(torch.nn.Module):

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        emb_1 = torch.stack([inp[0] for inp in inputs]) #embeddings from the first position of each pair and stack into a tensor
        emb_2 = torch.stack([inp[1] for inp in inputs]) #embeddings from the second position of each pair and stack into a tensor
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2)) #computes the cosine similarity between the pairs of embeddings and applies the transformation function to the cosine similarity scores
        return self.loss_fn(outputs, labels.squeeze())

Thực ra phần code ở trên không đúng tinh thần của Contrastive learning. => cải thiện vào lần code tới

# 5.Prepare the training and validation data split

In [None]:
# from datasets import Dataset
# # for computational reason, just take a sample of 500 training, 50 validation, 100 testing
# def random_sample(dataset, n_samples):
#   sampled_indices = random.sample(range(len(dataset)), n_samples)

#   # Create a new sampled dataset
#   sampled_dataset = {
#     'premise': [dataset['premise'][i] for i in sampled_indices],
#     'hypothesis': [dataset['hypothesis'][i] for i in sampled_indices],
#     'label': [dataset['label'][i] for i in sampled_indices]
#   }
#   return Dataset.from_dict(sampled_dataset)
# train = random_sample(dataset['train'], 1000)
# train

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 1000
})

In [None]:
# val = random_sample(dataset['validation'], 100)
# test = random_sample(dataset['test'], 100)

In [None]:
# try with small size set -> run ok, no bug, no performance
# train_ds = SNLIDataset(Dataset.from_dict(dict_train[:16]))
# val_ds = SNLIDataset(Dataset.from_dict(dict_val[:8]))

# try with bigger dataset
train_ds = SNLIDataset(dict_train)
val_ds = SNLIDataset(dict_val)

# check length of dataset|
train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

133,114 training samples
6,568 validation samples


In [None]:
batch_size = 64

train_dataloader = DataLoader(
            train_ds,  # The training samples.
            num_workers = 12,
            batch_size = batch_size, # Use this batch size.
            shuffle=True # Select samples randomly for each batch
        )

validation_dataloader = DataLoader(
            val_ds,
            num_workers = 12,
            batch_size = batch_size # Use the same batch size
        )



In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-6)

In [None]:
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
# Takes a time in seconds and returns a string hh:mm:ss
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
def train():
  seed_val = 42

  criterion = CosineSimilarityLoss()
  criterion = criterion.to(device)

  random.seed(seed_val)
  torch.manual_seed(seed_val)

  # We'll store a number of quantities such as training and validation loss,
  # validation accuracy, and timings.
  training_stats = []
  total_t0 = time.time()

  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()

      total_train_loss = 0

      model.train()

      # For each batch of training data...
      for train_data, train_label in tqdm(train_dataloader):

          train_data['input_ids'] = train_data['input_ids'].to(device)
          train_data['attention_mask'] = train_data['attention_mask'].to(device)

          train_data = collate_fn(train_data)
          model.zero_grad()

          output = [model(feature) for feature in train_data]

          loss = criterion(output, train_label.to(device))
          total_train_loss += loss.item()

          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          scheduler.step()


      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)

      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)

      print("")
      print("  Average training loss: {0:.5f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))

      # ========================================
      #               Validation
      # ========================================

      print("")
      print("Running Validation...")

      t0 = time.time()

      model.eval()

      total_eval_accuracy = 0
      total_eval_loss = 0
      nb_eval_steps = 0

      # Evaluate data for one epoch
      for val_data, val_label in tqdm(validation_dataloader):

          val_data['input_ids'] = val_data['input_ids'].to(device)
          val_data['attention_mask'] = val_data['attention_mask'].to(device)

          val_data = collate_fn(val_data)

          with torch.no_grad():
              output = [model(feature) for feature in val_data]

          loss = criterion(output, val_label.to(device))
          total_eval_loss += loss.item()

      # Calculate the average loss over all of the batches.
      avg_val_loss = total_eval_loss / len(validation_dataloader)

      # Measure how long the validation run took.
      validation_time = format_time(time.time() - t0)

      print("  Validation Loss: {0:.5f}".format(avg_val_loss))
      print("  Validation took: {:}".format(validation_time))

      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )

  print("")
  print("Training complete!")

  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

  return model, training_stats

In [None]:
# Launch the training
model, training_stats = train()


Training...


100%|██████████| 2080/2080 [1:21:29<00:00,  2.35s/it]



  Average training loss: 0.69725
  Training epoch took: 1:21:29

Running Validation...


100%|██████████| 103/103 [01:55<00:00,  1.12s/it]


  Validation Loss: 0.66796
  Validation took: 0:01:56

Training...


100%|██████████| 2080/2080 [1:21:45<00:00,  2.36s/it]



  Average training loss: 0.67281
  Training epoch took: 1:21:45

Running Validation...


100%|██████████| 103/103 [01:55<00:00,  1.13s/it]

  Validation Loss: 0.65957
  Validation took: 0:01:56

Training complete!
Total training took 2:47:06 (h:mm:ss)





In [None]:
# Create a DataFrame from our training statistics
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index
df_stats = df_stats.set_index('epoch')

# Display the table
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.697253,0.66796,1:21:29,0:01:56
2,0.67281,0.659573,1:21:45,0:01:56


In [None]:
model.eval()

def predict_similarity(sentence_pair):
  test_input = tokenizer(sentence_pair, padding='max_length', max_length = 128, truncation=True, return_tensors="pt").to(device)
  test_input['input_ids'] = test_input['input_ids']
  test_input['attention_mask'] = test_input['attention_mask']
  del test_input['token_type_ids']
  output = model(test_input)
  sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()
  if sim >0:
    return 1
  else:
    return -1

In [None]:
PATH = 'bert-sts.pt'
torch.save(model.state_dict(), PATH)

# 6.Test

In [None]:
test_ds = SNLIDataset(dict_test)
test_size = len(test_ds)


print('{:>5,} test samples'.format(test_size))

6,552 test samples


In [None]:
y_true = [int(y) for y in test_ds.normalized_similarity_scores]
len(y_true)

6552

In [None]:
y_pred = []
for pair in test_ds.concat_sen():
  y_pred.append(predict_similarity(pair))
len(y_pred)

6552

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          -1       0.98      0.56      0.71      3333
           1       0.68      0.99      0.81      3219

    accuracy                           0.77      6552
   macro avg       0.83      0.77      0.76      6552
weighted avg       0.83      0.77      0.76      6552



In [None]:
# Thử load lại model nhưng không thành công
from sentence_transformers import SentenceTransformer, models
import torch

class BertForSNLI(torch.nn.Module):
    def __init__(self):
        super(BertForSNLI, self).__init__()
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        output = self.sts_bert(input_data)['sentence_embedding']
        return output

# Create an instance of your model
model = BertForSNLI()

# Load the saved model state dictionary
PATH = '/content/bert-sts.pt'
model.load_state_dict(torch.load(PATH))

# Put the model in evaluation mode
model.eval()

RuntimeError: ignored

Lần tới, thử train, load lại model xem sao, rồi thực hiện task classifiation

In [None]:
prediction[0].shape

NameError: ignored

In [None]:
# Prepare the data
first_sent = test_ds.first_sentences
second_sent = test_ds.second_sentences
full_text = test_ds.concatenated_sentences


In [None]:
example_1 = full_text[2]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {predict_similarity(example_1)}")
test[2]

In [None]:
example_1

In [None]:
example_1 = full_text[1]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {predict_similarity(example_1)}")
test[1]