<a href="https://colab.research.google.com/github/SolanaO/Blogs_Content/blob/master/tsdae/Sentence_Embeddings_With_TSDAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TSDAE Embeddings

## Workspace Setup

In [None]:
!pip install sentence-transformers
!pip install datasets
!pip install pylatexenc

In [None]:
# Necessary imports

import json
import pandas as pd
import math
import os
import gzip
import csv
import random
import time

from pylatexenc.latex2text import LatexNodes2Text

import nltk
nltk.download('punkt')

from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

from sentence_transformers import models, util, evaluation, losses
from sentence_transformers import datasets

import datasets as dts
from datasets import load_dataset

from torch.utils.data import DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load and mount the drive helper
from google.colab import drive

# This will prompt for authorization
drive.mount('/content/drive')

# Change the working directory
%cd /content/drive/MyDrive/tsdae

# Create a path variable
file_path = "/content/drive/MyDrive/tsdae"

Mounted at /content/drive
/content/drive/MyDrive/tsdae


## Load and Prepare the Pre-Training arXiv Data

We are going to use part of the [Kaggle arXiv Dataset](https://www.kaggle.com/datasets/Cornell-University/arxiv), which comprises more than 1.7M scholarly STEM papers and their metadata from the established electronic preprint platform [arXiv](https://arxiv.org). After downloading the dataset, we extract the preprints with the chosen topics. The full dataset is quite large, so we adjust our approach and extract only the part we need.


In [None]:
# Extract the papers whose arXiv id contains "math"
# Skip this step if you are using the math papers datafile

def extract_entries_with_math(filename: str):
    """
    Collect the records of a JSON-lines file whose 'id' contains the string 'math'.

    The file is read one line at a time and only the matching records are
    kept in the returned list.

    Args:
        filename: Path to a text file holding one JSON value per line.

    Returns:
        A list with the decoded JSON objects (dicts) whose 'id' value is a
        string containing 'math', in the order they appear in the file.

    Notes:
        - Lines that are not valid JSON are reported with a printed message
          and skipped.
        - Blank lines and JSON values that are not objects are skipped silently.
        - NOTE(review): only the 'id' field is inspected, so a record whose
          'id' lacks the substring is skipped whatever its other fields say.
          Confirm this is the intended selection.
    """

    # Initialize an empty list to store the extracted entries.
    entries_with_math = []

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line carries no record; do not report it as an error
            if not line.strip():
                continue

            try:
                # Load the JSON value from the line
                data = json.loads(line)
            except json.JSONDecodeError:
                # Print an error message if this line isn't valid JSON
                print(f"Couldn't parse: {line}")
                continue

            # Valid JSON that is not an object (number, list, ...) has no "id"
            if not isinstance(data, dict):
                continue

            # Keep the record only when "id" is a string containing "math"
            paper_id = data.get("id")
            if isinstance(paper_id, str) and "math" in paper_id:
                entries_with_math.append(data)

    return entries_with_math

# Snapshot of the arXiv dataset
arxiv_full_dataset = file_path + "/data/arxiv-metadata-oai-snapshot.json"

# Destination of the extracted mathematics papers
arxiv_dataset_math = file_path + "/data/arxiv_math_dataset.json"

# Scanning the full snapshot is slow and requires the large download, so only
# do it when the extracted file is missing. Readers who start from the math
# papers datafile can then run every cell without editing the notebook.
if not os.path.exists(arxiv_dataset_math):
    # Extract the mathematics papers
    entries = extract_entries_with_math(arxiv_full_dataset)

    # Save the dataset as a JSON object. Write to a temporary name first and
    # rename afterwards, so an interrupted run never leaves a truncated file
    # that the existence check above would mistake for a finished extraction.
    arxiv_dataset_math_tmp = arxiv_dataset_math + ".tmp"
    with open(arxiv_dataset_math_tmp, 'w', encoding='utf-8') as fout:
        json.dump(entries, fout)
    os.replace(arxiv_dataset_math_tmp, arxiv_dataset_math)

In [None]:
# Location of the json file containing the selected publications
filename = f"{file_path}/data/arxiv_math_dataset.json"

# Read the whole JSON document into memory
with open(filename, mode='r') as fin:
    dataset = json.load(fin)

In [None]:
# Build a Pandas dataframe from the loaded records
df = pd.DataFrame(data=dataset)

# Preview the first two rows
df.head(n=2)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,math-ph/0001001,Maurice Kibler,Jacob Katriel and Maurice Kibler,Normal Ordering for Deformed Boson Operators a...,"14 pages, Latex file",J. Phys. A: Math. Gen. 25 (1992) 2683-2691,10.1088/0305-4470/25/9/036,,math-ph math.MP quant-ph,,The normal ordering formulae for powers of t...,"[{'version': 'v1', 'created': 'Sun, 2 Jan 2000...",2009-10-31,"[[Katriel, Jacob, ], [Kibler, Maurice, ]]"
1,math-ph/0001002,Spohn,"Markus Kunze, Herbert Spohn",Slow Motion of Charges Interacting Through the...,,,10.1007/s002200000219,,math-ph math.DS math.MP,,We study the Abraham model for $N$ charges i...,"[{'version': 'v1', 'created': 'Mon, 3 Jan 2000...",2009-10-31,"[[Kunze, Markus, ], [Spohn, Herbert, ]]"


In [None]:
# Parse the titles by transforming the LaTeX script

def latex_title_to_text(title, converter):
    """
    Convert the LaTeX markup of a title to plain text on a single line.

    Args:
        title: The raw title string.
        converter: Object exposing `latex_to_text(str) -> str`
            (here a `LatexNodes2Text` instance).

    Returns:
        The converted title with newlines replaced by spaces and surrounding
        whitespace removed. If the conversion raises, the raw title is
        cleaned up the same way and returned instead.
    """
    try:
        text = converter.latex_to_text(title)
    except Exception:
        # The markup could not be converted: fall back to the raw title.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        text = title
    return text.replace('\n', ' ').strip()

# One converter is enough for all titles; no need to rebuild it per row
latex_converter = LatexNodes2Text()

# Only the 'title' column is needed, so iterate over it directly
parsed_titles = [latex_title_to_text(title, latex_converter) for title in df['title']]

df['parsed_title'] = parsed_titles

In [None]:
# Collect the parsed titles in a plain Python list
train_sentences = df['parsed_title'].tolist()
# Number of training sentences
len(train_sentences)

55497

In [None]:
# Wrap the sentences in the denoising dataset, which adds noise to the data
train_dataset = datasets.DenoisingAutoEncoderDataset(sentences=train_sentences)

In [None]:
# Show one training example picked by position
example_index = 2010
print(train_dataset[example_index])

<InputExample> label: 0, texts: equations for the XXZ; On solutions of Bethe equations for the XXZ model


In [None]:
# Shuffled mini-batches of 8 examples; a trailing incomplete batch is dropped
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
)

## Model & Pre-Training

In [None]:
# Checkpoint used to build the word embeddings
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)

# Pool the word embeddings with the 'cls' mode
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, pooling_mode='cls')

# Chain the two modules into a sentence transformer
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Denoising auto-encoder loss; the decoder starts from the same checkpoint
# and shares (ties) its weights with the encoder
train_loss = losses.DenoisingAutoEncoderLoss(
    model,
    decoder_name_or_path=model_name,
    tie_encoder_decoder=True,
)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.3.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.3.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.5.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.4.crossattention.output.dense.bias', 'bert.encoder.layer.3.crossattention.self.key.weight', 'bert.encoder.layer.9.crossattention.self.value.bias', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.2.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.8.crossattention.output.dense.bias', 'bert.encoder.layer.3.crossattention.self.value.bias', 'bert.encoder.layer.2.crossattention.self.value.bias', 'bert.encoder.la

In [None]:
# Save path of the model
pretrained_model_save_path = 'output/tsdae-bert-uncased-math'

# Settings handed to the fit method
tsdae_fit_options = {
    'train_objectives': [(train_dataloader, train_loss)],
    'epochs': 1,
    'weight_decay': 0,
    'scheduler': 'constantlr',
    'optimizer_params': {'lr': 3e-5},
    'show_progress_bar': True,
    'use_amp': True,  # set to False if GPU does not support FP16 cores
}

# Time the training run
start_time = time.time()
model.fit(**tsdae_fit_options)
end_time = time.time()

# Report the elapsed wall-clock time
elapsed_time = end_time - start_time
print(f"Model trained in {elapsed_time:.2f} seconds, on a Google Colab Pro with A100 GPU & High-RAM.")

# Save the model locally
model.save(pretrained_model_save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6937 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Model trained in 870.64 seconds, on a Google Colab Pro with A100 GPU & High-RAM.


## Evaluate TSDAE Pre-Trained Model

Perform a standard evaluation on the benchmark STS datasets. These evaluations, however, do not use the specific domain we are working with and might not reflect the true performance of the model.

In [None]:
# Folder holding the TSDAE pre-trained model
pre_model_save_path = 'output/tsdae-bert-uncased-math'
# Reload the model from disk if necessary
model = SentenceTransformer(model_name_or_path=pre_model_save_path)

In [None]:
# Fetch the validation split of the STS benchmark (GLUE 'stsb') from HuggingFace
sts = dts.load_dataset(path='glue', name='stsb', split='validation')

# Display a summary of the dataset
sts

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [None]:
# Take a peek at one of the entries: fetch the row once, then show its fields
entry = sts[100]
entry['idx'], entry['sentence1'], entry['sentence2'], entry['label']

(100,
 'A woman is riding on a horse.',
 'A man is turning over tables in anger.',
 0.0)

In [None]:
# Normalize the [0, 5] range to [0, 1].
# The result gets its own name so that `sts` keeps the raw labels: re-running
# this cell can then never divide already-normalized labels by 5 again.
sts_normalized = sts.map(lambda x: {'label': x['label'] / 5.0})

# Reformat every sentence pair to use InputExample
samples = [
    InputExample(
        texts=[sample['sentence1'], sample['sentence2']],
        label=sample['label'],
    )
    for sample in sts_normalized
]

# Instantiate the evaluation module
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    samples, write_csv=False
)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Score the TSDAE pre-trained model by calling the STS validation evaluator on it
ev_tsdae = evaluator(model)
print(f"The score for the TSDAE pre-trained model is: {ev_tsdae}.")

The score for the TSDAE pre-trained model is: 0.7928078382353794.


In [None]:
# Baseline: bert-base-uncased with CLS pooling, assembled the same way as the
# model above but without any further training
bert = models.Transformer('bert-base-uncased')
bert_dim = bert.get_word_embedding_dimension()
pooling = models.Pooling(bert_dim, pooling_mode='cls')
bert_model = SentenceTransformer(modules=[bert, pooling])

# Score the baseline with the same evaluator
ev_bert_base_uncased = evaluator(bert_model)
print(f"The score for the baseline model is: {ev_bert_base_uncased}.")

The score for the baseline model is: 0.3173615247822984.


In [None]:
# Score a general-purpose pretrained model (all-mpnet-base-v2) with the same
# evaluator, as a reference point for the scores above
all_model = SentenceTransformer('all-mpnet-base-v2')
ev_all_mpnet_base = evaluator(all_model)
print(f"The score for the all-mpnet-base-v2 is: {ev_all_mpnet_base}.")

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

The score for the all-mpnet-base-v2 is: 0.881089906981933.


In [None]:
# Score a second pretrained model (multi-qa-mpnet-base-dot-v1) with the same evaluator
# This model was tuned for semantic search
qa_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
ev_multi_qa_mpnet_base = evaluator(qa_model)
print(f"The score for the multi-qa-mpnet-base-dot-v1 model is: {ev_multi_qa_mpnet_base}.")

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

The score for the multi-qa-mpnet-base-dot-v1 model is: 0.7618690286554501.


## Fine-Tuning on AllNLI Dataset

### Download and Prepare the Datasets

This section is based on [sentence-transformers/examples/training/nli/training_nli_v2.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli_v2.py).

In [None]:
# Local copies of the AllNLI and STS benchmark archives
nli_dataset_path = 'data/AllNLI.tsv.gz'
sts_dataset_path = 'data/stsbenchmark.tsv.gz'

# Download each archive only if it is not on disk yet
for dataset_url, local_path in (
    ('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path),
    ('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path),
):
    if not os.path.exists(local_path):
        util.http_get(dataset_url, local_path)

  0%|          | 0.00/40.8M [00:00<?, ?B/s]

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [None]:
# Parse the dataset
def add_to_samples(sent1, sent2, label):
    """
    Record sent2 under sent1 in the module-level dict train_data.

    Each key of train_data maps to three sets, one per label
    ('contradiction', 'entailment', 'neutral'); sent2 is added to the set
    named by label. A label outside these three raises KeyError.
    """
    # Create the three empty label sets the first time sent1 is seen
    by_label = train_data.setdefault(
        sent1,
        {'contradiction': set(), 'entailment': set(), 'neutral': set()},
    )
    by_label[label].add(sent2)

# Group the training pairs by sentence: every sentence maps to the sets of
# sentences it entails, contradicts or is neutral to (filled by add_to_samples)
train_data = {}
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'train':
            sent1 = row['sentence1'].strip()
            sent2 = row['sentence2'].strip()

            add_to_samples(sent1, sent2, row['label'])
            add_to_samples(sent2, sent1, row['label'])  # Also add the opposite

# Dedicated, seeded generator: the sampled triplets are the same on every run
# and the global `random` state is left untouched
sampling_rng = random.Random(42)

# Build (anchor, positive, hard negative) triplets for every sentence that
# has at least one entailment and at least one contradiction
train_samples = []
for sent1, others in train_data.items():
    if others['entailment'] and others['contradiction']:
        # The iteration order of a set of strings is not stable across
        # interpreter runs, so sort before sampling to make the seed effective
        entailments = sorted(others['entailment'])
        contradictions = sorted(others['contradiction'])

        train_samples.append(InputExample(texts=[sent1, sampling_rng.choice(entailments), sampling_rng.choice(contradictions)]))
        train_samples.append(InputExample(texts=[sampling_rng.choice(entailments), sent1, sampling_rng.choice(contradictions)]))


In [None]:
# Number of training triplets in the fine-tuning dataset
len(train_samples)

563648

In [None]:
# Special data loader that avoids duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(
    train_samples,
    batch_size=32,  # 128 default, use smaller batch to fit on Colab
)

### Fine-Tune the TSDAE Pretrained Model

The system trains BERT (or any other transformer model, such as RoBERTa or DistilBERT) on the SNLI + MultiNLI (AllNLI) dataset
with MultipleNegativesRankingLoss. Entailments are positive pairs, and the contradictions in the AllNLI dataset are added as hard negatives.
After every 10% of the training steps, the model is evaluated on the STS benchmark dataset.

In [None]:
# Fine-tuning settings
num_epochs = 1
max_seq_length = 75
train_batch_size = 32  # The larger, the better the results (usually), but requires more GPU memory
model_name = 'output/tsdae-bert-uncased-math'  # folder of the TSDAE pre-trained model


In [None]:
# Load the local model
local_model = SentenceTransformer(model_name)

# Apply the configured token limit. Without this assignment the
# max_seq_length setting is never passed to the model and has no effect.
local_model.max_seq_length = max_seq_length

In [None]:
# Training loss for the fine-tuning stage
train_loss = losses.MultipleNegativesRankingLoss(model=local_model)

In [None]:

# Build the evaluation set from the 'dev' rows of the STS benchmark file
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    dev_samples = [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['score']) / 5.0,  # Normalize score to range 0 ... 1
        )
        for row in reader
        if row['split'] == 'dev'
    ]

# Evaluator run during fine-tuning
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=train_batch_size,
    name='sts-dev',
)


In [None]:
# Path to save the finetuned model
model_save_path = 'output/finetuned-bert-uncased-math'

# Derive the schedule from the number of batches per epoch
steps_per_epoch = len(train_dataloader)
warmup_steps = math.ceil(steps_per_epoch * num_epochs * 0.1)  # 10% of train data for warm-up
evaluation_steps = int(steps_per_epoch * 0.1)  # evaluate after every 10% of an epoch

# Time the fine-tuning run
start_time = time.time()

local_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=True,  # mixed precision; set to False if the GPU does not support FP16 operations
)

end_time = time.time()

# Report the elapsed wall-clock time
elapsed_time = end_time - start_time
print(f"Model finetuned in {elapsed_time:.2f} seconds, on a Google Colab Pro with A100 GPU & High-RAM.")


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/17614 [00:00<?, ?it/s]

Model finetuned in 2383.56 seconds, on a Google Colab Pro with A100 GPU & High-RAM.


In [None]:
# Score the finetuned model with `evaluator`, the same evaluator object applied
# to the other models, so the scores are directly comparable
ev_finetuned = evaluator(local_model)
print(f"The score for the finetuned model is: {ev_finetuned}.")

The score for the finetuned model is: 0.8460969344343326.
