# Structure


## Setup

In [4]:
# !pip install -r requirements.txt
!pip install sentence-transformers
!pip install mteb
!pip install beir
!pip install datasets

Collecting beir
  Using cached beir-2.0.0-py3-none-any.whl
Collecting pytrec-eval (from beir)
  Using cached pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting faiss-cpu (from beir)
  Using cached faiss_cpu-1.8.0.post1-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Using cached elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Using cached elasticsearch-7.9.1-py2.py3-none-any.whl (219 kB)
Using cached faiss_cpu-1.8.0.post1-cp312-cp312-win_amd64.whl (14.6 MB)
Building wheels for collected packages: pytrec-eval
  Building wheel for pytrec-eval (setup.py): started
  Building wheel for pytrec-eval (setup.py): finished with status 'error'
  Running setup.py clean for pytrec-eval
Failed to build pytrec-eval


  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [26 lines of output]
      Fetching trec_eval from https://github.com/usnistgov/trec_eval/archive/v9.0.8.tar.gz.
      !!
      
              ********************************************************************************
              Usage of dash-separated 'description-file' will not be supported in future
              versions. Please use the underscore name 'description_file' instead.
      
              By 2024-Sep-26, you need to update your project and remove deprecated calls
              or your builds will no longer be supported.
      
              See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
              ********************************************************************************
      
      !!
        opt = self.warn_dash_deprecation(opt, section)
      running bdist_wheel
      running build
      



## Matryoshka-Adaptor Implementation

### Architecture

In [13]:
import torch
import torch.nn.functional as F

# Define MatryoshkaAdaptor module - a simple MLP with skip connection
class MatryoshkaAdaptor(torch.nn.Module):
    """
    A PyTorch neural network module that adapts the output of an embedding model
    into a desired output dimension using two linear transformations with a ReLU activation in between.
    Includes a skip connection from input to output.
    """
    def __init__(self, input_output_dim, hidden_dim):
        """
        Initializes the MatryoshkaAdaptor module.
        
        Args:
            input_output_dim: An integer representing the input and output dimension of the module which are equal.
            hidden_dim: An integer representing the hidden dimension of the module.
            
        Returns:
            None
        """
        super(MatryoshkaAdaptor, self).__init__()
        # First linear layer to transform the input dimension to a hidden dimension
        self.linear1 = torch.nn.Linear(input_output_dim, hidden_dim)
        # Second linear layer to transform the hidden dimension to the output dimension which is same as input dimension
        self.linear2 = torch.nn.Linear(hidden_dim, input_output_dim)
        # Activation function to introduce non-linearity
        self.activation = torch.nn.ReLU()

    def forward(self, embedding):
        """
        Forward pass of the MatryoshkaAdaptor module.

        Args:
            embedding: A torch.Tensor of shape (batch_size, input_output_dim) representing the input embeddings.

        Returns:
            output: A torch.Tensor of shape (batch_size, input_output_dim) representing the matryoshka embeddings.
        """
        # Apply the first linear transformation followed by the activation function
        hidden_embedding = self.activation(self.linear1(embedding))
        
        # Apply the second linear transformation to get the final adapted embedding
        adapted_embedding = self.linear2(hidden_embedding)
        
        # Add the skip connection by adding the original embedding to the adapted embedding
        mat_embedding = adapted_embedding + embedding

        return mat_embedding

### Loss Functions

In [14]:
# Equation 1 in paper
def pairwise_similarity_loss(original_embeddings, matryoshka_embeddings):
    """
    Computes the pairwise similarity loss between original embeddings and matryoshka embeddings.
    
    Args:
        original_embeddings: A tensor of shape (batch_size, embedding_dim) representing the original embeddings.
        matryoshka_embeddings: A tensor of shape (batch_size, mat_embedding_dim) representing the matryoshka embeddings.
        
    Returns:
        loss: A scalar tensor representing the mean pairwise similarity loss.
    """
    
    # Normalize the embeddings along the embedding dimension to get the cosine similarity
    normalized_original_embeddings = F.normalize(original_embeddings, p=2, dim=1)
    normalized_matryoshka_embeddings = F.normalize(matryoshka_embeddings, p=2, dim=1)
    
    # Compute the cosine similarity matrices
    original_similarity_matrix = torch.matmul(normalized_original_embeddings, normalized_original_embeddings.T)
    matryoshka_similarity_matrix = torch.matmul(normalized_matryoshka_embeddings, normalized_matryoshka_embeddings.T)
    
    # Get the indices of the upper triangle of the matrices, excluding the diagonal
    batch_size = original_embeddings.size(0)
    i, j = torch.triu_indices(batch_size, batch_size, offset=1)
    
    # Compute the pairwise cosine similarities
    original_pairwise_similarities = original_similarity_matrix[i, j]
    matryoshka_pairwise_similarities = matryoshka_similarity_matrix[i, j]
    
    # Compute the absolute difference between corresponding pairwise similarities
    similarity_differences = torch.abs(original_pairwise_similarities - matryoshka_pairwise_similarities)
    
    # Sum up all the absolute differences to produce the final loss
    loss = torch.sum(similarity_differences)
    
    return loss

# Equation 2 in paper
def topk_similarity_loss(original_embeddings, matryoshka_embeddings, k=5):
    """
    Computes the top-k similarity loss between original embeddings and matryoshka embeddings.
    
    Args:
        original_embeddings: A tensor of shape (batch_size, embedding_dim) representing the original embeddings.
        matryoshka_embeddings: A tensor of shape (batch_size, mat_embedding_dim) representing the matryoshka embeddings.
        k: The number of top similarities to consider (default is 5).
        
    Returns:
        loss: A scalar tensor representing the top-k similarity loss.
    """
    
    # Normalize the embeddings along the embedding dimension to get the cosine similarity
    normalized_original_embeddings = F.normalize(original_embeddings, p=2, dim=1)
    normalized_matryoshka_embeddings = F.normalize(matryoshka_embeddings, p=2, dim=1)
    
    # Compute the cosine similarity matrices
    original_similarity_matrix = torch.matmul(normalized_original_embeddings, normalized_original_embeddings.T)
    matryoshka_similarity_matrix = torch.matmul(normalized_matryoshka_embeddings, normalized_matryoshka_embeddings.T)
    
    # Exclude self-similarity by setting the diagonal to a very low value
    batch_size = original_embeddings.size(0)
    original_similarity_matrix.fill_diagonal_(-float('inf'))
    matryoshka_similarity_matrix.fill_diagonal_(-float('inf'))
    
    # For each embedding, get the top-k similarities and their corresponding indices
    original_topk_values, _ = torch.topk(original_similarity_matrix, k, dim=1)
    matryoshka_topk_values, _ = torch.topk(matryoshka_similarity_matrix, k, dim=1)
    
    # Compute the absolute difference between the top-k similarities
    similarity_differences = torch.abs(original_topk_values - matryoshka_topk_values)
    
    # Sum up all the absolute differences to produce the final loss
    loss = torch.sum(similarity_differences)
    
    return loss


# Equation 3 in paper
def regularization_loss(original_embeddings, matryoshka_embeddings, alpha=1.0):
    """
    Computes the regularization loss to ensure the matryoshka embeddings do not deviate
    significantly from the original embeddings.
    
    Args:
        original_embeddings: A tensor of shape (batch_size, embedding_dim) representing the original embeddings.
        matryoshka_embeddings: A tensor of shape (batch_size, embedding_dim) representing the matryoshka embeddings.
        alpha: A regularization coefficient that controls the weight of the regularization term.
        
    Returns:
        loss: A scalar tensor representing the regularization loss.
    """
    # Compute the difference between original and matryoshka embeddings
    diff = original_embeddings - matryoshka_embeddings
    
    # Compute the L2 norm of the difference
    loss = torch.norm(diff, p=2, dim=1)
    
    # Return the mean loss over the batch, scaled by alpha
    return alpha * loss.mean()


# Equation 4 in paper
def unsupervised_objective_fn_loss(original_embeddings, matryoshka_embeddings, 
                                   k=5, alpha=1.0, beta=1.0):
    """
    Computes the overall unsupervised objective function loss as a combination of top-k similarity loss,
    alpha-scaled pairwise similarity loss, and beta-scaled regularization loss.
    
    Args:
        original_embeddings: A tensor of shape (batch_size, embedding_dim) representing the original embeddings.
        matryoshka_embeddings: A tensor of shape (batch_size, mat_embedding_dim) representing the matryoshka embeddings.
        k: The number of top similar embeddings to consider for the top-k similarity loss.
        alpha: A scaling factor for the pairwise similarity loss.
        beta: A scaling factor for the regularization loss.
        
    Returns:
        total_loss: A scalar tensor representing the combined unsupervised objective function loss.
    """
    # Compute the individual loss components
    topk_loss = topk_similarity_loss(original_embeddings, matryoshka_embeddings, k)
    pairwise_loss = pairwise_similarity_loss(original_embeddings, matryoshka_embeddings)
    reg_loss = regularization_loss(original_embeddings, matryoshka_embeddings, beta)
    
    # Combine the losses with the given scaling factors
    total_loss = topk_loss + alpha * pairwise_loss + beta * reg_loss
    
    return total_loss

## Training of Adaptor

### Unsupervised Implementation

#### Prepare datasets

We will use BEIR's NFCorpus and stick to the same train-test split pairs as in the paper

In [15]:
from datasets import load_dataset
from torch.utils.data import DataLoader, random_split
import torch

# Load the dataset
corpus_ds = load_dataset("BeIR/nfcorpus", "corpus")

# Access the 'corpus' dataset
dataset = corpus_ds['corpus']['text']

# Define the split sizes
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size

# Split the dataset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Define batch size
batch_size = 32  # Adjust this as needed

# Create DataLoader for train and test datasets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Example of iterating through the dataloaders
for batch in train_dataloader:
    print(batch)
    break  # Just for demonstration, remove this in actual training

for batch in test_dataloader:
    print(batch)
    break  # Just for demonstration, remove this in actual testing

['Background: Despite safety reports of the artificial sweetener aspartame, health-related concerns remain. Objective: We prospectively evaluated whether the consumption of aspartame- and sugar-containing soda is associated with risk of hematopoetic cancers. Design: We repeatedly assessed diet in the Nurses’ Health Study (NHS) and Health Professionals Follow-Up Study (HPFS). Over 22 y, we identified 1324 non-Hodgkin lymphomas (NHLs), 285 multiple myelomas, and 339 leukemias. We calculated incidence RRs and 95% CIs by using Cox proportional hazards models. Results: When the 2 cohorts were combined, there was no significant association between soda intake and risks of NHL and multiple myeloma. However, in men, ≥1 daily serving of diet soda increased risks of NHL (RR: 1.31; 95% CI: 1.01, 1.72) and multiple myeloma (RR: 2.02; 95% CI: 1.20, 3.40) in comparison with men who did not consume diet soda. We observed no increased risks of NHL and multiple myeloma in women. We also observed an une

#### Prepare Embedding Model and Matryoshka-Adaptor

In [18]:
from sentence_transformers import SentenceTransformer

# Embedding Model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Matryoshka-Adaptor
input_output_dim = model.get_sentence_embedding_dimension() # Embedding dimension for model (d in paper)
hidden_dim = input_output_dim # Let hidden layer dimension equal the embedding model dimension
mat_adaptor = MatryoshkaAdaptor(input_output_dim, hidden_dim)

#### Train Matryoshka-Adaptor

In [24]:
import torch
from torch.optim import Adam

def train(model, mat_adaptor, train_loader, loss_fn, kwargs):
    """
    Trains the MatryoshkaAdaptor module using the provided training data.

    Args:
        model: A SentenceTransformer model to generate embeddings.
        mat_adaptor: A MatryoshkaAdaptor module to adapt the embeddings.
        train_loader: A DataLoader object for the training dataset.
        loss_fn: A loss function to compute the loss between original and matryoshka embeddings.
        kwargs: A dictionary containing hyperparameters for training.

    Returns:
        None
    """

    # Unpack the hyperparameters
    epochs = kwargs.get('epochs', 5)
    lr = kwargs.get('lr', 1e-3)
    k = kwargs.get('k', 5) # Top-k similarity loss
    m = kwargs.get('m', 128) # Matryoshka embedding dimension
    alpha = kwargs.get('alpha', 1.0) # Pairwise similarity loss scaling factor (alpha in paper)
    beta = kwargs.get('beta', 1.0)  # Regularization loss scaling factor (beta in paper)


    # Define an optimizer for the MatryoshkaAdaptor parameters
    optimizer = Adam(mat_adaptor.parameters(), lr=lr)
    
    # Set MatryoshkaAdaptor to training mode
    mat_adaptor.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Generate embeddings for both texts
            ori_embeddings = model.encode(batch, convert_to_tensor=True)  # model batched embeddings

            # Forward pass embedding through the MatryoshkaAdaptor
            mat_embeddings = mat_adaptor(ori_embeddings)

            # Compute loss
            loss = loss_fn(ori_embeddings, mat_embeddings, k=k, alpha=alpha, beta=beta)

            # Backpropagation
            optimizer.zero_grad()  # Clear previous gradients
            loss.backward()        # Compute gradients
            optimizer.step()        # Update weights

            total_loss += loss.item()
            
            break

        # Print average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

In [25]:
hyperparams = {
    'epochs': 5,
    'lr': 1e-3,
    'k': 5,  # Top-k similarity loss
    'm': 128,  # Matryoshka embedding dimension
    'alpha': 1.0,  # Pairwise similarity loss scaling factor (alpha in paper)
    'beta': 1.0  # Regularization loss scaling factor (beta in paper)
}

train(model, mat_adaptor, train_dataloader, unsupervised_objective_fn_loss, hyperparams)

Epoch 1/5, Loss: 1.7495
Epoch 2/5, Loss: 1.0474
Epoch 3/5, Loss: 0.2633
Epoch 4/5, Loss: 0.3473
Epoch 5/5, Loss: 0.3356


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

## BEIR Evaluation

### Unmodified Model Performance for all-MiniLM-L6-v2

> Add blockquote



In [None]:
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model_name = 'all-MiniLM-L6-v2_BASE'

# Define the BEIR tasks you want to evaluate on
tasks = mteb.get_tasks(tasks=["NFCorpus"])

# Evaluate the model on the benchmark
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"results/{model_name}")
