In [1]:
import json
import os
import random
from time import time
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import sentence_transformers
import torch
from scipy.spatial.distance import pdist, squareform
from sentence_transformers.quantization import quantize_embeddings
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): names such as SentenceTransformer, PCA, models and
# WhoIsWhoDataset are not imported explicitly anywhere in this notebook, so
# they presumably arrive through this wildcard import. It is kept last so it
# still wins on any name clash, exactly as before.
from src.pipeline.transformer_dim_reduction import *

  from tqdm.autonotebook import tqdm, trange


In [2]:
def _compare_performance(emb1, emb2):
    """Summarise how much two embedding sets disagree on pairwise cosine similarity.

    A cosine-similarity matrix is computed for each input on its own, and the
    element-wise absolute difference between the two matrices is reduced to
    its mean and standard deviation.

    Args:
        emb1: Embeddings passed to ``cosine_similarity`` as the reference set.
        emb2: Embeddings passed to ``cosine_similarity`` as the comparison set.
            Presumably the same samples in the same order as ``emb1``, since
            the two similarity matrices are subtracted element-wise.

    Returns:
        A ``(mean, std)`` tuple of the absolute differences, taken over every
        entry of the difference matrix (no entries are masked out).
    """
    reference_sim = cosine_similarity(emb1)
    candidate_sim = cosine_similarity(emb2)
    abs_delta = np.abs(candidate_sim - reference_sim)
    return abs_delta.mean(), abs_delta.std()


def edr_eval(train, full_emb, new_dimension: int, model_name: str):
    """Append a PCA-initialised projection layer to a model, evaluate it and save it.

    The function loads ``model_name``, fits a PCA with ``new_dimension``
    components on ``full_emb``, copies the components into a bias-free dense
    layer with identity activation, registers that layer on the model under
    the name ``"dense"``, re-encodes ``train`` with the extended model, prints
    the mean/std of the absolute cosine-similarity differences against
    ``full_emb``, and saves the extended model below ``data/models``.

    Args:
        train: Inputs handed to ``model.encode``. Presumably the same samples,
            in the same order, that produced ``full_emb`` — the comparison is
            element-wise on the similarity matrices.
        full_emb: Reference embeddings; the PCA is fitted on them. Their
            feature count is expected to equal the model's embedding
            dimension, because the PCA components become the layer weight.
        new_dimension: Number of PCA components, i.e. the output size of the
            added layer.
        model_name: Model identifier passed to ``SentenceTransformer``. Only
            the part after the last ``/`` is used for the save directory.

    Returns:
        None. Results are printed and the model is written to
        ``data/models/<short name>-<new_dimension>dim``.
    """
    # Use the GPU when one is visible, otherwise fall back to CPU instead of
    # failing while loading the model.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load a fresh copy of the base model; the projection layer is added to it.
    print("Generating embeddings ...")
    model = SentenceTransformer(
        model_name,
        device=device
    )

    # PCA on train embeddings
    print("Performing PCA on train embeddings ...")
    pca = PCA(n_components=new_dimension)
    pca.fit(full_emb)
    pca_comp = np.asarray(pca.components_)

    # Add a dense layer to the model
    print("Adding dense layer to the model ...")
    dense = models.Dense(
        in_features=model.get_sentence_embedding_dimension(),
        out_features=new_dimension,
        bias=False,
        activation_function=torch.nn.Identity(),
    )
    # Keep the dtype the layer was created with: PCA components inherit the
    # dtype of ``full_emb`` and a float64 weight cannot be multiplied with the
    # model's activations.
    # NOTE(review): only the component matrix is applied (bias=False), so the
    # mean-centering that ``PCA.transform`` performs is not reproduced here.
    dense.linear.weight = torch.nn.Parameter(
        torch.tensor(pca_comp, dtype=dense.linear.weight.dtype)
    )
    model.add_module("dense", dense)

    # Evaluate the model with the reduced embedding size
    print(f"Model with {new_dimension} dimensions:")
    red_emb = model.encode(train, convert_to_numpy=True)
    mean_diff, std_diff = _compare_performance(full_emb, red_emb)
    print(f"Mean difference: {mean_diff}, Std difference: {std_diff}")

    # Store the model on disk; split("/")[-1] already returns the whole name
    # when it contains no "/".
    short_name = model_name.split("/")[-1]
    os.makedirs("data/models", exist_ok=True)
    model.save(f"data/models/{short_name}-{new_dimension}dim")


def reduce_transformer_dim(
        base_model_name: str,
        new_dimension: int = 32,
        max_samples: Optional[int] = 10000,
        seed: Optional[int] = None,
):
    """Embed a sample of abstracts with the full model and build a reduced model.

    Abstracts are read from ``WhoIsWhoDataset.parse_data()``, shuffled, and
    the first 80 % (capped at ``max_samples``) are embedded with
    ``base_model_name``. Those embeddings and texts are then handed to
    ``edr_eval``, which fits the PCA layer, reports the similarity drift and
    saves the reduced model.

    Args:
        base_model_name: Model identifier passed to ``SentenceTransformer``
            and forwarded to ``edr_eval``.
        new_dimension: Target embedding size forwarded to ``edr_eval``.
        max_samples: Upper bound on the number of training abstracts;
            ``None`` disables the cap.
        seed: When given, the shuffle uses a dedicated ``random.Random(seed)``
            so the sample is reproducible. ``None`` keeps the previous
            behaviour of shuffling with the global random state.

    Raises:
        ValueError: If no training abstracts remain after splitting.
    """
    print("Adding a dense layer to the transformer model to reduce the embedding dimension ...")

    # Parse datasets
    # NOTE(review): assumes every record has a usable 'abstract' string —
    # confirm against WhoIsWhoDataset.parse_data().
    data = WhoIsWhoDataset.parse_data()
    abstracts = [paper_info['abstract'] for paper_info in data.values()]

    # Only the training share is used below; the former test/valid slices were
    # computed but never read, so they are no longer materialised.
    print("Splitting data into train, test, valid ...")
    if seed is None:
        random.shuffle(abstracts)
    else:
        random.Random(seed).shuffle(abstracts)
    train_size = int(0.8 * len(abstracts))
    train = abstracts[:train_size][:max_samples]
    if not train:
        raise ValueError("No training abstracts available to embed.")

    # Use the GPU when one is visible, otherwise fall back to CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Embed train data using full model for comparison
    print('Embedding data using full model ...')
    full_model = sentence_transformers.SentenceTransformer(
        base_model_name,
        device=device
    )
    start = time()
    full_emb = full_model.encode(train, convert_to_numpy=True)
    print(f"Full model embedding time: {time() - start}s")

    # Reduce the dimensionality of the embeddings, evaluate the performance compared to the full model
    edr_eval(
        train,
        full_emb,
        new_dimension=new_dimension,
        model_name=base_model_name
    )


In [3]:
# Run the whole reduction pipeline for the MiniLM base model; progress and the
# similarity drift are printed below.
reduce_transformer_dim(base_model_name='sentence-transformers/all-MiniLM-L6-v2')

Adding a dense layer to the transformer model to reduce the embedding dimension ...
Splitting data into train, test, valid ...
Embedding data using full model ...




Full model embedding time: 11.061969995498657s
Generating embeddings ...
Performing PCA on train embeddings ...
Adding dense layer to the model ...
Model with 32 dimensions:
Mean difference: 0.06308086216449738, Std difference: 0.06814222037792206
