In [2]:
!pip install openai
!pip install sentence-transformers
!pip install -U langchain-openai

In [7]:
!pip install langchain_cohere

In [8]:
!pip install transformers

In [None]:
import getpass
import os

# Ask for each secret with its own prompt so the two keys cannot be entered in
# the wrong order; getpass keeps the typed value out of the notebook output.
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API key: ")
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

#### We will evaluate different embedding models by calculating the cosine similarity of pairs of sentences. The evaluation will then be done using statistical metrics such as Spearman correlation, Pearson correlation, and Mean Squared Error (MSE).

### The embedding models are:
#### 1. OpenAI embeddings, default, ada
#### 2. Cohere embeddings, default
#### 3. HuggingFace BAAI/bge-base-en-v1.5
#### 4. Sentence transformers miniLM
#### 5. Sentence transformers Matryoshka embeddings


In [21]:
import pandas as pd
import numpy as np
import cohere
import torch
from sentence_transformers import util
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from langchain_cohere import CohereEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from transformers import AutoModel, AutoTokenizer
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# Models Initialization

In [20]:
# Identifiers of the five embedding models compared in this notebook.
OPENAI_MODEL_NAME = "text-embedding-ada-002"
COHERE_MODEL_NAME = "embed-english-light-v3.0"
BGE_MODEL_NAME = "BAAI/bge-base-en-v1.5"
MINILM_MODEL_NAME = "all-MiniLM-L6-v2"
MATRYOSHKA_MODEL_NAME = "tomaarsen/mpnet-base-nli-matryoshka"

# One handle per model: two hosted APIs, one raw transformers model,
# and two sentence-transformers models.
model_openai = OpenAIEmbeddings(model=OPENAI_MODEL_NAME)
model_cohere = CohereEmbeddings(model=COHERE_MODEL_NAME)
model_bge = AutoModel.from_pretrained(BGE_MODEL_NAME)
model_minilm = SentenceTransformer(MINILM_MODEL_NAME)
model_matryoshka = SentenceTransformer(MATRYOSHKA_MODEL_NAME)

# Matryoshka Embeddings (Example)



In [63]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Load the Matryoshka sentence-embedding model used by the truncation example below.
model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka")

In [None]:
# Number of leading embedding components to keep.
matryoshka_dim = 64
texts = [
    "The weather is so nice!",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# Encode at full width first, then slice the last axis down to `matryoshka_dim`.
full_embeddings = model.encode(texts)
embeddings = full_embeddings[..., :matryoshka_dim]
print(embeddings.shape)

# Compare the first sentence against each of the remaining ones.
anchor_embedding, other_embeddings = embeddings[0], embeddings[1:]
similarities = cos_sim(anchor_embedding, other_embeddings)
print(similarities)

(3, 64)
tensor([[0.8910, 0.1337]])


# MiniLM and Matryoshka embedding model (Example)

In [65]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Three example sentence pairs with their reference similarity scores
# (values in these examples fall in the 0-5 range).
data = pd.DataFrame({
    'sentence1': [
        "A train is moving.",
        "A group of men play soccer on the beach.",
        "One woman is measuring another woman's ankle."
    ],
    'sentence2': [
        "A man is doing yoga.",
        "A group of boys are playing soccer on the beach.",
        "A woman measures another woman's ankle."
    ],
    'score': [0, 3.6, 5.0]
})

model = SentenceTransformer("all-MiniLM-L6-v2")
model2 = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka")

sentences1 = data['sentence1'].tolist()
sentences2 = data['sentence2'].tolist()

# util.cos_sim returns the full pairwise matrix; its diagonal holds the
# similarity of row i in `sentence1` with row i in `sentence2`.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
cosine_scores = util.cos_sim(embeddings1, embeddings2).diagonal()

# Optional Matryoshka truncation: slice both tensors with
# `[..., :matryoshka_dim]` (e.g. matryoshka_dim = 256) before calling cos_sim.
embeddings21 = model2.encode(sentences1, convert_to_tensor=True)
embeddings22 = model2.encode(sentences2, convert_to_tensor=True)
cosine_scores2 = util.cos_sim(embeddings21, embeddings22).diagonal()

# Rescale |cosine| to the same 0-5 range as the reference scores.
transformed_scores = 5 * abs(cosine_scores)
transformed_scores2 = 5 * abs(cosine_scores2)

# Move every tensor to the CPU before converting: Tensor.numpy() raises for
# tensors that live on an accelerator, which is where encode() places them
# when the model runs on a GPU.
data['cos_sim_MiniLM'] = cosine_scores.cpu().numpy()
data['MiniLM_Score'] = transformed_scores.cpu().numpy()

data['cos_sim_Matryoshka'] = cosine_scores2.cpu().numpy()
data['Matryoshka_Score'] = transformed_scores2.cpu().numpy()


data.head()


Unnamed: 0,sentence1,sentence2,score,cos_sim_MiniLM,MiniLM_Score,cos_sim_Matryoshka,Matryoshka_Score
0,A train is moving.,A man is doing yoga.,0.0,0.072262,0.361308,0.132224,0.661121
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.788625,3.943125,0.826425,4.132123
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,0.946494,4.732468,0.9712,4.856


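Note: Cosine similarity ranges from -1 to 1 in general. For the sentence pairs shown here every value is positive, so taking the absolute value before scaling to the 0–5 range leaves the scores unchanged.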



# Cohere Embeddings (Example)

In [None]:
from langchain_cohere import CohereEmbeddings

In [24]:
import pandas as pd
import numpy as np
import cohere
import torch
from sentence_transformers import util

import os

# Build the client from the key collected into the environment earlier in the
# notebook instead of a hard-coded placeholder string. A missing variable
# raises KeyError here rather than failing later with an authentication error.
co = cohere.Client(os.environ["COHERE_API_KEY"])

# Three example sentence pairs with their reference similarity scores.
data = pd.DataFrame({
    'sentence1': [
        "A man is playing a harp.",
        "A group of men play soccer on the beach.",
        "One woman is measuring another woman's ankle."
    ],
    'sentence2': [
        "A man is playing a keyboard.",
        "A group of boys are playing soccer on the beach.",
        "A woman measures another woman's ankle."
    ],
    'score': [1.5, 3.6, 5.0]
})

def get_embeddings(sentences):
    """Embed `sentences` with the module-level Cohere client `co`.

    Args:
        sentences: list of strings, passed as `texts` to `co.embed`.

    Returns:
        A torch tensor built from `response.embeddings` via a NumPy array.
    """
    cohere_response = co.embed(texts=sentences, model='large')
    return torch.tensor(np.array(cohere_response.embeddings))

# Embed each side of the pairs in one request per column.
embeddings1 = get_embeddings(data['sentence1'].tolist())
embeddings2 = get_embeddings(data['sentence2'].tolist())

# Only the diagonal of the pairwise matrix is needed: row i against row i.
pairwise_similarity = util.cos_sim(embeddings1, embeddings2)
cosine_scores = pairwise_similarity.diagonal()

# Rescale |cosine| by 5 to compare against the `score` column.
transformed_scores = cosine_scores.abs() * 5

data['cos_sim_Cohere'] = cosine_scores.numpy()
data['Cohere_Score'] = transformed_scores.numpy()

data.head()


Unnamed: 0,sentence1,sentence2,score,cos_sim_Cohere,Cohere_Score
0,A man is playing a harp.,A man is playing a keyboard.,1.5,0.612934,3.064672
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.784461,3.922305
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,0.98314,4.915699


In [None]:
# transformed_scores = 5 * (np.exp(cosine_scores) - np.exp(-1)) / (np.exp(1) - np.exp(-1))

# HF BAAI embeddings (Example)

In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

# Example pairs; the first pair is deliberately unrelated (reference score 0).
first_sentences = [
    "A man is playing a harp.",
    "A group of men play soccer on the beach.",
    "One woman is measuring another woman's ankle."
]
second_sentences = [
    "Einstein went to the moon.",
    "A group of boys are playing soccer on the beach.",
    "A woman measures another woman's ankle."
]
reference_scores = [0, 3.6, 5.0]

data = pd.DataFrame({
    'sentence1': first_sentences,
    'sentence2': second_sentences,
    'score': reference_scores
})

# Tokenizer and model come from the same checkpoint.
bge_checkpoint = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(bge_checkpoint)
model_hf = AutoModel.from_pretrained(bge_checkpoint)

def encode(sentences):
    """Embed `sentences` with the module-level `tokenizer` and `model_hf`.

    Args:
        sentences: list of strings to tokenize as one padded batch
            (truncated to at most 128 tokens).

    Returns:
        The last hidden state at sequence position 0 (the CLS token) for
        every sentence in the batch.
    """
    batch = tokenizer(
        sentences,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model_hf(**batch)
    return outputs.last_hidden_state[:, 0, :]

hf_embeddings1 = encode(data['sentence1'].tolist())
hf_embeddings2 = encode(data['sentence2'].tolist())

# Diagonal of the pairwise matrix = similarity of row i with row i.
hf_cosine_scores = util.cos_sim(hf_embeddings1, hf_embeddings2).diagonal()

# Rescale |cosine| by 5 to compare against the `score` column.
hf_transformed_scores = 5 * abs(hf_cosine_scores)

# Store plain NumPy arrays in both columns; assigning the raw tensor relied on
# implicit conversion and fails for tensors that are not on the CPU.
data['cos_sim_HF_BAAI'] = hf_cosine_scores.cpu().numpy()
data['HF_BAAI_Score'] = hf_transformed_scores.cpu().numpy()

data.head()


Unnamed: 0,sentence1,sentence2,score,cos_sim_HF_BAAI,HF_BAAI_Score
0,A man is playing a harp.,Einstein went to the moon.,0.0,0.414691,2.073454
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.870663,4.353315
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,0.969653,4.848265


# OpenAI Embeddings (Example)

In [45]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embed one meaningful sentence and one string of random characters
# with the default OpenAI embedding model.
embeddings = OpenAIEmbeddings()
sentence1, sentence2 = (
    embeddings.embed_query(text)
    for text in ("A train moving like a dog", "asdfgafgafa")
)

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, so each vector is wrapped in a list;
# the single entry of the resulting 1x1 matrix is the similarity.
similarity_matrix = cosine_similarity([sentence1], [sentence2])
query_sentence1_similarity = similarity_matrix[0][0]

query_sentence1_similarity

(0.7657231319705476,)

# Evaluation of embedding models

In [54]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import cohere
from langchain_openai.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

# for HF BAAI embedding model
def encode(sentences):
    """Return CLS-token embeddings for `sentences` from the HF BAAI model.

    Reads the module-level `tokenizer` and `model_hf`.

    Args:
        sentences: list of strings, tokenized as one padded batch and
            truncated to at most 128 tokens.

    Returns:
        `last_hidden_state[:, 0, :]` of the model output.
    """
    tokenized = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    with torch.no_grad():
        hidden = model_hf(**tokenized).last_hidden_state
    # Sequence position 0 carries the CLS token.
    return hidden[:, 0, :]

# for Cohere embedding model
def get_embeddings(sentences):
    """Return Cohere embeddings for `sentences` as a torch tensor.

    Reads the module-level Cohere client `co`.

    Args:
        sentences: list of strings sent as `texts` to `co.embed`.

    Returns:
        torch.Tensor created from the NumPy array of `response.embeddings`.
    """
    vectors = np.array(co.embed(model='large', texts=sentences).embeddings)
    return torch.tensor(vectors)

def _load_embedder(model_name):
    """Build the embedding callable for one supported model name.

    Every returned callable takes a single sentence (str) and returns a torch
    tensor that `util.cos_sim` accepts.

    Raises:
        ValueError: if `model_name` is not one of the supported names.
        KeyError: if 'Cohere' is requested and COHERE_API_KEY is not set.
    """
    if model_name == 'MiniLM':
        minilm = SentenceTransformer("all-MiniLM-L6-v2")
        return lambda sentence: minilm.encode([sentence], convert_to_tensor=True)

    if model_name == 'Matryoshka':
        matryoshka = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka")
        return lambda sentence: matryoshka.encode([sentence], convert_to_tensor=True)

    if model_name == 'Cohere':
        import os

        # Use the key stored in the environment, not a placeholder string.
        client = cohere.Client(os.environ["COHERE_API_KEY"])

        def embed_with_cohere(sentence):
            response = client.embed(model='large', texts=[sentence])
            return torch.tensor(np.array(response.embeddings))

        return embed_with_cohere

    if model_name == 'HF_BAAI':
        hf_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
        hf_model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")

        def embed_with_bge(sentence):
            encoded_input = hf_tokenizer(
                [sentence], padding=True, truncation=True,
                return_tensors='pt', max_length=128,
            )
            with torch.no_grad():
                model_output = hf_model(**encoded_input)
            # Take the embedding of the CLS token (sequence position 0).
            return model_output.last_hidden_state[:, 0, :]

        return embed_with_bge

    if model_name == 'OpenAI':
        openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
        # embed_query returns a plain list of floats; wrap it in a tensor.
        return lambda sentence: torch.tensor(openai_embeddings.embed_query(sentence))

    raise ValueError(f"Unknown embedding model: {model_name!r}")


def evaluate_embedding_models(models, pair_sentences):
    """Score every sentence pair with every requested embedding model.

    Args:
        models: list of model names; supported values are 'MiniLM',
            'Matryoshka', 'Cohere', 'HF_BAAI' and 'OpenAI'.
        pair_sentences: DataFrame with the columns 'sentence1', 'sentence2'
            and 'score'.

    Returns:
        DataFrame with 'Sentence1', 'Sentence2', 'Score', then one
        '<model>_Cosine_Similarity' column per model followed by one
        '<model>_Transformed_Score' column per model (5 * |cosine|).

    Raises:
        ValueError: if `models` contains an unsupported name.
    """
    # Load each model/client exactly once, before the row loop. The objects
    # are owned by this function, so it does not depend on globals left
    # behind by earlier cells. Unknown names fail here, before any
    # embedding request is made.
    embedders = {model_name: _load_embedder(model_name) for model_name in models}

    results = []

    for _, row in pair_sentences.iterrows():
        sentence1 = row['sentence1']
        sentence2 = row['sentence2']
        score = row['score']

        model_scores = []
        model_transformed_scores = []

        for model_name in models:
            embed = embedders[model_name]
            embeddings1 = embed(sentence1)
            embeddings2 = embed(sentence2)

            # cos_sim yields a 1x1 matrix for a single pair; take its only entry.
            cosine_score = util.cos_sim(embeddings1, embeddings2).diagonal()[0]
            transformed_score = 5 * abs(cosine_score)

            model_scores.append(cosine_score.item())
            model_transformed_scores.append(transformed_score.item())

        result_row = [sentence1, sentence2, score] + model_scores + model_transformed_scores
        results.append(result_row)

    columns = (
        ['Sentence1', 'Sentence2', 'Score']
        + [f'{model}_Cosine_Similarity' for model in models]
        + [f'{model}_Transformed_Score' for model in models]
    )
    return pd.DataFrame(results, columns=columns)

# Examples from STS-B dataset
stsb_first_sentences = [
    "A train is moving.",
    "A group of men play soccer on the beach.",
    "One woman is measuring another woman's ankle.",
    "A man is cutting up a cucumber.",
    "A woman is cutting tofu."
]
stsb_second_sentences = [
    "A man is doing yoga.",
    "A group of boys are playing soccer on the beach.",
    "A woman measures another woman's ankle.",
    "A man is slicing a cucumber.",
    "A woman is cutting an onion."
]
stsb_scores = [0, 3.6, 5.0, 4.2, 2.4]

# One row per sentence pair, in the column layout evaluate_embedding_models reads.
pair_sentences = pd.DataFrame({
    'sentence1': stsb_first_sentences,
    'sentence2': stsb_second_sentences,
    'score': stsb_scores
})

# Names understood by evaluate_embedding_models; the order fixes the column order.
models = ['MiniLM', 'Matryoshka', 'Cohere', 'HF_BAAI', 'OpenAI']

evaluation_results = evaluate_embedding_models(models, pair_sentences)
evaluation_results.head()


Unnamed: 0,Sentence1,Sentence2,Score,MiniLM_Cosine_Similarity,Matryoshka_Cosine_Similarity,Cohere_Cosine_Similarity,HF_BAAI_Cosine_Similarity,OpenAI_Cosine_Similarity,MiniLM_Transformed_Score,Matryoshka_Transformed_Score,Cohere_Transformed_Score,HF_BAAI_Transformed_Score,OpenAI_Transformed_Score
0,A train is moving.,A man is doing yoga.,0.0,0.072262,0.132224,0.109903,0.468998,0.799453,0.361308,0.661121,0.549513,2.344988,3.997263
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.788625,0.826425,0.7845,0.870663,0.964692,3.943126,4.132123,3.922502,4.353315,4.823462
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,0.946493,0.9712,0.98315,0.969653,0.990364,4.732467,4.856001,4.91575,4.848265,4.951821
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2,0.882043,0.926494,0.951069,0.950997,0.987107,4.410216,4.632472,4.755343,4.754987,4.935534
4,A woman is cutting tofu.,A woman is cutting an onion.,2.4,0.48018,0.50029,0.501915,0.71169,0.90047,2.400901,2.50145,2.509576,3.558448,4.502348


In [60]:
# Persist the per-pair similarity table (header row, no index column).
scores_csv_path = 'embedding_models_scores.csv'
evaluation_results.to_csv(scores_csv_path, header=True, index=False)


In [57]:
# Column-name marker -> list that collects that model's transformed scores.
# Insertion order matters: a column goes to the first marker it contains.
score_buckets = {
    'MiniLM_Transformed_Score': [],
    'Matryoshka_Transformed_Score': [],
    'Cohere_Transformed_Score': [],
    'HF_BAAI_Transformed_Score': [],
    'OpenAI_Transformed_Score': [],
}

for column in evaluation_results.columns:
    for marker, bucket in score_buckets.items():
        if marker in column:
            bucket.extend(evaluation_results[column])
            break

MiniLM_transformed_scores = score_buckets['MiniLM_Transformed_Score']
Matryoshka_transformed_scores = score_buckets['Matryoshka_Transformed_Score']
Cohere_transformed_scores = score_buckets['Cohere_Transformed_Score']
HF_BAAI_transformed_scores = score_buckets['HF_BAAI_Transformed_Score']
OpenAI_transformed_scores = score_buckets['OpenAI_Transformed_Score']

MiniLM_transformed_scores

[0.36130785942077637,
 3.9431257247924805,
 4.732467174530029,
 4.410216331481934,
 2.4009008407592773]

In [59]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error

# Human-annotated scores, read from the evaluated pairs themselves so they
# cannot drift out of sync with the rows the models were scored on.
score = evaluation_results['Score'].tolist()


def calculate_metrics(human_scores, model_scores):
    """Compare model scores against human scores.

    Args:
        human_scores: sequence of reference scores.
        model_scores: sequence of model scores, aligned with `human_scores`.

    Returns:
        Tuple `(pearson_corr, spearman_corr, mse)`; the p-values returned by
        `pearsonr` and `spearmanr` are discarded.
    """
    pearson_corr = pearsonr(human_scores, model_scores)[0]
    spearman_corr = spearmanr(human_scores, model_scores)[0]
    mse = mean_squared_error(human_scores, model_scores)
    return pearson_corr, spearman_corr, mse

# Transformed-score lists in the same order as the 'Model' column below.
per_model_scores = (
    MiniLM_transformed_scores,
    Matryoshka_transformed_scores,
    Cohere_transformed_scores,
    HF_BAAI_transformed_scores,
    OpenAI_transformed_scores,
)

(miniLM_metrics, matryoshka_metrics, cohere_metrics,
 hf_baai_metrics, openai_metrics) = [
    calculate_metrics(score, model_scores) for model_scores in per_model_scores
]

all_metrics = [miniLM_metrics, matryoshka_metrics, cohere_metrics, hf_baai_metrics, openai_metrics]

# Each metrics tuple is (pearson, spearman, mse); transpose into one list per metric.
pearson_column, spearman_column, mse_column = (
    list(values) for values in zip(*all_metrics)
)

metrics_df = pd.DataFrame({
    'Model': ['MiniLM', 'Matryoshka', 'Cohere', 'HF_BAAI', 'OpenAI'],
    'Pearson Correlation': pearson_column,
    'Spearman Correlation': spearman_column,
    'Mean Squared Error': mse_column
})

print(metrics_df)


        Model  Pearson Correlation  Spearman Correlation  Mean Squared Error
0      MiniLM             0.992813                   1.0            0.072809
1  Matryoshka             0.988837                   1.0            0.187659
2      Cohere             0.991153                   1.0            0.146696
3     HF_BAAI             0.992190                   1.0            1.547897
4      OpenAI             0.987199                   1.0            4.487634


In [61]:
# Persist the per-model metric summary (header row, no index column).
metrics_csv_path = 'embedding_models_evaluation.csv'
metrics_df.to_csv(metrics_csv_path, header=True, index=False)


### 1. Pearson Correlation Coefficient:
- Pearson correlation measures the linear correlation between two variables. It ranges from -1 to 1, where 1 indicates a perfect positive linear correlation, -1 indicates a perfect negative linear correlation, and 0 indicates no linear correlation.
- **Inference**:
    - All models show high Pearson correlation coefficients, ranging from approximately 0.987 to 0.993.
    - The high positive values suggest a strong positive linear relationship between the model-predicted scores and the human-annotated scores.
    - This indicates that the model predictions tend to increase or decrease linearly with the human scores.

### 2. Spearman Correlation Coefficient:
- Spearman correlation measures the strength and direction of association between the ranked values of two variables. It assesses how well the relationship between two variables can be described using a monotonic function.
- **Inference**:
    - All models have a Spearman correlation coefficient of 1.0, indicating perfect rank-order correlation between the model-predicted scores and the human scores.
    - This suggests that the models rank the sentence pairs in the same order as the human annotators, regardless of the magnitude of the scores.

### 3. Mean Squared Error (MSE):
- Mean Squared Error is a measure of the average squared difference between the actual values and the predicted values. It quantifies the average magnitude of the errors between predicted and actual values.
- **Inference**:
    - Lower MSE values indicate better model performance, as they represent smaller discrepancies between predicted and actual scores.
    - MiniLM has the lowest MSE (0.072809), followed by Cohere (0.146696), Matryoshka (0.187659), and HF_BAAI (1.547897).
    - OpenAI has the highest MSE (4.487634), indicating larger discrepancies between its predicted scores and the actual human scores.

## Conclusions:
- All models exhibit strong correlations with the human-annotated scores, suggesting that they capture semantic similarity effectively.
- The high Spearman correlation coefficients indicate that the models rank the sentence pairs in the same order as humans, demonstrating robustness in capturing relative similarities.
- MiniLM performs the best overall, showing the highest Pearson correlation and the lowest MSE.
- Cohere and Matryoshka also perform well, with high correlation coefficients and low MSE values; HF_BAAI correlates just as strongly but has a noticeably higher MSE (1.547897).
- OpenAI lags behind the other models, with the highest MSE, indicating less precise prediction of similarity scores.



Notes:
* OpenAIEmbeddings, openai
* "BAAI/bge-base-en-v1.5", huggingface
* tomaarsen/mpnet-base-nli-matryoshka, sentence transformers, 768 dim
* model = SentenceTransformer("all-MiniLM-L6-v2"), 384 dim
* cohere embedding model
* https://www.sbert.net/docs/pretrained_models.html



