In [1]:
import torch
import pickle as pkl
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Question:
    question: str
    context_index: int
    embedding: torch.Tensor
    transformed_embedding: torch.Tensor

@dataclass
class Context:
    context: str
    context_index: int
    embedding: torch.Tensor
    transformed_embedding: torch.Tensor



@dataclass
class EmbeddedQuestion(Question):
    embedding: torch.Tensor

@dataclass
class EmbeddedContext(Context):
    embedding: torch.Tensor

    def __hash__(self) -> int:
        return hash(self.context_index)

    def __eq__(self, value: object) -> bool:
        return self.context_index == value.context_index

@dataclass
class DataCollection:
    questions: list[EmbeddedQuestion]
    contexts: list[Context]
    metadata: dict

In [20]:
def get_most_relevant_contexts(question: list[Question], n_results, use_transformed = True, index_context_map: dict[int, Context] = {}, only_query=False):
    res = []
    contexts: list[Context] = []
    for q in question:
        if index_context_map[q.context_index] not in contexts:
            contexts.append(index_context_map[q.context_index])
    if use_transformed:
        qs = [q.transformed_embedding.numpy() for q in question]
        if not only_query:
            cs = [c.transformed_embedding.numpy() for c in contexts]
        else:
            cs = cs = [c.embedding.numpy() for c in contexts]
    else:
        qs = [q.embedding.numpy() for q in question]
        cs = [c.embedding.numpy() for c in contexts]
    sims = cosine_similarity(np.array(qs), np.array(cs))
    for sim in sims:
        indices = sim.argsort()[-n_results:]
        temp = []
        for index in indices:
            temp.append(contexts[index])
        res.append(temp)
    return res

def get_correct_top_n(questions, index_context_map, n_result, use_transformed, only_query=False):
    eval = []
    most_relevant = get_most_relevant_contexts(questions, n_result, use_transformed, index_context_map, only_query)
    for question, contexts in zip(questions, most_relevant):
        eval.append(index_context_map[question.context_index] in contexts)
    return np.mean(eval)

def get_avg_correct_similarity(questions: list[EmbeddedQuestion], index_context_map, use_transformed=True, only_query=False):
    sims = []
    for question in questions:
        if use_transformed:
            if only_query:
                sims.append(cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), index_context_map[question.context_index].embedding.reshape(1,-1)).flatten()[0])
            else:
                sims.append(cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), index_context_map[question.context_index].transformed_embedding.reshape(1,-1)).flatten()[0])
        else:
            sims.append(cosine_similarity(question.embedding.numpy().reshape(1,-1), index_context_map[question.context_index].embedding.reshape(1,-1)).flatten()[0])
    return np.mean(sims)

def get_avg_wrong_similarity(questions: list[EmbeddedQuestion], index_context_map, use_transformed = True, only_query=False):
    sims = []
    contexts = set([index_context_map[q.context_index] for q in questions])
    for question in questions:
        
        if use_transformed:
            if only_query:
                wrong_context_embs = [c.embedding for c in filter(lambda x: x.context_index != question.context_index, contexts)]
                res = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(wrong_context_embs)).flatten().tolist()
            else:
                wrong_context_embs = [c.transformed_embedding for c in filter(lambda x: x.context_index != question.context_index, contexts)]
                res = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(wrong_context_embs)).flatten().tolist()
        else:
            wrong_context_embs = [c.embedding for c in filter(lambda x: x.context_index != question.context_index, contexts)]
            res = cosine_similarity(question.embedding.numpy().reshape(1,-1), np.array(wrong_context_embs)).flatten().tolist()
        sims += res
    return np.mean(sims)            

def get_avg_correct_position(questions: list[EmbeddedQuestion], index_context_map, use_transformed = True, only_query=False):
    contexts = []
    index_context_local_map = {}
    for question in questions:
        if index_context_map[question.context_index] not in contexts:
            index_context_local_map[question.context_index] = len(contexts)
            contexts.append(index_context_map[question.context_index])

    mrrs = []
    if use_transformed:
        if only_query:
            context_embs = [c.embedding for c in contexts]
        else:
            context_embs = [c.transformed_embedding for c in contexts]
        for question in questions:
            indices = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().argsort().tolist()[::-1]
            index = indices.index(index_context_local_map[question.context_index])
            if index != -1:
                mrrs.append(1/(index+1))
    else:
        context_embs = [c.embedding for c in contexts]
        for question in questions:
            indices = cosine_similarity(question.embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().argsort().tolist()[::-1]
            index = indices.index(index_context_local_map[question.context_index])
            if index != -1:
                mrrs.append(1/(index+1))
    return np.mean(mrrs)
            
def get_avg_distance_to_first_wrong(questions, index_context_map, use_transformed = True, only_query=False):
    distances = []
    contexts = []
    index_context_local_map = {}
    for question in questions:
        if index_context_map[question.context_index] not in contexts:
            index_context_local_map[question.context_index] = len(contexts)
            contexts.append(index_context_map[question.context_index])
    
    if use_transformed:
        if only_query:
            context_embs = [c.embedding for c in contexts]
        else:
            context_embs = [c.transformed_embedding for c in contexts]
        for question in questions:
            sims = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().tolist()
            indices = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().argsort().tolist()[::-1]
            index = indices.index(index_context_local_map[question.context_index])
            if index == 0:
                distances.append(sims[index_context_local_map[question.context_index]] - sims[indices[1]])
            else:
                distances.append(sims[index_context_local_map[question.context_index]] - sims[indices[0]])
    else:
        context_embs = [c.embedding for c in contexts]
        for question in questions:
            sims = cosine_similarity(question.embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().tolist()
            indices = cosine_similarity(question.embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().argsort().tolist()[::-1]
            index = indices.index(index_context_local_map[question.context_index])
            if index == 0:
                distances.append(sims[index_context_local_map[question.context_index]] - sims[indices[1]])
            else:
                distances.append(sims[index_context_local_map[question.context_index]] - sims[indices[0]])
    
    return np.mean(distances)

def get_class_result(questions, index_context_map, use_transformed = True):
    contexts = []
    index_context_local_map = {}
    for question in questions:
        if index_context_map[question.context_index] not in contexts:
            index_context_local_map[question.context_index] = len(contexts)
            contexts.append(index_context_map[question.context_index])
    if use_transformed:
        context_embs = [c.transformed_embedding for c in contexts]
        for question in questions:
            sims = cosine_similarity(question.transformed_embedding.numpy().reshape(1,-1), np.array(context_embs)).flatten().tolist()
            for sim in sims:
                pass




In [109]:
experiment_names = ["expanded-transform", "dimensionality-reduction", "single-layer-squared-0.2-dropout-long-train-low-margin", "single-layer-squared-no-dropout-long-train-low-margin"]


In [110]:
model_names = ["BAAI-bge-small-en-v1.5", "Cohere-embed-english-v3.0", "text-embedding-3-large", "text-embedding-ada-002"]

In [113]:
from glob import glob
import pathlib

class_deltas = []
mrr_deltas = []
distance_deltas = []
result_df_entries = []
for experiment_name in experiment_names:
    for model_name in model_names:
        for dataset in list(glob(f"./data/train_experiments/{experiment_name}/{model_name}/*.pkl")):
            print(dataset)
            dataset_name = pathlib.Path(dataset).name.replace(".pkl", "")
            # if "dolly" not in dataset and "sciq" not in dataset:
                # continue
            with open(dataset, "rb") as file:
                collection: DataCollection = pkl.load(file)
                print(len(collection.questions))
                index_context_map = {c.context_index:c for c in collection.contexts}
                acc_1_trans = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=1, use_transformed=True, only_query=False)
                acc_1_orig = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=1, use_transformed=False, only_query=False)
                acc_3_trans = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=3, use_transformed=True, only_query=False)
                acc_3_orig = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=3, use_transformed=False, only_query=False)
                acc_5_trans = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=5, use_transformed=True, only_query=False)
                acc_5_orig = get_correct_top_n(questions=collection.questions, index_context_map=index_context_map, n_result=5, use_transformed=False, only_query=False)
                mrr_trans = get_avg_correct_position(questions=collection.questions, index_context_map=index_context_map, use_transformed=True, only_query=False)
                mrr_orig = get_avg_correct_position(questions=collection.questions, index_context_map=index_context_map, use_transformed=False, only_query=False)
                distance_to_first_wrong_trans = get_avg_distance_to_first_wrong(collection.questions, index_context_map, True, False)
                distance_to_first_wrong_orig = get_avg_distance_to_first_wrong(collection.questions, index_context_map, False, False)
                distance_deltas.append(distance_to_first_wrong_trans - distance_to_first_wrong_orig)
                class_deltas.append(acc_1_trans-acc_1_orig)
                mrr_deltas.append(mrr_trans-mrr_orig)
                print("Correct percentage:",acc_1_trans, acc_1_orig)
                # print("Avg correct sim:", get_avg_correct_similarity(collection.questions, index_context_map, True), get_avg_correct_similarity(collection.questions, index_context_map, False, False))
                # print("Avg wrong sim:", get_avg_wrong_similarity(collection.questions, index_context_map, True), get_avg_wrong_similarity(collection.questions, index_context_map, False, False))
                print("Avg distance:", distance_to_first_wrong_trans, distance_to_first_wrong_orig)
            result_df_entries.append({"experiment_name":experiment_name,"model_name":model_name, "dataset":dataset_name, "top_1_accuracy_trans":acc_1_trans, "top_1_accuracy_orig":acc_1_orig, "top_3_accuracy_trans":acc_3_trans,"top_3_accuracy_orig":acc_3_orig,"top_5_accuracy_trans":acc_5_trans,"top_5_accuracy_orig":acc_5_orig,
                                      "mrr_trans": mrr_trans, "mrr_orig":mrr_orig, "distance_to_first_wrong_trans":distance_to_first_wrong_trans, "distance_to_first_wrong_orig":distance_to_first_wrong_orig})

print(np.mean(class_deltas), np.std(class_deltas))
print(np.mean(mrr_deltas), np.std(mrr_deltas))
print(np.mean(distance_deltas), np.std(distance_deltas))


./data/train_experiments/expanded-transform/BAAI-bge-small-en-v1.5/2008_Sichuan_earthquake.pkl
74
Correct percentage: 0.8918918918918919 0.8918918918918919
Avg distance: 0.13760679562550945 0.06238058692700154
./data/train_experiments/expanded-transform/BAAI-bge-small-en-v1.5/Antarctica.pkl
35
Correct percentage: 0.8857142857142857 0.8857142857142857
Avg distance: 0.21729212360722677 0.09441999367305211
./data/train_experiments/expanded-transform/BAAI-bge-small-en-v1.5/Beyoncé.pkl
117
Correct percentage: 0.8632478632478633 0.7948717948717948
Avg distance: 0.14216968179163006 0.05112811158864926
./data/train_experiments/expanded-transform/BAAI-bge-small-en-v1.5/dolly.pkl
266
Correct percentage: 0.9774436090225563 0.9699248120300752
Avg distance: 0.30644593466269343 0.18065410157791653
./data/train_experiments/expanded-transform/BAAI-bge-small-en-v1.5/Frédéric_Chopin.pkl
136
Correct percentage: 0.8897058823529411 0.7941176470588235
Avg distance: 0.15169186055358938 0.05070231000290198
./

In [114]:
import pandas as pd
df = pd.DataFrame.from_records(result_df_entries)

In [127]:
df

Unnamed: 0,experiment_name,model_name,dataset,top_1_accuracy_trans,top_1_accuracy_orig,top_3_accuracy_trans,top_3_accuracy_orig,top_5_accuracy_trans,top_5_accuracy_orig,mrr_trans,mrr_orig,distance_to_first_wrong_trans,distance_to_first_wrong_orig
0,expanded-transform,BAAI-bge-small-en-v1.5,2008_Sichuan_earthquake,0.891892,0.891892,0.972973,0.972973,0.986486,1.000000,0.932995,0.936261,0.137607,0.062381
1,expanded-transform,BAAI-bge-small-en-v1.5,Antarctica,0.885714,0.885714,1.000000,1.000000,1.000000,1.000000,0.942857,0.942857,0.217292,0.094420
2,expanded-transform,BAAI-bge-small-en-v1.5,Beyoncé,0.863248,0.794872,0.982906,0.974359,1.000000,1.000000,0.922650,0.887322,0.142170,0.051128
3,expanded-transform,BAAI-bge-small-en-v1.5,dolly,0.977444,0.969925,0.992481,0.992481,0.996241,0.996241,0.988252,0.978822,0.306446,0.180654
4,expanded-transform,BAAI-bge-small-en-v1.5,Frédéric_Chopin,0.889706,0.794118,0.963235,0.963235,0.977941,0.970588,0.929298,0.879926,0.151692,0.050702
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,single-layer-squared-no-dropout-long-train-low...,text-embedding-ada-002,Frédéric_Chopin,0.816176,0.808824,0.970588,0.970588,0.992647,0.992647,0.887561,0.884375,0.115738,0.020766
140,single-layer-squared-no-dropout-long-train-low...,text-embedding-ada-002,Hunting,0.883333,0.916667,0.983333,0.983333,1.000000,1.000000,0.934722,0.954167,0.186377,0.041490
141,single-layer-squared-no-dropout-long-train-low...,text-embedding-ada-002,Pharmaceutical_industry,0.962963,0.907407,1.000000,1.000000,1.000000,1.000000,0.975309,0.947531,0.262266,0.063922
142,single-layer-squared-no-dropout-long-train-low...,text-embedding-ada-002,sciq-large,0.937888,0.937888,0.987578,0.981366,0.987578,0.987578,0.958823,0.958293,0.091831,0.068685


In [162]:
df['distance_delta'] = df['distance_to_first_wrong_trans'] - df['distance_to_first_wrong_orig']

In [166]:
df.groupby(["experiment_name", "model_name"])[["avg_", "distance_to_first_wrong_orig", "distance_delta"]].std().round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,distance_to_first_wrong_trans,distance_to_first_wrong_orig,distance_delta
experiment_name,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dimensionality-reduction,BAAI-bge-small-en-v1.5,0.073,0.043,0.042
dimensionality-reduction,Cohere-embed-english-v3.0,0.067,0.065,0.029
dimensionality-reduction,text-embedding-3-large,0.08,0.089,0.025
dimensionality-reduction,text-embedding-ada-002,0.07,0.023,0.048
expanded-transform,BAAI-bge-small-en-v1.5,0.065,0.043,0.028
expanded-transform,Cohere-embed-english-v3.0,0.068,0.065,0.026
expanded-transform,text-embedding-3-large,0.08,0.089,0.028
expanded-transform,text-embedding-ada-002,0.069,0.023,0.047
single-layer-squared-0.2-dropout-long-train-low-margin,BAAI-bge-small-en-v1.5,0.076,0.043,0.044
single-layer-squared-0.2-dropout-long-train-low-margin,Cohere-embed-english-v3.0,0.062,0.065,0.023


In [75]:
df.agg(
    top_1_mean=('top_n_accuracy_trans', np.mean),
    top_1_std=('top_n_accuracy_trans', np.std),
    top_1_mean_orig=('top_n_accuracy_orig', np.mean),
    top_1_std_orig=('top_n_accuracy_orig', np.std)
)

  df.agg(
  df.agg(


Unnamed: 0,top_n_accuracy_trans,top_n_accuracy_orig
top_1_mean,0.983707,
top_1_std,0.019817,
top_1_mean_orig,,0.984277
top_1_std_orig,,0.017396


In [150]:
squad_df = df[(df['dataset'] != "dolly") & (df['dataset'] != "sciq") & (df['dataset'] != "sciq-large")]
squad_df['first_wrong_delta'] = squad_df['distance_to_first_wrong_trans'] - squad_df['distance_to_first_wrong_orig']
squad_df.groupby("model_name").agg(
    mean=('first_wrong_delta', np.mean),
    std=('first_wrong_delta', np.std)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_df['first_wrong_delta'] = squad_df['distance_to_first_wrong_trans'] - squad_df['distance_to_first_wrong_orig']
  squad_df.groupby("model_name").agg(
  squad_df.groupby("model_name").agg(


Unnamed: 0_level_0,mean,std
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
BAAI-bge-small-en-v1.5,0.103989,0.027972
Cohere-embed-english-v3.0,0.071806,0.016947
text-embedding-3-large,0.055663,0.014745
text-embedding-ada-002,0.118148,0.046806


In [11]:
sciq_df = df[df['dataset'] == "sciq-large"]
sciq_df['first_wrong_delta'] = sciq_df['distance_to_first_wrong_trans'] - sciq_df['distance_to_first_wrong_orig']
sciq_df.groupby("model_name").agg(
    mean=('first_wrong_delta', np.mean),
    std=('first_wrong_delta', np.std),
    mean_fw_trans=('distance_to_first_wrong_trans', np.mean),
    mean_fw_orig=('distance_to_first_wrong_orig', np.mean),
    mean_class_trans=('top_n_accuracy_trans', np.mean),
    mean_class_orig=('top_n_accuracy_orig', np.mean)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sciq_df['first_wrong_delta'] = sciq_df['distance_to_first_wrong_trans'] - sciq_df['distance_to_first_wrong_orig']
  sciq_df.groupby("model_name").agg(
  sciq_df.groupby("model_name").agg(
  sciq_df.groupby("model_name").agg(


Unnamed: 0_level_0,mean,std,mean_fw_trans,mean_fw_orig,mean_class_trans,mean_class_orig
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BAAI-bge-small-en-v1.5,0.0711,,0.184905,0.113805,0.893333,0.903333
Cohere-embed-english-v3.0,0.033046,,0.209763,0.176718,0.913333,0.926667
text-embedding-3-large,0.024151,,0.243961,0.21981,0.953333,0.956667
text-embedding-ada-002,0.149558,,0.211682,0.062124,0.936667,0.936667
