In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import os
# Switch the working directory to the notebooks folder on Drive so that the
# relative paths used by later cells (e.g. 'data/MeTMaP/dataset/...') resolve
# against it. NOTE(review): the `embeddings` / `vectorstores` modules imported
# later are presumably found via this directory as well — confirm.
os.chdir('/content/drive/MyDrive/Colab Notebooks')

Mounted at /content/drive


In [None]:
# Install the third-party packages this notebook needs into the runtime.
# NOTE(review): accelerate is pinned to 0.31.0 on the first line but is then
# upgraded without a version pin a few lines below, so the pin presumably does
# not survive the cell — confirm which version is intended and keep only one
# of the two lines. The other packages are installed unpinned.
!pip install accelerate==0.31.0
!pip install annoy
!pip install jsonlines
!pip install transformers
!pip install auto-gptq
!pip install --upgrade accelerate
!pip install langchain_cohere
!pip install langchain_openai
!pip install paddlepaddle
!pip install langchain_chroma

In [None]:
from typing import List

from transformers import AutoTokenizer
from embeddings import CustomEmbeddings
import jsonlines
from pathlib import Path
import numpy as np
import pandas as pd
import os
from vectorstores import VectorDatabase

# Metamorphic relation names. Each name doubles as the stem of a
# '<name>.jsonl' file that the evaluation cells load from the dataset folders.
metamorphic = [
    'word_swap',
    'obj_sub',
    'verb_sub',
    'nega_exp',
    'word_del',
    'num_sub',
    'err_translate',
    'err_nli',
]

# Distance metric identifiers.
# NOTE(review): 'person' may be intended as 'pearson' — confirm against the
# code that consumes these names before renaming it.
distance_metrics = [
    'cosine',
    'euclidean',
    'person',
    'manhattan',
    'lancewilliams',
    'mahalanobis',
    'braycurtis',
]


def load_dataset(path):
    """Read a JSON-lines file and return its sentence triples.

    Args:
        path: Location of a JSON-lines file whose records contain the keys
            'sentence1', 'sentence2' and 'sentence3'.

    Returns:
        A list with one ``[sentence1, sentence2, sentence3]`` list per record,
        in file order. Any other keys in the records are ignored.
    """
    sentence_columns = ['sentence1', 'sentence2', 'sentence3']
    frame = pd.read_json(path, lines=True)
    triples = frame[sentence_columns]
    return triples.values.tolist()


# Names of the vector-store backends.
vector_dbs = [
    'Annoy',
    'ScanNN',
    'Chroma',
]
# No distance metric is selected at this point.
distance_metric = None

# Embedding wrapper for the all-mpnet-base-v2 sentence-transformers model,
# configured with every metamorphic subset.
embedding = CustomEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    subsets=metamorphic,
)


In [None]:
# Embedding wrapper selected by the model name "ada002", configured with every
# metamorphic subset.
# NOTE(review): the evaluation cells visible in this notebook pass `embedding`
# or `chroma_embeddings`, not this object — confirm it is still needed.
openai_embeddings = CustomEmbeddings(model_name="ada002", subsets=metamorphic)

In [None]:
# Retrieval accuracy of the Annoy backend on the 'normal' and 'variant' splits.
#
# Each record is a triple (b, p, n): `b` is the query, `p` is the candidate
# that should be retrieved first and `n` is the competing candidate. For every
# triple a fresh store containing only [p, n] is built and queried with `b`;
# the triple is a hit when `p` shows up in the ranking before `n`.
#
# Result: `acc_all` holds one accuracy per split, in the order
# ['normal', 'variant'].
acc_all = []
for vector_db in ['Annoy']:
    for data_type in ['normal', 'variant']:
        # One list of triples per metamorphic relation.
        all_dataset = []
        for me in metamorphic:
            dataset = load_dataset(f'data/MeTMaP/dataset/{data_type}/{me}.jsonl')
            all_dataset.append(dataset)

        acc = 0
        n_triples = 0  # triples actually evaluated; used as the denominator
        for dataset in all_dataset:
            for b, p, n in dataset:
                n_triples += 1
                vb = VectorDatabase([p, n], embedding, vector_db)
                candidates = vb.simulate_retrieval(b)
                # Walk the ranking and stop at whichever of p / n comes first.
                for c in candidates:
                    if c.page_content == p:
                        acc += 1
                        break
                    elif c.page_content == n:
                        break

        # Normalise by the real number of triples rather than assuming every
        # file holds exactly 5000 of them; guard against an empty input.
        acc = acc / n_triples if n_triples else 0.0
        acc_all.append(acc)




In [None]:
# Show the accuracies gathered by the most recently run evaluation cell: one
# value per data split, in the order ['normal', 'variant'].
acc_all

[0.377175, 0.997175]

In [None]:
# Embedding wrapper passed to the Chroma evaluation loop, configured with every
# metamorphic subset.
# NOTE(review): this model name joins organisation and model with '_' whereas
# the earlier `embedding` uses 'sentence-transformers/...' with '/' — confirm
# that CustomEmbeddings expects this spelling.
chroma_embeddings = CustomEmbeddings(model_name="sentence-transformers_all-MiniLM-L6-v2", subsets=metamorphic)

In [None]:
# Retrieval accuracy of the Chroma backend on the 'normal' and 'variant' splits.
#
# Each record is a triple (b, p, n): `b` is the query, `p` is the candidate
# that should be retrieved first and `n` is the competing candidate. For every
# triple a fresh store containing only [p, n] is built and queried with `b`;
# the triple is a hit when `p` shows up in the ranking before `n`.
#
# Result: `acc_all` is reset and then holds one accuracy per split, in the
# order ['normal', 'variant'].
acc_all = []
for vector_db in ['Chroma']:
    for data_type in ['normal', 'variant']:
        # One list of triples per metamorphic relation.
        all_dataset = []
        for me in metamorphic:
            dataset = load_dataset(f'data/MeTMaP/dataset/{data_type}/{me}.jsonl')
            all_dataset.append(dataset)

        acc = 0
        n_triples = 0  # triples actually evaluated; used as the denominator
        for dataset in all_dataset:
            for b, p, n in dataset:
                n_triples += 1
                vb = VectorDatabase([p, n], chroma_embeddings, vector_db)
                candidates = vb.simulate_retrieval(b)
                # Walk the ranking and stop at whichever of p / n comes first.
                for c in candidates:
                    if c.page_content == p:
                        acc += 1
                        break
                    elif c.page_content == n:
                        break
            # Progress output: running hit count after each relation file.
            print(acc)

        # Normalise by the real number of triples rather than assuming every
        # file holds exactly 5000 of them; guard against an empty input.
        acc = acc / n_triples if n_triples else 0.0
        acc_all.append(acc)