In [2]:
import glob
import os
import pathlib
import pickle

In [3]:
import torch
from dataclasses import dataclass
from typing import Optional

@dataclass
class Question:
    """A question string together with the embeddings computed for it.

    Attributes:
        question: The question text.
        context_index: Integer index associated with the question
            (presumably the ``context_index`` of the matching ``Context`` —
            TODO confirm against the code that builds the raw pickles).
        embedding: Embedding tensor for the question; ``None`` until one is
            assigned.
        transformed_embedding: Second optional tensor slot; defaults to
            ``None``.
    """
    question: str
    context_index: int
    # Annotated with the tensor class (torch.Tensor); torch.tensor is the
    # factory function and is not a valid type for annotations.
    embedding: Optional[torch.Tensor] = None
    transformed_embedding: Optional[torch.Tensor] = None

@dataclass
class Context:
    """A context passage together with the embeddings computed for it.

    Attributes:
        context: The passage text.
        context_index: Integer index identifying this context.
        embedding: Embedding tensor for the passage; ``None`` until one is
            assigned.
        transformed_embedding: Second optional tensor slot; defaults to
            ``None``.
    """
    context: str
    context_index: int
    # Annotated with the tensor class (torch.Tensor); torch.tensor is the
    # factory function and is not a valid type for annotations.
    embedding: Optional[torch.Tensor] = None
    transformed_embedding: Optional[torch.Tensor] = None

@dataclass
class DataCollection:
    """Container bundling the questions and contexts of one dataset.

    Instances are what the raw ``.pkl`` files in this notebook are loaded
    into and what the embedded ``.pkl`` files are dumped from.
    """
    # Question objects; the embedding cells assign each one's `embedding`.
    questions: list[Question]
    # Context objects; the embedding cells assign each one's `embedding`.
    contexts: list[Context]
    # Free-form metadata; the embedding cells read metadata['category'] to
    # build the output file name.
    metadata: dict


## OpenAI text-embedding-3-large embedding

In [8]:
import openai

In [9]:
# Read the API key from the environment rather than hardcoding it in the
# notebook, so the secret is never saved or committed with the file.
# Raises KeyError immediately if OPENAI_API_KEY is not set.
client = openai.Client(api_key=os.environ["OPENAI_API_KEY"])

In [20]:
# Embed the questions and contexts of the selected raw collection with
# OpenAI's "text-embedding-3-large" model and pickle the enriched collection
# under data/embedded/text-embedding-3-large/.
for path in glob.glob("./data/raw/*.pkl"):
    print(path)
    # Only files whose path contains "sciq-large" are embedded by this cell;
    # every other file is printed and skipped.
    if "sciq-large" not in path:
        continue
    # NOTE(review): pickle.load can execute arbitrary code; only use it on
    # trusted, locally produced files.
    with open(path, 'rb') as file:
        data_collection: DataCollection = pickle.load(file)
    questions = [q.question for q in data_collection.questions]
    contexts = [c.context for c in data_collection.contexts]
    # Inputs are sent to the API in batches of 1000.
    for chunk in range(0, len(questions), 1000):
        question_embs = client.embeddings.create(
            input=questions[chunk:chunk+1000],
            model="text-embedding-3-large"
        )
        # Pair the returned embeddings with the SAME slice that was sent.
        # Zipping against the full list would write every batch onto the
        # first 1000 questions and leave the rest without an embedding.
        for q, e in zip(data_collection.questions[chunk:chunk+1000], question_embs.data):
            q.embedding = torch.tensor(e.embedding)
    for chunk in range(0, len(contexts), 1000):
        context_embs = client.embeddings.create(
            input=contexts[chunk:chunk+1000],
            model="text-embedding-3-large"
        )
        # Same slice alignment as for the questions above.
        for c, e in zip(data_collection.contexts[chunk:chunk+1000], context_embs.data):
            c.embedding = torch.tensor(e.embedding)
    # exist_ok=True makes the directory creation idempotent without a
    # separate exists() check.
    pathlib.Path("data/embedded/text-embedding-3-large/").mkdir(parents=True, exist_ok=True)
    with open(f"data/embedded/text-embedding-3-large/{data_collection.metadata['category']}-embedded.pkl", "wb") as file:
        pickle.dump(data_collection, file)

./data/raw/2008_Sichuan_earthquake-base.pkl
./data/raw/Antarctica-base.pkl
./data/raw/Beyoncé-base.pkl
./data/raw/dolly-base.pkl
./data/raw/Frédéric_Chopin-base.pkl
./data/raw/Hunting-base.pkl
./data/raw/Pharmaceutical_industry-base.pkl
./data/raw/sciq-base.pkl
./data/raw/sciq-large-base.pkl


## OpenAI text-embedding-ada-002

In [21]:
# Embed the questions and contexts of the selected raw collection with
# OpenAI's "text-embedding-ada-002" model and pickle the enriched collection
# under data/embedded/text-embedding-ada-002/.
for path in glob.glob("./data/raw/*.pkl"):
    print(path)
    # Only files whose path contains "sciq-large" are embedded by this cell;
    # every other file is printed and skipped.
    if "sciq-large" not in path:
        continue
    # NOTE(review): pickle.load can execute arbitrary code; only use it on
    # trusted, locally produced files.
    with open(path, 'rb') as file:
        data_collection: DataCollection = pickle.load(file)
    questions = [q.question for q in data_collection.questions]
    contexts = [c.context for c in data_collection.contexts]
    # Inputs are sent to the API in batches of 1000.
    for chunk in range(0, len(questions), 1000):
        question_embs = client.embeddings.create(
            input=questions[chunk:chunk+1000],
            model="text-embedding-ada-002"
        )
        # Pair the returned embeddings with the SAME slice that was sent.
        # Zipping against the full list would write every batch onto the
        # first 1000 questions and leave the rest without an embedding.
        for q, e in zip(data_collection.questions[chunk:chunk+1000], question_embs.data):
            q.embedding = torch.tensor(e.embedding)
    for chunk in range(0, len(contexts), 1000):
        context_embs = client.embeddings.create(
            input=contexts[chunk:chunk+1000],
            model="text-embedding-ada-002"
        )
        # Same slice alignment as for the questions above.
        for c, e in zip(data_collection.contexts[chunk:chunk+1000], context_embs.data):
            c.embedding = torch.tensor(e.embedding)
    # exist_ok=True makes the directory creation idempotent without a
    # separate exists() check.
    pathlib.Path("data/embedded/text-embedding-ada-002/").mkdir(parents=True, exist_ok=True)
    with open(f"data/embedded/text-embedding-ada-002/{data_collection.metadata['category']}-embedded.pkl", "wb") as file:
        pickle.dump(data_collection, file)

./data/raw/2008_Sichuan_earthquake-base.pkl
./data/raw/Antarctica-base.pkl
./data/raw/Beyoncé-base.pkl
./data/raw/dolly-base.pkl
./data/raw/Frédéric_Chopin-base.pkl
./data/raw/Hunting-base.pkl
./data/raw/Pharmaceutical_industry-base.pkl
./data/raw/sciq-base.pkl
./data/raw/sciq-large-base.pkl


## Cohere-embed-english-v3.0

In [5]:
import cohere

In [6]:
# Read the API key from the environment rather than hardcoding it in the
# notebook, so the secret is never saved or committed with the file.
# Raises KeyError immediately if CO_API_KEY is not set.
client = cohere.Client(os.environ["CO_API_KEY"])

In [7]:
# Embed every raw collection with Cohere's "embed-english-v3.0" model and
# pickle the enriched collection under data/embedded/Cohere-embed-english-v3.0/.
for path in glob.glob("./data/raw/*.pkl"):
    print(path)

    with open(path, 'rb') as raw_file:
        data_collection: DataCollection = pickle.load(raw_file)

    # Plain-text inputs, in the same order as the objects they come from.
    questions = [item.question for item in data_collection.questions]
    contexts = [item.context for item in data_collection.contexts]

    # Questions are embedded as search queries, contexts as search documents.
    question_embs = client.embed(
        texts=questions, model="embed-english-v3.0", input_type="search_query"
    )
    context_embs = client.embed(
        texts=contexts, model="embed-english-v3.0", input_type="search_document"
    )

    # Attach each returned vector to its source object by position.
    for question_obj, vector in zip(data_collection.questions, question_embs.embeddings):
        question_obj.embedding = torch.tensor(vector)
    for context_obj, vector in zip(data_collection.contexts, context_embs.embeddings):
        context_obj.embedding = torch.tensor(vector)

    # Create the output directory on first use.
    out_dir = pathlib.Path("data/embedded/Cohere-embed-english-v3.0/")
    if not out_dir.exists():
        out_dir.mkdir(parents=True)

    out_path = f"data/embedded/Cohere-embed-english-v3.0/{data_collection.metadata['category']}-embedded.pkl"
    with open(out_path, "wb") as out_file:
        pickle.dump(data_collection, out_file)

./data/raw/2008_Sichuan_earthquake-base.pkl
./data/raw/Antarctica-base.pkl
./data/raw/Beyoncé-base.pkl
./data/raw/dolly-base.pkl
./data/raw/Frédéric_Chopin-base.pkl
./data/raw/Hunting-base.pkl
./data/raw/Pharmaceutical_industry-base.pkl
./data/raw/sciq-base.pkl
./data/raw/sciq-large-base.pkl


## BAAI/bge-small-en-v1.5

In [1]:
from sentence_transformers import SentenceTransformer
# Load the 'BAAI/bge-small-en-v1.5' SentenceTransformer; the embedding loop
# in the following cell calls model.encode(...) on it.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Embed every raw collection with the loaded BAAI/bge-small-en-v1.5 model and
# pickle the enriched collection under data/embedded/BAAI-bge-small-en-v1.5/.
for path in glob.glob("./data/raw/*.pkl"):
    print(path)

    with open(path, 'rb') as raw_file:
        data_collection: DataCollection = pickle.load(raw_file)

    # Each question is prefixed with this instruction before encoding;
    # contexts are encoded as-is.
    instruction = "Represent this sentence for searching relevant passages: "
    questions = [instruction + item.question for item in data_collection.questions]
    contexts = [item.context for item in data_collection.contexts]

    question_embs = model.encode(questions, normalize_embeddings=True)
    context_embs = model.encode(contexts, normalize_embeddings=True)

    # Attach each encoded vector to its source object by position.
    for question_obj, vector in zip(data_collection.questions, question_embs):
        question_obj.embedding = torch.tensor(vector)
    for context_obj, vector in zip(data_collection.contexts, context_embs):
        context_obj.embedding = torch.tensor(vector)

    # Create the output directory on first use.
    out_dir = pathlib.Path("data/embedded/BAAI-bge-small-en-v1.5/")
    if not out_dir.exists():
        out_dir.mkdir(parents=True)

    out_path = f"data/embedded/BAAI-bge-small-en-v1.5/{data_collection.metadata['category']}-embedded.pkl"
    with open(out_path, "wb") as out_file:
        pickle.dump(data_collection, out_file)

./data/raw/2008_Sichuan_earthquake-base.pkl
./data/raw/Antarctica-base.pkl
./data/raw/Beyoncé-base.pkl
./data/raw/dolly-base.pkl
./data/raw/Frédéric_Chopin-base.pkl
./data/raw/Hunting-base.pkl
./data/raw/Pharmaceutical_industry-base.pkl
./data/raw/sciq-base.pkl
./data/raw/sciq-large-base.pkl


##