In [1]:
import re
from typing import Dict, List
import uuid
import time
from python.embed_anything import _embed_anything
from python.embed_anything import (
    EmbedConfig,
    BertConfig,
    EmbedData,
    EmbeddingModel,
    WhichModel,
    TextEmbedConfig,
)
from python.embed_anything.vectordb import Adapter
from pinecone import Pinecone, ServerlessSpec
import os

  from tqdm.autonotebook import tqdm


In [2]:
# Build an embedding model through `from_pretrained_local`, selecting the Bert
# architecture and the all-MiniLM-L12-v2 sentence-transformers checkpoint.
model = EmbeddingModel.from_pretrained_local(
    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
)
# Chunking / batching options handed to the embed_* calls further down.
# NOTE(review): the unit of chunk_size (characters vs. tokens) is not visible here — confirm.
config = TextEmbedConfig(chunk_size=200, batch_size=32)

In [3]:
# NOTE: this rebinds `model`, replacing whatever model an earlier cell created;
# every later cell that reads `model` sees this OpenAI-backed one instead.
# NOTE(review): presumably needs OpenAI credentials in the environment — confirm.
model = EmbeddingModel.from_pretrained_cloud(
    WhichModel.OpenAI, model_id="text-embedding-3-small"
)

In [4]:
# Embed two short strings using the `model` and `config` currently bound in the kernel.
# NOTE(review): which backend runs depends on which cell last assigned `model`
# (hidden state) — confirm the intended model before relying on this output.
data = _embed_anything.embed_query(["hello", "hi"], embeder=model, config=config)

# Bare last expression so the notebook displays the returned value.
data

{"total_tokens": 2, "prompt_tokens": 2}


[<class 'EmbedData'>, <class 'EmbedData'>]

In [5]:
# Switch to a CLIP model, pinned to a specific revision of the checkpoint.
# NOTE: rebinds `model`, and the next statement rebinds `data`; results from
# earlier cells are no longer reachable under those names.
model = EmbeddingModel.from_pretrained_local(
    WhichModel.Clip, model_id="openai/clip-vit-base-patch32", revision="refs/pr/15"
)
# Embed the contents of ../test_files (path is relative to the notebook's working
# directory). No `extensions` filter or `config` is passed for this call.
data = _embed_anything.embed_directory("../test_files", embeder=model)

In [6]:
# Display the metadata attached to the first embedded item.
data[0].metadata

{'file_name': '/home/akshay/EmbedAnything/test_files/clip/dog2.jpeg'}

In [27]:
# Embed only the PDF files under ../test_files with the OpenAI embedding model.
# NOTE: rebinds `model`, `config` and `data` from earlier cells.
model = EmbeddingModel.from_pretrained_cloud(
    WhichModel.OpenAI, model_id="text-embedding-3-small"
)
# Same chunking / batching settings as the earlier TextEmbedConfig cell.
config = TextEmbedConfig(chunk_size=200, batch_size=32)
# `extensions=["pdf"]` restricts which files in the directory are processed.
data = _embed_anything.embed_directory(
    "../test_files", embeder=model, extensions=["pdf"], config=config
)

In [7]:
class PineconeAdapter(Adapter):
    """
    Adapter class for interacting with Pinecone, a vector database service.

    The adapter remembers the name of the index most recently passed to
    ``create_index`` and writes every ``upsert`` into that index.
    """

    def __init__(self, api_key: str):
        """
        Initializes a new instance of the PineconeAdapter class.

        Args:
            api_key (str): The API key for accessing the Pinecone service.
        """
        super().__init__(api_key)
        self.pc = Pinecone(api_key=self.api_key)
        # Set by create_index(); upsert() refuses to run while this is None.
        self.index_name = None

    def create_index(
        self,
        dimension: int,
        metric: str = "cosine",
        index_name: str = "anything",
        spec=None,
    ):
        """
        Creates a new index in Pinecone and makes it the target of later upserts.

        Args:
            dimension (int): The dimensionality of the embeddings.
            metric (str, optional): The distance metric to use for similarity search. Defaults to "cosine".
            index_name (str, optional): The name of the index. Defaults to "anything".
            spec (ServerlessSpec, optional): The serverless specification for the index.
                Defaults to None, which means AWS in the us-east-1 region.
        """
        # Build the default spec per call instead of in the signature: a default
        # argument is evaluated once at definition time and shared by all calls.
        if spec is None:
            spec = ServerlessSpec(cloud="aws", region="us-east-1")

        # Recorded before the API call (as before) so the name stays set even if
        # creation raises, e.g. because the index already exists.
        self.index_name = index_name
        self.pc.create_index(
            name=index_name, dimension=dimension, metric=metric, spec=spec
        )

    def delete_index(self, index_name: str):
        """
        Deletes an existing index from Pinecone.

        If the deleted index is the one this adapter was upserting into, the
        adapter forgets it so that a later ``upsert`` fails with a clear
        ``ValueError`` instead of writing to an index that no longer exists.

        Args:
            index_name (str): The name of the index to delete.
        """
        self.pc.delete_index(name=index_name)
        # Only reached when the delete call did not raise.
        if self.index_name == index_name:
            self.index_name = None

    def convert(self, embeddings: List[EmbedData]) -> List[Dict]:
        """
        Converts a list of embeddings into the required format for upserting into Pinecone.

        Each record gets a fresh random UUID as its id, so converting the same
        embeddings twice yields different ids (re-upserting adds new vectors
        rather than overwriting the old ones).

        Args:
            embeddings (List[EmbedData]): The list of embeddings to convert.

        Returns:
            List[Dict]: The converted data in the required format for upserting into Pinecone.
        """
        records = []

        for embedding in embeddings:
            # `metadata` can be missing (None) for items that did not come from a
            # file; treat that as "no file name" rather than raising AttributeError.
            metadata = embedding.metadata or {}
            file_path = metadata.get("file_name", "")
            records.append(
                {
                    "id": str(uuid.uuid4()),
                    "values": embedding.embedding,
                    "metadata": {
                        "text": embedding.text,
                        # Keep only the last path component; split on both "/"
                        # and "\" so Windows-style paths are handled too.
                        "file": re.split(r"/|\\", file_path)[-1],
                    },
                }
            )
        return records

    def upsert(self, data: List[EmbedData]):
        """
        Converts embeddings and upserts them into the current index in Pinecone.

        Args:
            data (List[EmbedData]): The embeddings to convert and upsert.

        Raises:
            ValueError: If the index has not been created before upserting data.
        """
        # Validate first so no conversion work is done when there is nowhere to write.
        if not self.index_name:
            raise ValueError("Index must be created before upserting data")

        records = self.convert(data)
        # Nothing to send; skip the network round-trip for an empty batch.
        if not records:
            return
        self.pc.Index(name=self.index_name).upsert(records)

In [8]:
# Configuration objects for the Bert embedder used by the Pinecone cell below:
# same all-MiniLM-L12-v2 checkpoint as earlier, but with chunk_size=100.
# NOTE(review): this BertConfig/EmbedConfig style differs from the
# EmbeddingModel + TextEmbedConfig style used in earlier cells — confirm which
# one the installed embed_anything version expects.
bert_config = BertConfig(
    model_id="sentence-transformers/all-MiniLM-L12-v2", chunk_size=100
)
embed_config = EmbedConfig(bert=bert_config)

In [9]:
# Read the key from the environment so the secret is never stored in the notebook.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")

index_name = "anything"
pinecone_adapter = PineconeAdapter(api_key)

# Start from a clean index so re-running this cell does not fail in create_index.
# Deletion is best-effort (the index may not exist yet), but the reason for a
# failure is reported instead of being silently swallowed, and KeyboardInterrupt
# is no longer caught.
try:
    pinecone_adapter.delete_index(index_name)
except Exception as exc:
    print(f"Skipping deletion of index {index_name!r}: {exc}")

# The dimension must match the embedding size of the model whose vectors are
# upserted; 384 is the output size of sentence-transformers/all-MiniLM-L12-v2.
pinecone_adapter.create_index(dimension=384, metric="cosine", index_name=index_name)

In [None]:
# Alternative kept for reference: embed a single PDF instead of the whole directory.
# data = embed_anything.embed_file(
#     "../test_files/test.pdf",
#     embeder="Bert",
#     config=embed_config,
#     adapter=pinecone_adapter,
# )

# Embed every PDF under ../test_files and hand the results to the Pinecone adapter.
# NOTE(review): `embeder` is the string "Bert" here, while earlier cells pass an
# EmbeddingModel instance, and `config` is an EmbedConfig rather than a
# TextEmbedConfig — confirm this matches the installed embed_anything API.
# NOTE(review): the recorded output of the following cell prints `None` for `data`,
# so presumably nothing is returned when an adapter is supplied — confirm.
data = _embed_anything.embed_directory(
    "../test_files",
    embeder="Bert",
    extensions=["pdf"],
    config=embed_config,
    adapter=pinecone_adapter,
)

Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
Unicode mismatch true f_l "fl" Ok("ﬂ") [64258]
Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
Unicode mismatch true f_l "fl" Ok("ﬂ") [64258]
Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
Unicode mismatch true f_i "fi" Ok("ﬁ") [64257]
Unicode mismatch true f_f_i "ffi" Ok("ﬃ") [64259]
Unicode mismatch true f_f "ff" Ok("ﬀ") [64256]
missing char 33 in unicode map {} for <</T

In [None]:
# Show what embed_directory returned, then list the first page of vector ids
# stored in the index to confirm that the upsert reached Pinecone.
print(data)
# Use the index name tracked by the adapter (set in create_index) rather than
# repeating the literal, so this cell follows a renamed index automatically.
print(pinecone_adapter.pc.Index(name=pinecone_adapter.index_name).list_paginated())

None
{'namespace': '',
 'pagination': {'next': 'eyJza2lwX3Bhc3QiOiIyZmE2ZWYyMS0xYzM0LTQ2ZjgtOTJmNS0wZWMyNzBlM2Y0NTAiLCJwcmVmaXgiOm51bGx9'},
 'usage': {'read_units': 1},
 'vectors': [{'id': '00189a52-6d44-4aef-8e82-d6a444e7c097'},
             {'id': '009eede6-d324-4659-98ba-aab2b90017fd'},
             {'id': '00e4c836-c849-4260-80e0-0f2c70388ff1'},
             {'id': '018cdc26-e5d6-4cbf-9382-4722c7ed305e'},
             {'id': '030c3110-2c39-4f7e-b641-ac05953a659e'},
             {'id': '0327215e-4a60-4a4a-9b51-272471acd422'},
             {'id': '03e75ee7-17ef-4d14-a469-79e9d0dece90'},
             {'id': '044762a5-07f2-42b6-a5ae-82bae0ad632e'},
             {'id': '06ab40d8-49ae-410b-b724-7ec71378cb8b'},
             {'id': '06ea9909-3e69-4823-8ff4-341fea4e011b'},
             {'id': '07eef4ce-0599-4e89-970d-ec10ebc3f406'},
             {'id': '08c39e91-a17d-4f58-be5d-0feed3ff1697'},
             {'id': '08c474bf-24f5-4dff-8ccb-c9c631a4c9ef'},
             {'id': '0a524f15-1d8e-459