In [19]:
!pip install embed_anything-gpu




In [20]:
import embed_anything
import os

from typing import Dict, List
from embed_anything import EmbedData
from embed_anything.vectordb import Adapter


In [21]:
!pip install lancedb



In [22]:
from embed_anything.vectordb import Adapter
from uuid import uuid4
import lancedb


In [None]:
# table = self.connection.create_table("docs", docs)

In [None]:
from lancedb.pydantic import Vector, LanceModel

# Example LanceDB row schema declared with LanceModel: a fixed-size 2-element
# vector column plus an item name and a price.
# NOTE(review): nothing else in the visible notebook references this class
# (LanceAdapter builds its own schema) -- confirm whether it is still needed.
class Item(LanceModel):
    vector: Vector(2)  # fixed-size vector column of length 2
    item: str
    price: float


In [118]:
class LanceAdapter(Adapter):
    """Adapter that writes EmbedAnything embeddings into a LanceDB table.

    Typical use: construct the adapter, call ``create_index`` once to create
    the table, then call ``upsert`` with batches of ``EmbedData`` (directly or
    by handing the adapter to an embedding call that accepts ``adapter=``).

    Attributes:
        db_path: Location passed to ``lancedb.connect``.
        connection: The LanceDB connection opened in ``__init__``.
        dimension: Length of each stored embedding vector.
        Item: LanceModel describing one stored row (``embeddings`` + ``text``).
        Metadata: LanceModel for per-file metadata; defined but not stored yet.
        table_name: Name of the table created by ``create_index`` (else None).
        table: Handle of the table created by ``create_index`` (else None).
    """

    def __init__(self, db_path: str, embedding_dimension: int):
        """Open the database and build the row schema.

        Args:
            db_path: Location passed to ``lancedb.connect``.
            embedding_dimension: Length of every embedding vector stored in
                the table; used as the size of the ``embeddings`` column.
        """
        # Local import keeps the class independent of other notebook cells.
        from lancedb.pydantic import Vector, LanceModel

        self.db_path = db_path
        self.connection = lancedb.connect(self.db_path)
        self.dimension = embedding_dimension

        # Populated by create_index(); initialised here so that upsert() can
        # report a clear error instead of an AttributeError.
        self.table_name = None
        self.table = None

        class Metadata(LanceModel):
            file_name: str
            modified: str
            created: str

        class Item(LanceModel):
            embeddings: Vector(self.dimension)
            text: str
            # Per-file metadata is intentionally not part of the row yet.

        self.Metadata = Metadata
        self.Item = Item

    def create_index(self, table_name: str):
        """Create the table ``table_name`` using the ``Item`` schema.

        The created table handle is stored on ``self.table`` and its name on
        ``self.table_name``. The connection opened in ``__init__`` is reused.

        Args:
            table_name: Name of the LanceDB table to create.
        """
        self.table_name = table_name
        self.table = self.connection.create_table(
            table_name, schema=self.Item.to_arrow_schema()
        )

    def convert(self, embeddings: List[EmbedData]) -> List[Dict]:
        """Turn a flat list of ``EmbedData`` into row dictionaries.

        Args:
            embeddings: Items exposing ``.text`` and ``.embedding``.

        Returns:
            One ``{"text": ..., "embeddings": ...}`` dict per input item, in
            input order. Metadata is not included.
        """
        return [
            {"text": item.text, "embeddings": item.embedding}
            for item in embeddings
        ]

    def delete_index(self, table_name: str):
        """Drop the table ``table_name`` from the database.

        If the dropped table is the one this adapter currently holds, the
        cached handle is cleared so later ``upsert`` calls fail clearly.

        Args:
            table_name: Name of the LanceDB table to drop.
        """
        self.connection.drop_table(table_name)
        if table_name == self.table_name:
            self.table = None
            self.table_name = None

    def upsert(self, data: List[EmbedData]):
        """Append a batch of embeddings to the current table.

        Args:
            data: Flat list of ``EmbedData`` items to store.

        Raises:
            RuntimeError: If ``create_index`` has not been called yet (or the
                table was dropped through ``delete_index``).
        """
        if self.table is None:
            raise RuntimeError(
                "No table is open; call create_index() before upsert()."
            )
        rows = self.convert(data)
        if not rows:
            # Nothing to write for an empty batch.
            return
        self.table.add(rows)


In [119]:
# Start from a clean slate: drop any "docs" table left over from a previous run.
# This connects directly (same path as the adapter's db_path) so the cell also
# works on a fresh kernel, before `lance_adapter` has been created, and
# `ignore_missing=True` keeps the very first run from failing.
lancedb.connect("tmp/lancedb").drop_table("docs", ignore_missing=True)

In [120]:
# Build the LanceDB-backed adapter. The embedding dimension must match the
# output width of the embedder used later in the notebook (384 here).
lance_adapter = LanceAdapter(
    db_path="tmp/lancedb",
    embedding_dimension=384,
)

# Create the "docs" table that the embeddings will be written to.
lance_adapter.create_index(table_name="docs")

In [121]:
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel

# Load the sentence-transformers MiniLM checkpoint through the Bert model type.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
)

# The PDF location is machine-specific. Allow overriding it through the
# EMBED_ANYTHING_PDF environment variable so the notebook runs elsewhere
# without editing; the original path remains the default.
pdf_path = os.environ.get(
    "EMBED_ANYTHING_PDF",
    "/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
)
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF to embed not found: {pdf_path}")

# Embed the file; the adapter is passed so the embeddings are also written to
# the LanceDB table that is queried in the following cells.
data = embed_anything.embed_file(
    pdf_path,
    embedder=model,
    adapter=lance_adapter,
)

Loading weights from "/home/sonamAI/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L12-v2/snapshots/a05860a77cef7b37e0048a7864658139bc18a854/model.safetensors"


In [122]:
# Embed the single query string and keep the vector of the first (only) result.
query_vec = embed_anything.embed_query(["attention"], embedder=model)[0].embedding

# Nearest-neighbour search in the adapter's table; keep the text of the top 5 hits.
docs = (
    lance_adapter.table
    .search(query_vec)
    .limit(5)
    .to_pandas()["text"]
)

In [123]:
# Display the text of the (up to) five nearest chunks returned by the search.
docs

0    2 Background The goal of reducing sequential c...
1    <EOS><pad> Input-Input Layer5TheLawwillneverbe...
2    MultiHead(Q, K, V ) = Concat(head1, ..., headh...
3    In contrast to RNN sequence-to-sequence models...
4    Convolutional layers are generally more expens...
Name: text, dtype: object