[WIP] feat: add support for LanceDB as vector db provider #210

Closed
wants to merge 21 commits
Commits (21)
fdb8475
fix: update the basic example client to communicate with server on the …
sp6370 Mar 25, 2024
a694567
Merge branch 'SciPhi-AI:main' into main
sp6370 Mar 25, 2024
da8f46c
chore: add lanceDB as optional dependency
sp6370 Mar 26, 2024
1fdf426
chore: add lanceDB as optional dependency part 2
sp6370 Mar 26, 2024
0d085e0
feat: Add support for LanceDB as Vector DB provider
sp6370 Mar 26, 2024
53a3dd3
chore: added pyarrow as the dependency
sp6370 Mar 26, 2024
081acc5
chore: added implementation for lancedb initialize_collection
sp6370 Mar 26, 2024
7f63a6f
chore: updated the database schema for lancedb
sp6370 Mar 26, 2024
e378820
chore: update dependency for pyarrow
sp6370 Mar 26, 2024
a76bf16
chore: support lancedb selection from config.json
sp6370 Apr 4, 2024
9c87c77
Merge branch 'main' into main
sp6370 Apr 4, 2024
5c0f8f0
Update factory.py
sp6370 Apr 4, 2024
9c276cf
chore: add skeleton code for lancedb provider support
sp6370 Apr 4, 2024
78f6fb9
Merge branch 'main' of https://github.com/sp6370/R2R
sp6370 Apr 4, 2024
b3f9208
Merge branch 'SciPhi-AI:main' into main
sp6370 Apr 4, 2024
ddad47e
Merge branch 'SciPhi-AI:main' into main
sp6370 Apr 8, 2024
b371e69
chore: update .env.example for lancedb
sp6370 Apr 8, 2024
92fb033
chore: update lancedb implementation to set db uri from env
sp6370 Apr 8, 2024
a8b0fe6
feat: update lancedb implementation to support upsert_entries and search
sp6370 Apr 8, 2024
da55720
Merge branch 'main' of https://github.com/sp6370/R2R
sp6370 Apr 8, 2024
024c981
feat: update lancedb implementation to support lancedb cloud
sp6370 Apr 8, 2024
5 changes: 5 additions & 0 deletions .env.example
@@ -16,6 +16,11 @@ LOCAL_DB_PATH=local.sqlite
## QDRANT_PORT=your_qdrant_port
## QDRANT_API_KEY=your_qdrant_api_key

# ## lancedb
## LANCEDB_URI=your_lancedb_uri_local_or_cloud
## LANCEDB_API_KEY=your_lancedb_cloud_api_key
## LANCEDB_REGION=your_lancedb_cloud_region

# LLM Providers

## openai
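
For reference, a filled-in sketch of these settings with illustrative placeholder values only (not part of this PR): a local setup just needs LANCEDB_URI pointing at a directory on disk, while LanceDB Cloud also uses the API key and region.

## local LanceDB: URI is a directory on disk
## LANCEDB_URI=./data/lancedb

## LanceDB Cloud: project URI plus credentials (placeholders)
## LANCEDB_URI=db://your-project
## LANCEDB_API_KEY=your_lancedb_cloud_api_key
## LANCEDB_REGION=us-east-1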
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -46,6 +46,7 @@ boto3 = {version = "^1.34.71", optional = true}
exa-py = {version = "^1.0.9", optional = true}
llama-cpp-python = {version = "^0.2.57", optional = true}
sentence-transformers = {version = "^2.6.1", optional = true}
lancedb = { version = "^0.6.5", optional = true }

[tool.poetry.extras]
parsing = ["bs4", "pypdf"]
@@ -60,7 +61,8 @@ ionic = ["ionic-api-sdk"]
reducto = ["boto3"]
exa = ["exa-py"]
local_llm = ["llama-cpp-python", "sentence-transformers"]
all = ["bs4", "pypdf", "tiktoken", "datasets", "qdrant_client", "psycopg2-binary", "sentry-sdk", "parea-ai", "boto3", "exa-py", "llama-cpp-python"]
lancedb = ["lancedb"]
all = ["bs4", "pypdf", "tiktoken", "datasets", "qdrant_client", "psycopg2-binary", "sentry-sdk", "parea-ai", "boto3", "exa-py", "llama-cpp-python", "lancedb"]

[tool.poetry.group.dev.dependencies]
black = "^24.3.0"
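
With the new extra declared above, installing the optional dependency would presumably look like the following sketch; only the pyproject.toml entries are confirmed by this diff, and the published package name "r2r" is an assumption.

# from a source checkout managed by Poetry
poetry install -E lancedb

# or via the extra of the published package, if its name matches the repository
pip install "r2r[lancedb]"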
2 changes: 1 addition & 1 deletion r2r/core/providers/vector_db.py
@@ -64,7 +64,7 @@ def to_dict(self) -> dict:


class VectorDBProvider(ABC):
supported_providers = ["local", "pgvector", "qdrant"]
supported_providers = ["local", "pgvector", "qdrant", "lancedb"]

def __init__(self, provider: str):
if provider not in VectorDBProvider.supported_providers:
4 changes: 4 additions & 0 deletions r2r/main/factory.py
@@ -41,6 +41,10 @@ def get_vector_db(database_config: dict[str, Any]):
from r2r.vector_dbs import LocalVectorDB

return LocalVectorDB()
elif database_config["provider"] == "lancedb":
from r2r.vector_dbs import LanceDB

return LanceDB()

@staticmethod
def get_embeddings_provider(embedding_config: dict[str, Any]):
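
For context, a standalone sketch of the dispatch path added above; only get_vector_db and the "provider" key appear in this diff, so the enclosing factory class and any other config keys are not shown here.

# hypothetical sketch of the dispatch the factory performs
database_config = {"provider": "lancedb"}

if database_config["provider"] == "lancedb":
    from r2r.vector_dbs import LanceDB

    vector_db = LanceDB()  # connects using LANCEDB_URI / LANCEDB_API_KEY / LANCEDB_REGION from the environment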
3 changes: 2 additions & 1 deletion r2r/vector_dbs/__init__.py
@@ -1,5 +1,6 @@
from .local.base import LocalVectorDB
from .pg_vector.base import PGVectorDB
from .qdrant.base import QdrantDB
from .lancedb.base import LanceDB

__all__ = ["LocalVectorDB", "PGVectorDB", "QdrantDB"]
__all__ = ["LocalVectorDB", "PGVectorDB", "QdrantDB", "LanceDB"]
147 changes: 147 additions & 0 deletions r2r/vector_dbs/lancedb/base.py
@@ -0,0 +1,147 @@
import logging
import os
from typing import Optional, Union

from r2r.core import VectorDBProvider, VectorEntry, VectorSearchResult

logger = logging.getLogger(__name__)


class LanceDB(VectorDBProvider):
def __init__(
self, provider: str = "lancedb", db_path: Optional[str] = None
) -> None:
logger.info("Initializing `LanceDB` to store and retrieve embeddings.")

super().__init__(provider)

if provider != "lancedb":
raise ValueError(
"LanceDB must be initialized with provider `lancedb`."
)

try:
import lancedb
except ImportError:
raise ValueError(
f"Error, `lancedb` is not installed. Please install it using `pip install lancedb`."
)

self.db_path = db_path
try:
self.client = lancedb.connect(
    uri=self.db_path or os.environ.get("LANCEDB_URI"),
    api_key=os.environ.get("LANCEDB_API_KEY") or None,
    region=os.environ.get("LANCEDB_REGION") or None,
)
except Exception as e:
raise ValueError(
f"Error {e} occurred while attempting to connect to the lancedb provider."
)
self.collection_name: Optional[str] = None

def initialize_collection(
self, collection_name: str, dimension: int
) -> None:
self.collection_name = collection_name

try:
import pyarrow
except ImportError:
raise ValueError(
f"Error, `pyarrow` is not installed. Please install it using `pip install pyarrow`."
)

table_schema = pyarrow.schema(
[
pyarrow.field("id", pyarrow.string()),
pyarrow.field(
"vector", pyarrow.list_(pyarrow.float32(), dimension)
),
# TODO Handle storing metadata
Review comment from sp6370 (Contributor, Author), Apr 8, 2024:

@AyushExel I need a column in the table to store metadata information associated with the vector.

Metadata has the following type:

MetadataValues = Union[str, int, float, bool, List[str]]
Metadata = Dict[str, MetadataValues]

How can I achieve this with Pyarrow/LanceDB?

]
)
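
One possible answer to the metadata question above, offered as a sketch rather than part of this PR: since MetadataValues is a mixed union, the simplest portable option is to serialize the whole dict to JSON and store it in a single utf8 column, decoding it again on read.

import json
import pyarrow

dimension = 384  # illustrative; passed into initialize_collection in the code above

# hypothetical schema variant with a JSON-encoded metadata column
table_schema = pyarrow.schema(
    [
        pyarrow.field("id", pyarrow.string()),
        pyarrow.field("vector", pyarrow.list_(pyarrow.float32(), dimension)),
        pyarrow.field("metadata", pyarrow.string()),
    ]
)

# on write: "metadata": json.dumps(entry.metadata)
# on read:  metadata = json.loads(row["metadata"]) if row.get("metadata") else {}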

try:
self.client.create_table(
name=f"{collection_name}",
on_bad_vectors="error",
schema=table_schema,
)
except Exception as e:
# TODO - Handle more appropriately - create collection fails when it already exists
pass
Review comment (Contributor):

The exception handling here could lead to silent failures which are hard to debug. Instead of simply passing when an exception occurs, consider logging the exception or re-raising it after logging.
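
A sketch of what that suggestion could look like, assuming the lancedb connection's table_names() listing (available in recent lancedb releases) can be used here:

# check for an existing table explicitly instead of swallowing every exception
if collection_name in self.client.table_names():
    logger.info(f"Table `{collection_name}` already exists; reusing it.")
else:
    try:
        self.client.create_table(
            name=collection_name,
            on_bad_vectors="error",
            schema=table_schema,
        )
    except Exception as e:
        logger.error(f"Failed to create table `{collection_name}`: {e}")
        raise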


def copy(self, entry: VectorEntry, commit=True) -> None:
raise NotImplementedError(
"LanceDB does not support the `copy` method."
)

def upsert(self, entry: VectorEntry, commit=True) -> None:
if self.collection_name is None:
raise ValueError(
"Please call `initialize_collection` before attempting to run `upsert`."
)
self.client.open_table(self.collection_name).add(
    # LanceDB's Table.add expects an iterable of records, so wrap the single dict in a list
    [
        {
            "vector": entry.vector,
            "id": entry.id,
            # TODO ADD metadata storage
        }
    ],
    mode="overwrite",
)

def upsert_entries(
self, entries: list[VectorEntry], commit: bool = True
) -> None:
if self.collection_name is None:
raise ValueError(
"Please call `initialize_collection` before attempting to run `upsert_entries`."
)

self.client.open_table(self.collection_name).add(
    [
        {
            "vector": entry.vector,
            "id": entry.id,
            # TODO ADD metadata storage
        }
        for entry in entries
    ],
    mode="overwrite",
)


def search(
self,
query_vector: list[float],
filters: dict[str, Union[bool, int, str]] = {},
limit: int = 10,
*args,
**kwargs,
) -> list[VectorSearchResult]:
if self.collection_name is None:
raise ValueError(
"Please call `initialize_collection` before attempting to run `search`."
)

results = self.client.open_table(self.collection_name).search(
query=query_vector,
# TODO implement metadata filter
).limit(limit).to_list()

return [
    VectorSearchResult(
        # prefer the stored entry id over the positional index
        str(result.get("id", idx)),
        result.get("_distance"),
        {},  # TODO Handle metadata
    )
    for idx, result in enumerate(results)
]

def create_index(self, index_type, column_name, index_options):
pass

def close(self):
pass

def filtered_deletion(
self, key: str, value: Union[bool, int, str]
) -> None:
pass

def get_all_unique_values(
self, collection_name: str, metadata_field: str, filters: dict = {}
) -> list:
pass