SciPhi-AI · raghavdixit99 · May 3, 2024 · May 6, 2024 · May 6, 2024 · May 6, 2024
diff --git a/.env.example b/.env.example
@@ -1,7 +1,7 @@
 # DB Providers
 
 ## Local
-LOCAL_DB_PATH=local.sqlite
+LOCAL_DB_PATH=lancedb
 
 # ## Postgres
 # ## For example setup, see https://supabase.com/dashboard/project/MY_PROJECT/settings/database
@@ -16,6 +16,12 @@ LOCAL_DB_PATH=local.sqlite
 ## QDRANT_PORT=your_qdrant_port
 ## QDRANT_API_KEY=your_qdrant_api_key
 
+## LanceDB
+LANCEDB_URI='/tmp/lancedb'
+## For LanceDB Cloud:
+## LANCEDB_API_KEY=sk-your_API_key
+## LANCEDB_REGION=the_region_you_set
+
 # LLM Providers
 
 ## openai

diff --git a/config.json b/config.json
@@ -42,7 +42,7 @@
     "provider": "local"
   },
   "vector_database": {
-    "provider": "local",
+    "provider": "lancedb",
     "collection_name": "demo_vecs"
   }
 }
diff --git a/lancedb b/lancedb
diff --git a/r2r/core/providers/vector_db.py b/r2r/core/providers/vector_db.py
@@ -19,7 +19,7 @@ def validate(self) -> None:
 
     @property
     def supported_providers(self) -> list[str]:
-        return ["local", "pgvector", "qdrant"]
+        return ["local", "pgvector", "qdrant", "lancedb"]
 
 
 class VectorDBProvider(Provider, ABC):

diff --git a/r2r/examples/clients/run_test_client.py b/r2r/examples/clients/run_test_client.py
@@ -4,6 +4,7 @@
 from r2r.client import R2RClient
 from r2r.core.utils import generate_id_from_label
 
+
 # Initialize the client with the base URL of your API
 base_url = "http://localhost:8000"
 client = R2RClient(base_url)
@@ -53,83 +54,86 @@
 print(f"Search response:\n{search_response}\n\n")
 
 print("Searching remote vector db with filter...")
+
 # Perform a search w/ filter
-filtered_search_response = client.search("test", 5, filters={"tags": "bulk"})
+filtered_search_response = client.search("test", 5, filters={'metadata':{"tags": "example"}})
 print(f"Search response w/ filter:\n{filtered_search_response}\n\n")
 
 print("Deleting sample document in remote vector db...")
-# Delete a document
-response = client.filtered_deletion(
-    "document_id", generate_id_from_label("doc 2")
-)
-print(f"Deletion response:\n{response}\n\n")
-
-print("Searching remote vector db with filter after deletion...")
-# Perform a search w/ filter after deletion
-post_deletion_filtered_search_response = client.search(
-    "test", 5, filters={"tags": "bulk"}
-)
-print(
-    f"Search response w/ filter+deletion:\n{post_deletion_filtered_search_response}\n\n"
-)
-
-# Example file path for upload
-# get file directory
-current_file_directory = os.path.dirname(os.path.realpath(__file__))
-
-file_path = os.path.join(current_file_directory, "..", "data", "test.pdf")
-
-print(f"Uploading and processing file: {file_path}...")
-# # Upload and process a file
-metadata = {"tags": ["example", "test"]}
-upload_pdf_response = client.upload_and_process_file(
-    generate_id_from_label("pdf 1"), file_path, metadata, None
-)
-print(f"Upload test pdf response:\n{upload_pdf_response}\n\n")
-
-print("Searching remote vector db after upload...")
-# Perform a search on this file
-pdf_filtered_search_response = client.search(
-    "what is a cool physics equation?",
-    5,
-    filters={"document_id": generate_id_from_label("pdf 1")},
-)
-print(
-    f"Search response w/ uploaded pdf filter:\n{pdf_filtered_search_response}\n"
-)
-
-
-print("Performing RAG...")
-# Perform a search on this file
-pdf_filtered_search_response = client.rag_completion(
-    "Are there any test documents?",
-    5,
-    filters={"document_id": generate_id_from_label("pdf 1")},
-)
-print(
-    f"Search response w/ uploaded pdf filter:\n{pdf_filtered_search_response}\n"
-)
-
-print("Performing RAG with streaming...")
-
-
-# Perform a RAG completion with streaming
-async def stream_rag_completion():
-    async for chunk in client.stream_rag_completion(
-        "Are there any test documents?",
-        5,
-        filters={"document_id": generate_id_from_label("pdf 1")},
-        generation_config={"stream": True},
-    ):
-        print(chunk, end="", flush=True)
-
-
-asyncio.run(stream_rag_completion())
-
-print("Fetching logs after all steps...")
-logs_response = client.get_logs()
-print(f"Logs response:\n{logs_response}\n")
 
-print("Fetching logs summary after all steps...")
-logs_summary_response = client.get_logs_summary()
-print(f"Logs summary response:\n{logs_summary_response}\n")
+# # Delete a document
+# response = client.filtered_deletion(
+#     "document_id", generate_id_from_label("doc 2")
+# )
+# print(f"Deletion response:\n{response}\n\n")
+
+# print("Searching remote vector db with filter after deletion...")
+# # Perform a search w/ filter after deletion
+# post_deletion_filtered_search_response = client.search(
+#     "test", 5, filters={"tags": "bulk"}
+# )
+# print(
+#     f"Search response w/ filter+deletion:\n{post_deletion_filtered_search_response}\n\n"
+# )
+
+
+# # Example file path for upload
+# # get file directory
+# current_file_directory = os.path.dirname(os.path.realpath(__file__))
+
+# file_path = os.path.join(current_file_directory, "..", "data", "test.pdf")
+
+# print(f"Uploading and processing file: {file_path}...")
+# # # Upload and process a file
+# metadata = {"tags": ["example", "test"]}
+# upload_pdf_response = client.upload_and_process_file(
+#     generate_id_from_label("pdf 1"), file_path, metadata, None
+# )
+# print(f"Upload test pdf response:\n{upload_pdf_response}\n\n")
+
+# print("Searching remote vector db after upload...")
+# # Perform a search on this file
+# pdf_filtered_search_response = client.search(
+#     "what is a cool physics equation?",
+#     5,
+#     filters={"document_id": generate_id_from_label("pdf 1")},
+# )
+# print(
+#     f"Search response w/ uploaded pdf filter:\n{pdf_filtered_search_response}\n"
+# )
+
+
+# print("Performing RAG...")
+# # Perform a search on this file
+# pdf_filtered_search_response = client.rag_completion(
+#     "Are there any test documents?",
+#     5,
+#     filters={"document_id": generate_id_from_label("pdf 1")},
+# )
+# print(
+#     f"Search response w/ uploaded pdf filter:\n{pdf_filtered_search_response}\n"
+# )
+
+# print("Performing RAG with streaming...")
+
+
+# # Perform a RAG completion with streaming
+# async def stream_rag_completion():
+#     async for chunk in client.stream_rag_completion(
+#         "Are there any test documents?",
+#         5,
+#         filters={"document_id": generate_id_from_label("pdf 1")},
+#         generation_config={"stream": True},
+#     ):
+#         print(chunk, end="", flush=True)
+
+
+# asyncio.run(stream_rag_completion())
+
+# print("Fetching logs after all steps...")
+# logs_response = client.get_logs()
+# print(f"Logs response:\n{logs_response}\n")
+
+# print("Fetching logs summary after all steps...")
+# logs_summary_response = client.get_logs_summary()
+# print(f"Logs summary response:\n{logs_summary_response}\n")
diff --git a/r2r/examples/configs/local_ollama_lancedb.json b/r2r/examples/configs/local_ollama_lancedb.json
@@ -0,0 +1,36 @@
+{
+    "language_model": {
+      "provider": "litellm"
+    },
+    "vector_database": {
+      "provider": "lancedb",
+      "collection_name": "demo_vecs"
+    },
+    "ingestion":{
+      "provider": "local"
+    },
+    "embedding": {
+      "provider": "sentence-transformers",
+      "search_model": "mixedbread-ai/mxbai-embed-large-v1",
+      "search_dimension": 512,
+      "batch_size": 32,
+      "text_splitter": {
+        "type": "recursive_character",
+        "chunk_size": 512,
+        "chunk_overlap": 20
+      }
+    },
+    "evals": {
+      "provider": "parea",
+      "frequency": 0.0
+    },
+    "app": {
+      "max_logs": 100,
+      "max_file_size_in_mb": 100
+    },
+    "logging_database": {
+      "provider": "local",
+      "collection_name": "demo_logs",
+      "level": "INFO"
+    }
+  }
diff --git a/r2r/examples/servers/config_pipeline.py b/r2r/examples/servers/config_pipeline.py
@@ -16,10 +16,9 @@
 
 CONFIG_OPTIONS = {
     "default": None,
+    "local_ollama_lancedb": os.path.join(configs_path, "local_ollama_lancedb.json"),
     "local_ollama": os.path.join(configs_path, "local_ollama.json"),
-    "local_ollama_qdrant": os.path.join(
-        configs_path, "local_ollama_qdrant.json"
-    ),
+    "local_ollama_qdrant": os.path.join(configs_path, "local_ollama_qdrant.json"),
     "local_ollama_with_rerank": os.path.join(
         configs_path, "local_ollama_with_rerank.json"
     ),
@@ -34,17 +33,45 @@
 }
 
 
+
 def create_app(config_name: str = "default", pipeline_name: str = "qna"):
+    """Build the example application for the selected config and pipeline.
+
+    The CONFIG_OPTION and PIPELINE_OPTION environment variables, when set
+    and non-empty, take precedence over the corresponding arguments.
+
+    Args:
+        config_name: Key into CONFIG_OPTIONS.
+        pipeline_name: Key into PIPELINE_OPTIONS.
+
+    Returns:
+        The object returned by E2EPipelineFactory.create_pipeline.
+
+    Raises:
+        KeyError: If a resolved name is missing from its options mapping.
+    """
     config_name = os.getenv("CONFIG_OPTION") or config_name
     pipeline_name = os.getenv("PIPELINE_OPTION") or pipeline_name
 
     config_path = CONFIG_OPTIONS[config_name]
     pipeline_impl = PIPELINE_OPTIONS[pipeline_name]
 
+    # Delete the past dataset for testing. LANCEDB_URI may be unset (getenv
+    # returns None, which os.path.exists rejects) and a local LanceDB dataset
+    # is a directory, which os.remove cannot delete.
+    lancedb_uri = os.getenv("LANCEDB_URI")
+    if lancedb_uri and os.path.isdir(lancedb_uri):
+        import shutil  # local import: only this cleanup step needs it
+
+        shutil.rmtree(lancedb_uri)
+    elif lancedb_uri and os.path.isfile(lancedb_uri):
+        os.remove(lancedb_uri)
+
     app = E2EPipelineFactory.create_pipeline(
         config=R2RConfig.load_config(config_path),
         rag_pipeline_impl=pipeline_impl,
     )
+    print('app created...')
     return app