Move from whitelisting parsers to blacklisting (#445)
* Move from whitelisting parsers to blacklisting

* Check in

* Update docs

* Move from 415 to 403

* Readd, Ellipsis is whack
NolanTrem committed Jun 13, 2024
1 parent efff384 commit ce0665a
Showing 14 changed files with 186 additions and 204 deletions.
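One of the commit notes above is the move from HTTP 415 to 403 when a request targets an excluded parser. As a rough, hypothetical sketch of what such a guard can look like (not the actual R2R handler; the endpoint and constant names here are invented), a FastAPI route might reject blacklisted file types like this:

```python
from fastapi import FastAPI, HTTPException, UploadFile

app = FastAPI()

# Hypothetical blacklist mirroring the "excluded_parsers" keys introduced in this commit.
EXCLUDED_PARSERS = {"mp4"}

@app.post("/ingest")
async def ingest(file: UploadFile):
    extension = (file.filename or "").rsplit(".", 1)[-1].lower()
    if extension in EXCLUDED_PARSERS:
        # Per the commit message, the rejection status moves from 415 to 403.
        raise HTTPException(status_code=403, detail=f"Parser for '{extension}' is excluded.")
    return {"status": "accepted", "filename": file.filename}
```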
18 changes: 2 additions & 16 deletions config.json
@@ -35,22 +35,8 @@
}
},
"ingestion":{
"selected_parsers": {
"csv": "default",
"docx": "default",
"html": "default",
"json": "default",
"md": "default",
"pdf": "default",
"pptx": "default",
"txt": "default",
"xlsx": "default",
"gif": "default",
"png": "default",
"jpg": "default",
"jpeg": "default",
"svg": "default",
"mp3": "default"
"excluded_parsers": {
"mp4": "default"
}
},
"logging": {
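The config change above replaces the allow-list (`selected_parsers`) with a block-list (`excluded_parsers`): every built-in parser is now enabled by default and only the listed extensions are skipped. A minimal sketch of how such a blacklist might be applied at ingestion time (illustrative only, assuming a `config.json` shaped like the one above; these helper functions are assumptions, not R2R's actual internals):

```python
import json

def load_excluded_parsers(config_path: str = "config.json") -> set[str]:
    """Read the ingestion blacklist from the config file."""
    with open(config_path) as f:
        config = json.load(f)
    return set(config.get("ingestion", {}).get("excluded_parsers", {}))

def should_ingest(file_path: str, excluded: set[str]) -> bool:
    """Return True unless the file's extension is on the blacklist."""
    extension = file_path.rsplit(".", 1)[-1].lower()
    return extension not in excluded

excluded = load_excluded_parsers()
for path in ["report.pdf", "talk.mp4"]:
    print(path, "->", "ingest" if should_ingest(path, excluded) else "skip")
```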
19 changes: 7 additions & 12 deletions docs/pages/cookbooks/client-server.mdx
@@ -11,21 +11,12 @@ This document extends the [R2R Demo](/getting-started/quickstart) by demonstrati
To set up the R2R server, follow these steps:

<Callout type="info" emoji="🐳">
Docker makes it convenient to run R2R without managing your local environment.
Docker makes it convenient to run R2R without managing your local environment. Learn more [here](/getting-started/installation).
</Callout>

<details>
<summary>Docker Installation</summary>

To run R2R using Docker, you can use the following commands:

```bash filename="bash" copy
docker pull emrgntcmplxty/r2r:latest
```

This will pull the latest R2R Docker image.

Be sure to set an OpenAI API key in your environment and then run the container with:
<summary>Running with Docker</summary>
Run your Docker container using the following command:

```bash filename="bash" copy
docker run -d \
@@ -57,6 +48,10 @@ This command starts the R2R container with the following options:

The R2R framework includes a [python client](https://github.com/SciPhi-AI/R2R/blob/main/r2r/main/r2r_client.py) and a [typescript client](https://github.com/SciPhi-AI/r2r-js/blob/main/src/r2rClient.ts). All of the demo commands can be run with the python client using the `--client_server_mode` parameter to run the demo in Client-Server mode. Note that these commands will only run successfully if the server is active at http://0.0.0.0:8000.

<Callout type="warning" emoji="️⚠️">
Be sure to run client commands in a new terminal!
</Callout>

### Example Commands

1. **Ingest Documents as Files**:
63 changes: 18 additions & 45 deletions docs/pages/cookbooks/local-rag.mdx
@@ -5,24 +5,19 @@ import { Callout } from 'nextra/components'

### Installation

<Tabs items={['Docker', 'Pip']}>


We can install r2r with the necessary optional dependencies to run locally using `pip`

<Tabs.Tab>
```bash filename="bash" copy
pip install 'r2r[local-embedding]'
```

<Callout type="info" emoji="🐳">
Docker makes it convenient to run R2R without managing your local environment.
Docker makes it convenient to run R2R without managing your local environment. Learn more [here](/getting-started/installation).
</Callout>

First, download the latest R2R image from Dockerhub:


```bash filename="bash" copy
docker pull emrgntcmplxty/r2r:latest
```

Then, run the service:
<details>
<summary>Running with Docker</summary>
Run your Docker container using the following command:

```bash filename="bash" copy
docker run -d \
@@ -42,23 +37,7 @@
- `-e OLLAMA_API_BASE=http://host.docker.internal:11434`: Specifies the Ollama API base URL.
- `-e CONFIG_OPTION=local_ollama`: Selects the "local_ollama" configuration option.
- `emrgntcmplxty/r2r:latest`: Specifies the Docker image to use.


Lastly, install the R2R client using `pip`

```bash filename="bash" copy
pip install 'r2r'
```

</Tabs.Tab>

<Tabs.Tab>

We can install r2r with the necessary optional dependencies to run locally using `pip`

```bash filename="bash" copy
pip install 'r2r[local-embedding]'
```
</details>

R2R supports `Ollama`, a popular tool for Local LLM inference. Ollama is provided through a connection managed by the `litellm` library.

@@ -88,16 +67,14 @@ To streamline this process, we've provided pre-configured local settings in the
}
},
"ingestion":{
"selected_parsers": {
"csv": "default",
"docx": "default",
"html": "default",
"json": "default",
"md": "default",
"pdf": "default",
"pptx": "default",
"txt": "default",
"xlsx": "default"
"excluded_parsers": {
"gif": "default",
"jpeg": "default",
"jpg": "default",
"png": "default",
"svg": "default",
"mp3": "default",
"mp4": "default"
}
}
}
@@ -109,10 +86,6 @@ This chosen config modification above instructs R2R to use the `sentence-transfo

A local vector database will be used to store the embeddings. The current default is a minimal sqlite implementation.

</Tabs.Tab>

</Tabs>


## Ingesting and Embedding Documents

@@ -121,7 +94,7 @@ With our environment set up and our server running in a separate process, we're
Run this command to ingest the document:

```bash filename="bash" copy
python -m r2r.examples.demo ingest_as_files --no-images=true --client_server_mode
python -m r2r.examples.demo ingest_as_files --no-media=true --client_server_mode
```

The output should look something like this:
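The local-RAG page above notes that Ollama is reached through a connection managed by `litellm`. For readers unfamiliar with that pairing, here is a small, hypothetical sketch of the kind of call litellm routes to a local Ollama server (the model name and prompt are placeholders, and this is not code from the R2R pipeline):

```python
from litellm import completion

# Assumes an Ollama server is running locally on its default port (11434).
response = completion(
    model="ollama/llama2",
    messages=[{"role": "user", "content": "Summarize what a local RAG pipeline does."}],
    api_base="http://localhost:11434",
)
print(response.choices[0].message.content)
```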
23 changes: 6 additions & 17 deletions docs/pages/deep-dive/app.mdx
@@ -91,22 +91,9 @@ The application uses a configuration file (`config.json`) to set various setting
"provider": "openai"
}
},
"ingestion": {
"selected_parsers": {
"csv": "default",
"docx": "default",
"html": "default",
"json": "default",
"md": "default",
"pdf": "default",
"pptx": "default",
"txt": "default",
"xlsx": "default",
"gif": "default",
"png": "default",
"jpg": "default",
"jpeg": "default",
"svg": "default"
"ingestion":{
"excluded_parsers": {
"mp4": "default"
}
},
"logging": {
@@ -151,7 +138,7 @@ The available options for each section are:
- `provider`: Provider to use (e.g., `"openai"`).

- **ingestion**:
- `selected_parsers`: Specifies the parsers for different file types.
- `excluded_parsers`: Specifies the parsers to exclude for different file types.
- `csv`: `"default"`.
- `docx`: `"default"`.
- `html`: `"default"`.
@@ -166,6 +153,8 @@ The available options for each section are:
- `jpg`: `"default"`.
- `jpeg`: `"default"`.
- `svg`: `"default"`.
- `mp3`: `"default"`.
- `mp4`: `"default"`.

- **logging**:
- `provider`: `"local"`, `"postgres"`, `"redis"`.
23 changes: 6 additions & 17 deletions docs/pages/deep-dive/config.mdx
@@ -40,22 +40,9 @@ The default values for the config are shown below:
"provider": "openai"
}
},
"ingestion": {
"selected_parsers": {
"csv": "default",
"docx": "default",
"html": "default",
"json": "default",
"md": "default",
"pdf": "default",
"pptx": "default",
"txt": "default",
"xlsx": "default",
"gif": "default",
"png": "default",
"jpg": "default",
"jpeg": "default",
"svg": "default"
"ingestion":{
"excluded_parsers": {
"mp4": "default"
}
},
"logging": {
@@ -100,7 +87,7 @@ The available options for each section are:
- `provider`: Provider to use (e.g., `"openai"`).

- **ingestion**:
- `selected_parsers`: Specifies the parsers for different file types.
- `excluded_parsers`: Specifies the parsers to exclude for different file types.
- `csv`: `"default"`.
- `docx`: `"default"`.
- `html`: `"default"`.
@@ -115,6 +102,8 @@ The available options for each section are:
- `jpg`: `"default"`.
- `jpeg`: `"default"`.
- `svg`: `"default"`.
- `mp3`: `"default"`.
- `mp4`: `"default"`.

- **logging**:
- `provider`: `"local"`, `"postgres"`, `"redis"`.
75 changes: 42 additions & 33 deletions docs/pages/getting-started/installation.mdx
@@ -1,48 +1,57 @@
import { Tabs } from 'nextra/components'

## Quick Installation
<details open>
<summary><b>Installing with Pip</b>&nbsp;🐍 </summary>

Install R2R using `pip` to get started with minimal setup. This method will get you set up with the default configuration:
<Tabs items={['Installing with Pip 🐍', 'Installing with Docker 🐳']}>
<Tabs.Tab>
Install R2R using `pip` to get started with minimal setup. This method will get you set up with the default configuration:

```bash
# use the `'r2r[all]'` to download all required deps
pip install r2r
```bash
# use the `'r2r[all]'` to download all required deps
pip install r2r

# setup env
export OPENAI_API_KEY=sk-...
```
</details>
# setup env
export OPENAI_API_KEY=sk-...
```
</Tabs.Tab>

<details>
<summary><b>Installing with Docker</b>&nbsp;🐳</summary>
<Tabs.Tab>
Docker makes it convenient to run R2R without managing your local environment.

To run R2R using Docker, you can use the following commands:
You'll first need to pull the latest R2R Docker image.

```bash filename="bash" copy
docker pull emrgntcmplxty/r2r:latest
```
If you plan to use hosted LLMs, such as OpenAI models, run:
```bash filename="bash" copy
docker pull emrgntcmplxty/r2r:v2.0.15_slim
```

This will pull the latest R2R Docker image.
This downloads the most recent Docker image and doesn't include the optional dependencies for local LLMs, which are quite large.

Be sure to set an OpenAI API key in your environment and then run the container with:
If you plan to run local LLMs, pull the Docker image using the command:
```bash filename="bash" copy
docker pull emrgntcmplxty/r2r:latest
```

```bash filename="bash" copy
docker run -d \
--name r2r \
--add-host=host.docker.internal:host-gateway \
-p 8000:8000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
emrgntcmplxty/r2r:latest
```
Be sure to set an OpenAI API key in your environment and then run the container with:

This command starts the R2R container with the following options:
```bash filename="bash" copy
docker run -d \
--name r2r \
--add-host=host.docker.internal:host-gateway \
-p 8000:8000 \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
emrgntcmplxty/r2r:latest
```

- `--name r2r`: Assigns the name "r2r" to the container.
- `--add-host=host.docker.internal:host-gateway`: Adds a host entry for the Docker host.
- `-p 8000:8000`: Maps port 8000 of the container to port 8000 of the host.
- `-e OPENAI_API_KEY=$OPENAI_API_KEY`: Pulls your OpenAI API key from your local environment for use in the container.
- `emrgntcmplxty/r2r:latest`: Specifies the Docker image to use.
</details>
This command starts the R2R container with the following options:

- `--name r2r`: Assigns the name "r2r" to the container.
- `--add-host=host.docker.internal:host-gateway`: Adds a host entry for the Docker host.
- `-p 8000:8000`: Maps port 8000 of the container to port 8000 of the host.
- `-e OPENAI_API_KEY=$OPENAI_API_KEY`: Pulls your OpenAI API key from your local environment for use in the container.
- `emrgntcmplxty/r2r:latest`: Specifies the Docker image to use.
</Tabs.Tab>
</Tabs>

## Full Installation

18 changes: 8 additions & 10 deletions r2r/examples/configs/local_ollama.json
@@ -13,16 +13,14 @@
}
},
"ingestion":{
"selected_parsers": {
"csv": "default",
"docx": "default",
"html": "default",
"json": "default",
"md": "default",
"pdf": "default",
"pptx": "default",
"txt": "default",
"xlsx": "default"
"excluded_parsers": {
"gif": "default",
"jpeg": "default",
"jpg": "default",
"png": "default",
"svg": "default",
"mp3": "default",
"mp4": "default"
}
}
}
7 changes: 4 additions & 3 deletions r2r/examples/demo.py
@@ -141,15 +141,16 @@ def update_as_documents(self, file_tuples: Optional[list[tuple]] = None):
print(response)

def ingest_as_files(
self, file_paths: Optional[list[str]] = None, no_images=False
self, file_paths: Optional[list[str]] = None, no_media=False
):
file_paths = file_paths or self.default_files

if no_images:
if no_media:
excluded_types = ["jpeg", "jpg", "png", "svg", "mp3", "mp4"]
file_paths = [
file_path
for file_path in file_paths
if file_path.split(".")[-1] != "png"
if file_path.split(".")[-1] not in excluded_types
]

ids = [
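The `demo.py` change above broadens the old `--no-images` flag into `--no-media`, filtering out every common media extension rather than only `.png`. A quick standalone illustration of the new filter's effect (the sample file names are made up):

```python
excluded_types = ["jpeg", "jpg", "png", "svg", "mp3", "mp4"]
file_paths = ["aristotle.txt", "screenshot.png", "lecture.mp3", "pg_essay_1.html"]

# Keep only paths whose extension is not on the media blacklist.
kept = [path for path in file_paths if path.split(".")[-1] not in excluded_types]
print(kept)  # ['aristotle.txt', 'pg_essay_1.html']
```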
1 change: 0 additions & 1 deletion r2r/examples/servers/configurable_pipeline.py
@@ -45,7 +45,6 @@ class PipelineType(Enum):
def r2r_app(
config_name: str = "default",
pipeline_type: PipelineType = PipelineType.QNA,
no_images: bool = False,
) -> FastAPI:
config_name = os.getenv("CONFIG_OPTION") or config_name

