# Document text search

## Setup

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["TOKENIZERS_PARALLELISM"]="true"
import tiledb
from tiledb.vector_search.object_api import object_index
from tiledb.vector_search.object_readers import DirectoryTextReader
from tiledb.vector_search.embeddings import SentenceTransformersEmbedding

# Demo locations: the source documents and the vector index live side by side
# under a single base directory.
dataset = "documents"
base_uri = "/tmp/" + dataset + "_demo"
documents_uri = base_uri + "/documents"
index_uri = base_uri + "/index"

# Empty TileDB configuration, used to construct the VFS handle below.
config = dict()
vfs = tiledb.VFS(config=config)

## Create vector search index

We point the reader at a document directory that contains multiple files of different types (.pdf, .docx, .html, .jpg, .png), some of them in sub-directories.

In [2]:
# Print the contents of the documents directory, then of its two sub-directories.
for listed_dir in (documents_uri, f"{documents_uri}/blogs", f"{documents_uri}/img"):
    print(os.listdir(listed_dir))

['blogs', '.DS_Store', 'img', 'TileDB_Vector_Search_in_LangChain.docx', 'TileDB_Vector_Search_Updates.docx', 'VLDB17_TileDB.pdf']
['TileDB_Vector_Search_101.html', '.DS_Store']
['.DS_Store', 'TileDB_embedded_arch.png', 'TileDB_cloud_arch.jpg']


Create a vector index using an open-source text embedding model from Hugging Face (`BAAI/bge-small-en-v1.5`).

In [None]:
# Start from an empty index directory so that re-running this cell rebuilds
# the index from scratch instead of reusing a previous one.
if vfs.is_dir(index_uri):
    vfs.remove_dir(index_uri)
vfs.create_dir(index_uri)

# Reader over the documents directory. The glob pattern recurses into
# sub-directories and excludes names that start with "." (e.g. .DS_Store).
# NOTE(review): chunk_size is presumably measured in characters -- confirm
# against the text splitter's documentation.
reader = DirectoryTextReader(
    uri=documents_uri,
    glob="**/[!.]*",
    config=config,
    text_splitter="RecursiveCharacterTextSplitter",
    text_splitter_kwargs=dict(chunk_size=1000),
)

# Embedding used to turn text into vectors; `dimensions` is passed explicitly
# alongside the model name.
embedding = SentenceTransformersEmbedding(
    model_name_or_path="BAAI/bge-small-en-v1.5",
    dimensions=384,
)

# Create the object index that ties the reader and the embedding together,
# then ingest the documents into it.
index = object_index.create(
    uri=index_uri,
    index_type="IVF_FLAT",
    object_reader=reader,
    embedding=embedding,
    config=config,
)
index.update_index(files_per_partition=100, config=config)

## Query

Text similarity query restricted by file type.

In [4]:
def display_results(results, query_index=0):
    """Print the file path and text of every result returned for one query.

    Args:
        results: Mapping with ``"file_path"`` and ``"text"`` entries, each
            indexable by query position and yielding one sequence of values
            per query (the metadata returned by ``index.query``).
        query_index: Position of the query whose results are printed.
            Defaults to 0, the first (or only) query.
    """
    file_paths = results["file_path"][query_index]
    texts = results["text"][query_index]
    # Walk both sequences in lockstep instead of tracking a manual counter.
    for file_path, text in zip(file_paths, texts):
        print(f"File: {file_path}")
        print(f"Text: {text}")

def pdf_filter_fn(row):
    """Return True when the row's ``file_path`` points to a PDF file.

    The check is on the extension at the end of the path (case-insensitive),
    not on ".pdf" appearing anywhere in it, so paths such as
    ``notes.pdf.html`` or ``my.pdfs/readme.docx`` are rejected.

    Args:
        row: Object supporting ``row["file_path"]`` and returning the path as
            a string. Presumably a row of the index metadata, since this
            function is passed as ``metadata_df_filter_fn`` -- confirm.

    Returns:
        bool: True if the path ends with ``.pdf``, False otherwise.
    """
    return row["file_path"].lower().endswith(".pdf")

# Ask for the single nearest match (k=1) to the query text, keeping only rows
# accepted by pdf_filter_fn. nprobe is set to the index's partition count.
# Only metadata is requested back, not the objects themselves.
distances, _, results = index.query(
    {"text": ["sparse arrays"]},
    metadata_df_filter_fn=pdf_filter_fn,
    k=1,
    nprobe=index.index.partitions,
    return_objects=False,
    return_metadata=True,
)
display_results(results)

File: file:///tmp/documents_demo/documents/VLDB17_TileDB.pdf
Text: 359

6.2 Sparse Arrays

We next focus on sparse arrays, comparing TileDB with Vertica+Z (gzip-compressed and following SRAM [19]) and SciDB on the AIS dataset. HDF5 is not optimized for sparse arrays, thus we omit it from these experiments.
