# Text search for Wikipedia

## Setup

In [1]:
# Setup cell: imports, global switches, and a fresh working directory for the demo.
import warnings
# Ignore every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import os
# Set before any tokenizer is loaded; presumably read by the Hugging Face
# tokenizers library used by the embedding model — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"]="true"
import tiledb
import numpy as np
from tiledb.vector_search.object_api import object_index
from tiledb.vector_search.embeddings import SentenceTransformersEmbedding
from tiledb.vector_search.object_readers import TileDB1DArrayReader

# All demo artefacts live under one base directory so they can be wiped together.
dataset = "wikipedia"
base_uri = f"/tmp/{dataset}_demo"
wikipedia_uri = f"{base_uri}/wikipedia_simple"  # TileDB array holding the raw articles
index_uri = f"{base_uri}/wikipedia_index"  # vector search index built over the articles
vfs = tiledb.VFS()
# Remove any previous run's output, then recreate an empty base directory.
if vfs.is_dir(base_uri):
    vfs.remove_dir(base_uri)
vfs.create_dir(base_uri)

## Save data to TileDB

Download the Simple English Wikipedia dataset and save it in a TileDB array.

In [2]:
from datasets import load_dataset
import numpy as np
import tiledb

# Load the "20220301.simple" configuration of the "wikipedia" dataset.
wiki_simple = load_dataset("wikipedia", "20220301.simple")
train_split = wiki_simple["train"]

# Drop any array left over from a previous run so the cell can be re-executed.
if vfs.is_dir(wikipedia_uri):
    vfs.remove_dir(wikipedia_uri)

# Sparse 1D array keyed by a numeric article id, with one string attribute per field.
array_schema = tiledb.ArraySchema(
    domain=tiledb.Domain([
        tiledb.Dim(
            name="external_id",
            dtype=np.uint64,
            # Upper bound is kept below the uint64 maximum; presumably headroom
            # for tile-extent arithmetic — TODO confirm.
            domain=(0, np.iinfo(np.uint64).max-100000),
            tile=1000,
        )
    ]),
    attrs=[
        tiledb.Attr(name="url", dtype=str),
        tiledb.Attr(name="title", dtype=str),
        tiledb.Attr(name="text", dtype=str),
    ],
    sparse=True
)
tiledb.Array.create(wikipedia_uri, array_schema)

size = train_split.num_rows
# Row position doubles as the external id: 0, 1, ..., size - 1.
external_ids = np.arange(size, dtype=np.uint64)

# Read each column once instead of indexing the dataset row by row: the original
# loop evaluated `wiki_simple["train"][i]` three times per article, materialising
# every row three times.
urls = np.array([str(value) for value in train_split["url"]], dtype="O")
titles = np.array([str(value) for value in train_split["title"]], dtype="O")
texts = np.array([str(value) for value in train_split["text"]], dtype="O")

# Write all articles in a single sparse write keyed by external id.
with tiledb.open(wikipedia_uri, mode='w') as A:
    A[external_ids] = {"url": urls, "title": titles, "text": texts}

## Create vector search index

In [3]:
# Reader over the article array written above; the same array URI is passed for
# both the objects and their metadata.
reader = TileDB1DArrayReader(uri=wikipedia_uri, metadata_uri=wikipedia_uri)
# Sentence-transformers model used to embed text; configured here for 384 dimensions.
embedding = SentenceTransformersEmbedding(model_name_or_path='BAAI/bge-small-en-v1.5', dimensions=384)

# Drop any index left over from a previous run so the cell can be re-executed.
if vfs.is_dir(index_uri):
    vfs.remove_dir(index_uri)

# Create an IVF_FLAT object index bound to the reader and embedding above.
index = object_index.create(
    uri=index_uri,
    index_type="IVF_FLAT",
    object_reader=reader,
    embedding=embedding,
)
# Populate the index; presumably this embeds the objects supplied by the reader
# and ingests the resulting vectors — TODO confirm against the library docs.
# NOTE(review): workers / max_tasks_per_stage / partition_tile_size look like
# parallelism and batching knobs; verify their exact meaning before tuning.
index.update_index(
    workers=2,
    max_tasks_per_stage=2,
    partition_tile_size=5000,
)

## Query

In [4]:
def display_results(results):
    """Print the URL of every hit returned for the first query.

    Args:
        results: Mapping from attribute name to per-query result sequences, as
            returned by ``index.query``. ``results["url"][0]`` holds the URLs of
            the hits for the first query. Only the ``"url"`` entry is read.
    """
    # Iterate the URLs directly: the previous version looped over the "text"
    # results only to drive a manual counter into the URL list.
    for url in results["url"][0]:
        print(f"url: {url}")

# Free-text query, keyed by the attribute the embedding is computed from.
query_objects = {"text": ["planet of the solar system"]}

# Fetch the 20 nearest articles, probing 10 partitions, without the metadata payload.
distances, results = index.query(
    query_objects,
    k=20,
    nprobe=10,
    return_metadata=False,
)
display_results(results)

url: https://simple.wikipedia.org/wiki/Solar%20System
url: https://simple.wikipedia.org/wiki/Planet
url: https://simple.wikipedia.org/wiki/Jupiter
url: https://simple.wikipedia.org/wiki/Earth
url: https://simple.wikipedia.org/wiki/Neptune
url: https://simple.wikipedia.org/wiki/Uranus
url: https://simple.wikipedia.org/wiki/Saturn
url: https://simple.wikipedia.org/wiki/Asteroid%20belt
url: https://simple.wikipedia.org/wiki/Mars
url: https://simple.wikipedia.org/wiki/Pluto%20%28disambiguation%29
url: https://simple.wikipedia.org/wiki/Venus
url: https://simple.wikipedia.org/wiki/Mercury%20%28planet%29
url: https://simple.wikipedia.org/wiki/Neptune%20%28disambiguation%29
url: https://simple.wikipedia.org/wiki/Uranus%20%28disambiguation%29
url: https://simple.wikipedia.org/wiki/Venus%20%28disambiguation%29
url: https://simple.wikipedia.org/wiki/Star
url: https://simple.wikipedia.org/wiki/Astronomy
url: https://simple.wikipedia.org/wiki/Asteroid
url: https://simple.wikipedia.org/wiki/Outer%20