In [1]:
# Creating tables and adding data
# `Vector` replaces the deprecated lowercase `vector()` helper and, unlike it,
# does not share its name with the `vector` field declared on the model.
from lancedb.pydantic import Vector, LanceModel

class CatsAndDogs(LanceModel):
    """Schema for one animal row stored in the LanceDB table.

    Attributes:
        vector: 2-dimensional embedding that the search cells query against.
        species: Animal species, e.g. "cat" or "dog".
        breed: Breed name.
        weight: Body weight (the notebook does not state a unit).
    """
    vector: Vector(2) # type: ignore
    species: str
    breed: str
    weight: float

In [None]:
import lancedb
# Open a LanceDB connection at ".lancedb" (presumably a directory relative to
# the current working directory — confirm for your environment).
db = lancedb.connect(".lancedb")
table_name = "cats_and_dogs"
# Drop any table left over from an earlier run so this cell can be re-executed;
# ignore_missing=True is passed so a missing table is not treated as an error.
db.drop_table(table_name, ignore_missing=True)
# Create the table from the CatsAndDogs schema only; no rows are supplied here.
table = db.create_table(table_name, schema=CatsAndDogs)

In [4]:
# Build the first batch of rows to add to the db.
# Each tuple is (embedding, breed, weight); both animals in this batch are cats.
data = [
    CatsAndDogs(vector=embedding, species="cat", breed=breed, weight=weight)
    for embedding, breed, weight in (
        ([1., 0.], "shorthair", 12.),
        ([-1., 0.], "himalayan", 9.5),
    )
]

In [5]:
# Now add all the data to the db (LanceDB).
# Each model instance is converted to a plain dict of its fields before insertion.
table.add([dict(d) for d in data])

AddResult(version=2)

In [6]:
# Peek at the first rows as a pandas DataFrame to confirm the insert.
table.head().to_pandas()

Unnamed: 0,vector,species,breed,weight
0,"[1.0, 0.0]",cat,shorthair,12.0
1,"[-1.0, 0.0]",cat,himalayan,9.5


In [7]:
# Second batch: two dogs. `data` is rebound here, so it no longer holds the
# cat rows built earlier. Vector components are written as float literals
# throughout, matching the other rows.
data = [
    CatsAndDogs(
        vector=[0., 10.],
        species="dog",
        breed="samoyed",
        weight=47.5,
    ),
    CatsAndDogs(
        vector=[0., -1.],
        species="dog",
        breed="corgi",
        weight=26.,
    )
]

In [8]:
# Append the current batch in `data` to the table, one dict per model instance.
table.add([dict(d) for d in data])

AddResult(version=3)

In [11]:
# Show the first rows again; the table should now contain both batches.
table.head().to_pandas()

Unnamed: 0,vector,species,breed,weight
0,"[1.0, 0.0]",cat,shorthair,12.0
1,"[-1.0, 0.0]",cat,himalayan,9.5
2,"[0.0, 10.0]",dog,samoyed,47.5
3,"[0.0, -1.0]",dog,corgi,26.0


In [13]:
# Querying the tables to fetch the data
# Example: given an animal with embedding [10.5, 10.], fetch the nearest animal in the db
vec = [10.5, 10.]
# No metric is specified here. The recorded _distance for the samoyed at [0, 10]
# is 110.25 = 10.5**2, which matches squared Euclidean distance.
table.search(vec).limit(1).to_pandas()

Unnamed: 0,vector,species,breed,weight,_distance
0,"[0.0, 10.0]",dog,samoyed,47.5,110.25


In [14]:
# Use cosine distance to fetch the animal.
# In the recorded output the nearest row changes from the samoyed to the shorthair cat
# at [1, 0]: its _distance of 0.275862 equals 1 - 10.5 / 14.5, i.e. one minus the
# cosine similarity between the two vectors.
table.search(vec).metric("cosine").limit(1).to_pandas()

Unnamed: 0,vector,species,breed,weight,_distance
0,"[1.0, 0.0]",cat,shorthair,12.0,0.275862


In [18]:
# Restrict the search to rows matching the filter species='cat' and return
# the two nearest matches.
table.search(vec).limit(2).where("species='cat'").to_pandas()

Unnamed: 0,vector,species,breed,weight,_distance
0,"[1.0, 0.0]",cat,shorthair,12.0,190.25
1,"[-1.0, 0.0]",cat,himalayan,9.5,232.25


#### Creating ANN indices

For larger tables (e.g., >1M rows), searching through all of the vectors becomes quite slow. This is where an Approximate Nearest Neighbor (ANN) index comes into play. While there are many different ANN indexing algorithms, they all share the same purpose: to shrink the search space as much as possible while losing as little accuracy as possible.

In [19]:
from lance.vector import vec_to_table
import numpy as np

# Seed NumPy's global RNG so the random vectors below (and later np.random
# draws made in notebook order) are reproducible on Restart & Run All.
np.random.seed(42)

# 100,000 random vectors with 16 dimensions each.
mat = np.random.randn(100_000, 16)
# `table_name` and `table` are rebound here and now refer to the ANN exercise
# table rather than "cats_and_dogs".
table_name = "ann_exercise"
# Drop any leftover table from a previous run so the cell can be re-executed.
db.drop_table(table_name, ignore_missing=True)
# vec_to_table wraps the matrix in a tabular form that create_table accepts
# (presumably a pyarrow Table with a "vector" column — confirm against lance docs).
table = db.create_table(table_name, vec_to_table(mat))

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Draw one random 16-dimensional query and keep it in `query` so the same
# vector can be searched again later.
query = np.random.randn(16)
# No index has been created on this table at this point in the notebook.
table.search(query).limit(10).to_pandas()

Unnamed: 0,vector,_distance
0,"[-0.3358078, 0.42406148, -0.57194287, -0.25900...",4.07689
1,"[0.49186453, -0.80709034, 0.7919387, -0.197119...",4.685083
2,"[0.04031391, 0.09748768, 0.30579472, -0.157306...",5.24686
3,"[0.61035883, -0.046675533, 0.4486289, 0.174424...",5.332796
4,"[-0.4613902, -0.032966405, -0.4846418, -0.3886...",5.779507
5,"[-0.28218934, -0.100541055, 0.5607945, -0.2647...",5.833094
6,"[0.48181796, -0.5484441, -0.3380575, 0.8727336...",5.859091
7,"[0.64952034, 1.3679411, 0.36006668, 0.7065148,...",5.870637
8,"[-0.3884268, 0.42994508, -0.39320529, 0.623682...",6.086878
9,"[1.0204532, 0.55047584, 0.111123614, 0.2338016...",6.148514


In [21]:
# Baseline timing before any index is created; every run uses a fresh random query.
%timeit table.search(np.random.randn(16)).limit(10).to_arrow()

11.1 ms ± 382 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
# Now create an index.
# We'll create an IVFPQ index (partition-based index with product quantization compression) using LanceDB.

# With the arguments below, the 100,000 x 16 table gets on average about 6,250 rows
# per partition and 2-dimensional PQ subvectors:

# Total vectors / number of partitions = number of vectors in each partition   (100_000 / 16 = 6_250)
# Total dimensions / number of subvectors = number of dimensions in each subvector   (16 / 8 = 2)

table.create_index(num_partitions=16, num_sub_vectors=8)

In [23]:
# Re-run the same `query` now that the index exists. In the recorded output the
# _distance values and part of the top-10 differ from the pre-index search,
# which is the accuracy trade-off of an approximate index.
table.search(query).limit(10).to_pandas()

Unnamed: 0,vector,_distance
0,"[-0.3358078, 0.42406148, -0.57194287, -0.25900...",3.82304
1,"[0.61035883, -0.046675533, 0.4486289, 0.174424...",4.994966
2,"[0.49186453, -0.80709034, 0.7919387, -0.197119...",5.012519
3,"[0.04031391, 0.09748768, 0.30579472, -0.157306...",5.085307
4,"[-0.4613902, -0.032966405, -0.4846418, -0.3886...",5.582158
5,"[0.64952034, 1.3679411, 0.36006668, 0.7065148,...",5.686636
6,"[-0.28218934, -0.100541055, 0.5607945, -0.2647...",5.731027
7,"[-0.916247, 0.105998054, 0.39105806, 0.0668836...",5.899779
8,"[1.0204532, 0.55047584, 0.111123614, 0.2338016...",5.972323
9,"[0.27626112, -0.14657064, 0.43439966, -0.71540...",6.265519


In [24]:
# Same timing as before, now with the index in place; compare with the pre-index measurement.
%timeit table.search(np.random.randn(16)).limit(10).to_arrow()

5.61 ms ± 429 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
# Deleting rows
# `table` was last bound to the ANN exercise table, so look up "cats_and_dogs"
# by name again before modifying it.
table = db["cats_and_dogs"]
# Row count before the delete.
len(table)

4

In [26]:
# Delete every row matching the filter species='cat'.
table.delete("species='cat'")
# Row count after the delete (the recorded output drops from 4 to 2).
len(table)

2

In [27]:
# Version control
# List the table's version history. In the recorded output, version 1 is the
# empty table, and versions 2 and 3 follow the two add() calls (2 and 4 rows).
table.list_versions()

[{'version': 1,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 42, 20, 672864),
  'metadata': {'total_data_file_rows': '0',
   'total_data_files': '0',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '0',
   'total_fragments': '0',
   'total_rows': '0'}},
 {'version': 2,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 43, 49, 164513),
  'metadata': {'total_data_file_rows': '2',
   'total_data_files': '1',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '1166',
   'total_fragments': '1',
   'total_rows': '2'}},
 {'version': 3,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 44, 35, 813640),
  'metadata': {'total_data_file_rows': '4',
   'total_data_files': '2',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '2332',
   'total_fragments': '2',
   'total_rows': '4'}},
 {'version': 4,
  'timestamp': datetime.datetime(2025, 11, 8, 0, 1, 2, 4707),
  'metadat

In [28]:
# Look the table up by name again and confirm the current row count before restoring.
table = db["cats_and_dogs"]
len(table)

2

In [29]:
# Roll the table back to version 3, which the version listing shows as the
# 4-row state reached after both add() calls, i.e. before the cats were deleted.
table.restore(3)

In [30]:
# Row count after the restore (the recorded output is back to 4).
len(table)

4

In [31]:
# Starting from the restored table, delete every row matching species='dog'.
table.delete("species='dog'")
# List the version history again after the restore and this delete.
table.list_versions()

[{'version': 1,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 42, 20, 672864),
  'metadata': {'total_data_file_rows': '0',
   'total_data_files': '0',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '0',
   'total_fragments': '0',
   'total_rows': '0'}},
 {'version': 2,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 43, 49, 164513),
  'metadata': {'total_data_file_rows': '2',
   'total_data_files': '1',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '1166',
   'total_fragments': '1',
   'total_rows': '2'}},
 {'version': 3,
  'timestamp': datetime.datetime(2025, 11, 7, 23, 44, 35, 813640),
  'metadata': {'total_data_file_rows': '4',
   'total_data_files': '2',
   'total_deletion_file_rows': '0',
   'total_deletion_files': '0',
   'total_files_size': '2332',
   'total_fragments': '2',
   'total_rows': '4'}},
 {'version': 4,
  'timestamp': datetime.datetime(2025, 11, 8, 0, 1, 2, 4707),
  'metadat

In [32]:
# Load the whole table into pandas; in the recorded output only the two cat rows remain.
table.to_pandas()

Unnamed: 0,vector,species,breed,weight
0,"[1.0, 0.0]",cat,shorthair,12.0
1,"[-1.0, 0.0]",cat,himalayan,9.5


In [33]:
# Dropping a table
# Membership test on the connection: is there a table with this name?
"cats_and_dogs" in db

True

In [34]:
# Remove the table from the database. ignore_missing is not passed here,
# unlike the earlier setup cells.
db.drop_table("cats_and_dogs")

In [35]:
# `table` is a handle obtained before the drop; its .name (presumably
# "cats_and_dogs") is no longer found in the database, hence False.
table.name in db

False