In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch

# Create the client for the local node and call info() right away: the
# success message is only printed if that round-trip did not raise.
es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()

print('Connected to Elasticsearch!')
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'WpOI-sfBSXe9aaVkWGIQnQ',
 'name': '61929d733ddf',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-12-16T10:09:08.849001802Z',
             'build_flavor': 'default',
             'build_hash': 'd8972a71dbbd64ff17f2f4dba9ca2c3fe09fb100',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.2',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.2.3'}}


In [2]:
# Drop any previous copy of the index before creating it, so re-running this
# cell always starts from an empty index. ignore_unavailable=True is passed so
# the delete is tolerated when the index does not exist yet.
es.indices.delete(index='my_index', ignore_unavailable=True)

# Only the field type is declared for 'embedding'; no further vector options
# are set here.
embedding_mappings = {
    'properties': {
        'embedding': {'type': 'dense_vector'},
    },
}
es.indices.create(index='my_index', mappings=embedding_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [4]:
from sentence_transformers import SentenceTransformer
# Embedding model used for both the documents and the search queries below.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 700.21it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
import json
# Read the sample corpus. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, which can mis-decode
# non-ASCII text in the JSON file on some systems.
with open('../data/astronomy.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Show the first record to see which keys each document carries.
documents[0]

{'id': 1,
 'title': 'The Solar System',
 'content': 'The Solar System consists of the Sun and the objects that orbit it, including eight planets, their moons, dwarf planets, and countless small bodies like asteroids and comets.'}

In [6]:

# Build the bulk payload as alternating entries: an 'index' action line naming
# the target index, followed by the document body to store.
operations = []
for doc in documents:
    operations.append({'index': {'_index': 'my_index'}})
    operations.append(
        {
            # Keep every original field of the document and add its embedding,
            # computed from the 'content' text.
            **doc,
            'embedding': model.encode(doc['content']),
        }
    )

# refresh='wait_for' makes the call return only once the new documents are
# visible to search. Without it, the searches in the following cells can run
# before the index has refreshed and report fewer hits than were indexed.
response = es.bulk(operations=operations, refresh='wait_for')
pprint(response.body)

{'errors': False,
 'items': [{'index': {'_id': '9TSAA5wByhNgcdF6PI9B',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '9jSAA5wByhNgcdF6PI9B',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '9zSAA5wByhNgcdF6PI9B',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, '

In [7]:
# Run an unfiltered match_all query; only the total hit count is printed, as a
# check on how many documents the index holds.
response = es.search(index='my_index', query={'match_all': {}})

pprint(response["hits"]["total"])

{'relation': 'eq', 'value': 10}


In [8]:
# dense_vector values are not returned by default, so the 'embedding' field
# has to be requested explicitly.
response = es.search(index='my_index', fields=["embedding"])

# Look at the requested fields of the first hit only.
first_hit = response.body['hits']['hits'][0]
first_hit['fields']

{'embedding': [0.04063337,
  -0.002561739,
  0.054834742,
  0.009171084,
  0.031219983,
  -0.01448789,
  0.029695772,
  0.016134977,
  0.06336765,
  0.045461554,
  -0.0039837295,
  -0.05447261,
  0.048416253,
  -0.08368916,
  0.07394564,
  -0.062437825,
  0.021085346,
  0.010841272,
  0.028692521,
  0.007015274,
  -0.0070978007,
  0.046943072,
  -0.037410237,
  0.039965186,
  -0.07640924,
  0.07742979,
  -0.038383465,
  -0.06901815,
  -0.08802214,
  -0.052870747,
  0.017453754,
  0.068147555,
  0.04107507,
  0.027422281,
  0.023659457,
  0.05961626,
  -0.008039369,
  -0.057832308,
  -0.0035527064,
  0.00015244957,
  -0.0154978065,
  -0.03561315,
  0.040230557,
  0.027971894,
  0.020365497,
  -0.014169318,
  -0.06097956,
  -0.0863104,
  0.08755852,
  -0.03343516,
  0.015291126,
  0.0018471811,
  0.02236575,
  0.11882254,
  0.0032985015,
  -0.06849946,
  -0.033059202,
  -0.022321826,
  0.019228254,
  -0.06985603,
  -0.0021125656,
  -0.048423633,
  -0.0017212693,
  0.03360909,
  -0.030379

In [9]:
# Fetch the mapping as Elasticsearch currently holds it for the index and
# pretty-print the raw response body.
response = es.indices.get_mapping(index='my_index')
mapping_body = response.body
pprint(mapping_body)

{'my_index': {'mappings': {'properties': {'content': {'fields': {'keyword': {'ignore_above': 256,
                                                                             'type': 'keyword'}},
                                                      'type': 'text'},
                                          'embedding': {'dims': 384,
                                                        'index': True,
                                                        'index_options': {'ef_construction': 100,
                                                                          'm': 16,
                                                                          'rescore_vector': {'oversample': 3.0},
                                                                          'type': 'bbq_hnsw'},
                                                        'similarity': 'cosine',
                                                        'type': 'dense_vector'},
                                          '

In [14]:
# Embed the question with the same model that produced the stored embeddings,
# then run a kNN search against the 'embedding' field.
query = "What is a black hole?"
query_vector = model.encode(query)

knn_clause = {
    "field": "embedding",
    "query_vector": query_vector,
    "k": 3,               # top K similar vectors
    "num_candidates": 5,  # number of candidates to consider for KNN search
}
results = es.search(index='my_index', size=3, knn=knn_clause)

results.body['hits']['hits']

[{'_index': 'my_index',
  '_id': '9jSAA5wByhNgcdF6PI9B',
  '_score': 0.8863715,
  '_source': {'id': 2,
   'title': 'Black Holes',
   'content': 'A black hole is a region of space where the gravitational pull is so strong that nothing, not even light, can escape from it. They are formed when massive stars collapse under their own gravity.'}},
 {'_index': 'my_index',
  '_id': '_DSAA5wByhNgcdF6PI9B',
  '_score': 0.66036683,
  '_source': {'id': 8,
   'title': 'Dark Matter',
   'content': "Dark matter is a type of matter that does not emit light or energy. It cannot be observed directly but is believed to make up about 27% of the universe's total mass and energy."}},
 {'_index': 'my_index',
  '_id': '9zSAA5wByhNgcdF6PI9B',
  '_score': 0.6420158,
  '_source': {'id': 3,
   'title': 'Galaxies',
   'content': 'Galaxies are vast systems that consist of stars, stellar remnants, interstellar gas, dust, and dark matter. The Milky Way is the galaxy that contains our Solar System.'}}]

In [16]:
# Same kNN lookup for a different question, this time keeping only the single
# best match (k=1) out of 5 candidates.
query = "How do we find exoplanets?"
query_vector = model.encode(query)

nearest_neighbor = dict(
    field="embedding",
    query_vector=query_vector,
    k=1,
    num_candidates=5,
)
result = es.search(index='my_index', knn=nearest_neighbor)

result.body['hits']['hits']

[{'_index': 'my_index',
  '_id': '-TSAA5wByhNgcdF6PI9B',
  '_score': 0.8561732,
  '_source': {'id': 5,
   'title': 'Exoplanets',
   'content': 'Exoplanets, or extrasolar planets, are planets that exist outside our solar system. They vary greatly in size and composition and are often found using methods like the transit method and radial velocity.'}}]