In [None]:
!pip install transformers elasticsearch

import numpy as np
from transformers import AutoTokenizer, AutoModel
from elasticsearch import Elasticsearch
import torch

# Open a client connection to the Elasticsearch cluster over HTTPS using
# basic-auth credentials. The host, username and password below are
# placeholders and must be replaced with real values.
# NOTE(review): verify_certs=False turns off TLS certificate verification;
# confirm this is only used against a trusted development cluster.
es = Elasticsearch(
    hosts=['https://hostname:port'],
    http_auth=('username', 'password'),
    verify_certs=False,
)

# Mapping for the index: a single dense_vector field named 'embedding'
# that stores one vector per document.
mapping = {
    'properties': {
        'embedding': {
            'type': 'dense_vector',
            'dims': 768   # the number of dimensions of the dense vector
        }
    }
}

# Create the index with the mapping above, but only when it does not exist
# yet. Elasticsearch rejects a create request for an existing index, so an
# unconditional create would make every run after the first one fail here.
if not es.indices.exists(index='chapter-2'):
    es.indices.create(index='chapter-2', body={'mappings': mapping})

# Sample documents to embed and index. Each entry is a dict with a 'title'
# and a 'text' key, built from the (title, text) pairs listed below.
docs = [
    {'title': title, 'text': text}
    for title, text in (
        ('Document 1', 'This is the first document.'),
        ('Document 2', 'This is the second document.'),
        ('Document 3', 'This is the third document.'),
    )
]

# Load the pretrained 'bert-base-uncased' tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Compute one embedding per document and store it on the document itself
# under the 'embedding' key, as a plain Python list.
for doc in docs:
    encoded = tokenizer(
        doc['text'],
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    # Run the forward pass without tracking gradients.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over dimension 1 of the hidden states, then drop the leading
    # dimension so a single vector is left for this document.
    pooled = hidden_states.mean(dim=1).squeeze(0)
    doc['embedding'] = pooled.numpy().tolist()

# Send every document (including its 'embedding' field) to the 'chapter-2'
# index, one request per document.
for document in docs:
    es.index(body=document, index='chapter-2')
