#### Elasticsearch

In [1]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.


In [10]:
# !pip3 install pandas

In [5]:
from elasticsearch import Elasticsearch
import json
import os

# Client bound to the Elasticsearch HTTP endpoint used throughout the notebook.
client = Elasticsearch("http://elasticsearch:9200")

# Index-level configuration: one primary shard and no replicas.
settings = {"settings": {"number_of_shards": 1, "number_of_replicas": 0}}

# Create the index; being the last expression of the cell, the response is displayed.
client.indices.create(index="my_index", body=settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [4]:
# List the available indices by requesting the aliases of every index matching "*"
indices = client.indices.get_alias(index="*")

# Print the index names (iterating the response yields one entry per index key)
print("Índices disponibles:")
for index in indices:
    print(index)

Índices disponibles:


In [3]:
# Delete the index named "my_index"
client.indices.delete(index="my_index")

ObjectApiResponse({'acknowledged': True})

In [16]:
from elasticsearch.helpers import bulk

def importar_jsons_campo_especifico(carpeta_json, campos_deseados, batch_size=1000, index_name='my_index'):
    """Bulk-index selected fields from every ``.json`` file in one or more folders.

    Each file is parsed as a single JSON object. Only the keys listed in
    ``campos_deseados`` are kept (a missing key is stored as ``None``) and the
    resulting documents are sent in batches through ``bulk`` using the
    notebook-level ``client``.

    Args:
        carpeta_json: A single folder path (``str`` or ``os.PathLike``) or an
            iterable of folder paths to scan for ``.json`` files.
        campos_deseados: Iterable of keys to copy from each JSON object.
        batch_size: Number of documents accumulated before each bulk request.
        index_name: Name of the target index; defaults to ``'my_index'``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare path would otherwise be iterated character by character.
    if isinstance(carpeta_json, (str, os.PathLike)):
        carpetas = [carpeta_json]
    else:
        carpetas = carpeta_json

    total_documents = 0
    my_documents = []
    for carpeta in carpetas:
        for filename in os.listdir(carpeta):
            if not filename.endswith('.json'):
                continue
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(os.path.join(carpeta, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Keep only the requested fields.
            documento = {campo: data.get(campo) for campo in campos_deseados}
            my_documents.append({"_index": index_name, "_source": documento})
            total_documents += 1
            # Flush a full batch so the pending list never grows past batch_size.
            if len(my_documents) >= batch_size:
                print("Indexando", total_documents, "documentos")
                bulk(client, my_documents)
                my_documents = []
    # Flush whatever is left after the last full batch.
    if my_documents:
        print("Indexando", total_documents, "documentos")
        bulk(client, my_documents)

In [17]:
carpeta_json = ['work/json/'] # Folders containing the JSON files to import
# Keys copied from each JSON document when it is indexed
campos_deseados = ['author_name', 'first_publish_year', 'title', 'type', 'isbn', 'ebook_access']

In [19]:
# Para contar cuántos documentos JSON hay en la carpeta 'work/json/'

def contar_documentos_json(carpetas_json):
    """Return how many entries ending in ``.json`` exist across the given folders.

    Args:
        carpetas_json: Iterable of folder paths; each one is listed with
            ``os.listdir`` (non-recursive).

    Returns:
        The total number of ``.json`` entries found in all folders combined.
    """
    return sum(
        1
        for carpeta in carpetas_json
        for nombre in os.listdir(carpeta)
        if nombre.endswith('.json')
    )

carpetas_json = ['work/json/']  # Folders where the JSON files are located
# Count the .json files across those folders and report the total
total_documentos = contar_documentos_json(carpetas_json)
print("Total de documentos JSON en la carpeta:", total_documentos)

Total de documentos JSON en la carpeta: 33693


In [18]:
# Index the selected fields of every JSON file in the configured folders,
# using the function's default batch size.
importar_jsons_campo_especifico(carpeta_json, campos_deseados)

Indexando 100 documentos
Indexando 200 documentos
Indexando 300 documentos
Indexando 400 documentos


In [10]:
# Sanity check: run a match_all query against the index and print the returned hits.
# NOTE(review): no `size` is passed, so only the first page of hits is returned
# (Elasticsearch defaults to 10) — this does not list every indexed document.
response = client.search(index="my_index", body={"query": {"match_all": {}}})
print("Documentos en el índice:")
for hit in response['hits']['hits']:
    print(hit['_source'])

Documentos en el índice:
{'author_name': ['Changjak'], 'first_publish_year': 2013, 'title': '0.0 MHz', 'type': 'work', 'isbn': ['9788967970123', '8967970129'], 'ebook_access': 'no_ebook'}
{'author_name': ['Paul D. Storrie'], 'first_publish_year': 2011, 'title': '#02 Made for Each Other', 'type': 'work', 'isbn': ['1282961632', '9780761376040', '9781282961630', '0761376046'], 'ebook_access': 'no_ebook'}
{'author_name': ['Thomas Opdahl'], 'first_publish_year': 2023, 'title': '045 Dis-Charges Horror in the Prairie - Movie Script EBOOK', 'type': 'work', 'isbn': ['9781312269408', '1312269405'], 'ebook_access': 'no_ebook'}
{'author_name': ['Yuki Amemiya'], 'first_publish_year': 2013, 'title': '07-ghost', 'type': 'work', 'isbn': ['9781421549996', '1421549999'], 'ebook_access': 'no_ebook'}
{'author_name': ['Peter Galarneau Jr.'], 'first_publish_year': 2010, 'title': '0-Time', 'type': 'work', 'isbn': ['9780982512937', '0982512937'], 'ebook_access': 'no_ebook'}
{'author_name': ['Zander'], 'first_