In [8]:
"""
📌 DuckDB + Jupyter Notebook (Ejecutando dentro de Docker)
- Permite consultas rápidas sobre archivos JSONL comprimidos dentro de Docker.
- Se conecta a DuckDB y ejecuta SQL directamente.
- Muestra resultados en un dataframe de Pandas.
"""

import duckdb
import pandas as pd

# 📍 Path inside the Docker container (make sure the volume is mounted correctly)
json_path = "/data/elasticsearch/aarecords__0.json.gz"

# ISBN position space.
# ISBN12_BASE is the 12-digit prefix "978" + nine zeros, i.e. an ISBN-13 *without*
# its trailing check digit. ISBN_BUCKET_SIZE is the number of consecutive positions
# grouped into one bucket.
ISBN12_BASE = 978_000_000_000
ISBN_BUCKET_SIZE = 15_625_000

# 🚀 Connect to DuckDB (no arguments: DuckDB's default in-memory database)
db = duckdb.connect()

# SQL expression for the ISBN position of a record, built once and reused below.
# The first `search_isbn13` entry is cast to BIGINT (NULL when it is missing or
# not numeric). Integer-dividing by 10 drops the check digit so the value lines
# up with the 12-digit base; subtracting the base from the full 13-digit number
# (as this cell did before) yields offsets in the trillions instead of positions
# starting at 0.
# Assumes entries are plain 13-digit ISBN-13 strings, as the field name suggests
# — TODO confirm against the data.
isbn_pos_sql = (
    "(TRY_CAST(json_extract_string(json, "
    "'$._source.search_only_fields.search_isbn13[0]') AS BIGINT) // 10"
    f" - {ISBN12_BASE})"
)

# 🔍 Build the query: one output row per JSON record, first 10 records only.
# NOTE(review): `is_archived` compares the extracted JSON value with 'true'; if
# the flags are stored as 0/1 rather than JSON booleans it will never be true —
# confirm against the data.
# NOTE(review): `is_rare` is true whenever `oclc_holdings` is a non-empty array;
# confirm that this is the intended definition of "rare".
query = f"""
    SELECT
        json_extract_string(json, '$._id') AS id,
        {isbn_pos_sql} AS isbn_pos,
        {isbn_pos_sql} // {ISBN_BUCKET_SIZE} AS isbn_bucket,
        json_extract_string(json, '$._source.search_only_fields.search_title') AS title,
        json_extract_string(json, '$._source.search_only_fields.search_author') AS author,
        json_extract_string(json, '$._source.search_only_fields.search_publisher') AS publisher,
        TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER) AS publish_year,
        json_extract_string(json, '$._source.file_unified_data.cover_url_best') AS cover_url,
        json_extract(json, '$._source.search_only_fields.search_record_sources') AS sources,
        (
            json_extract(json, '$._source.file_unified_data.has_aa_downloads') = 'true'
            OR json_extract(json, '$._source.file_unified_data.has_aa_exclusive_downloads') = 'true'
            OR json_extract(json, '$._source.file_unified_data.has_torrent_paths') = 'true'
        ) AS is_archived,
        (
            json_extract(json, '$._source.file_unified_data.classifications_unified.oclc_holdings') IS NOT NULL
            AND json_array_length(json_extract(json, '$._source.file_unified_data.classifications_unified.oclc_holdings')) > 0
        ) AS is_rare
    FROM read_ndjson_objects(['{json_path}'])
    LIMIT 10;
"""

# 🔥 Run the query and show the result as a pandas DataFrame
df = db.execute(query).fetchdf()
display(df)


Unnamed: 0,name


In [None]:
# Sanity check: list the contents of /data/ through the shell.
!ls /data/
