# Basic DuckDB Queries

This notebook demonstrates basic DuckDB functionality for analyzing JSON data stored as gzipped, newline-delimited JSON (NDJSON) files.

## Setup and Imports

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt

# Open an in-memory DuckDB database; this single connection is reused by the
# query cells that follow.
conn = duckdb.connect(database=':memory:', read_only=False)
# The status marker was mojibake (UTF-8 bytes of the check-mark emoji decoded
# as cp1252); emit the intended character instead.
print("✅ Connected to DuckDB")

## 1. Basic Record Query

This query extracts basic book information from the JSON data:

In [None]:
# Sample 10 records and project the core bibliographic fields (id, title,
# author, year) out of each raw JSON object. The year goes through TRY_CAST,
# so a non-integer year string becomes NULL rather than failing the query.
basic_query = """
SELECT
    json_extract_string(json, '$._id') AS id,
    json_extract_string(json, '$._source.search_only_fields.search_title') AS title,
    json_extract_string(json, '$._source.search_only_fields.search_author') AS author,
    TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER) AS publish_year
FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
LIMIT 10;
"""

# Run the statement, then materialise the result as a pandas DataFrame.
basic_result = conn.execute(basic_query)
df_basic = basic_result.fetchdf()
display(df_basic)

## 2. Publication Year Analysis

Count publications per year for the 20 most recent publication years, then plot the counts:

In [None]:
# Tally records per publication year. Rows whose year does not cast to an
# integer are dropped (TRY_CAST -> NULL -> filtered by the WHERE clause), and
# only the 20 highest years are returned.
year_query = """
SELECT
    COUNT(*) AS count,
    TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER) AS publish_year
FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
WHERE publish_year IS NOT NULL
GROUP BY publish_year
ORDER BY publish_year DESC
LIMIT 20;
"""

df_years = conn.execute(year_query).fetchdf()

# Bar chart of the per-year counts, drawn through an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(df_years['publish_year'], df_years['count'])
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Publications')
plt.show()

## 3. Advanced Record Query

This query includes additional fields and calculated values:

In [None]:
# Sample 10 records with a wider projection: identifiers, bibliographic
# fields, cover URL, the raw JSON list of record sources, and a derived
# `is_archived` flag that is true when any of the three download/torrent
# indicators equals 'true'.
#
# NOTE(review): `isbn_pos` subtracts 978000000000 (12 digits) from the first
# search_isbn13 entry, which is presumably a full 13-digit ISBN. Confirm
# whether the check digit should be dropped first or the offset should be
# 9780000000000; as written the difference is not a 978-relative offset.
#
# NOTE(review): the `is_archived` comparisons assume the three flags are
# stored as JSON booleans. If they are 0/1 integers the flag will never be
# true. TODO confirm against the data.
advanced_query = """
SELECT
    json_extract_string(json, '$._id') AS id,
    TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_isbn13[0]') AS BIGINT) - 978000000000 AS isbn_pos,
    json_extract_string(json, '$._source.search_only_fields.search_title') AS title,
    json_extract_string(json, '$._source.search_only_fields.search_author') AS author,
    json_extract_string(json, '$._source.search_only_fields.search_publisher') AS publisher,
    TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER) AS publish_year,
    json_extract_string(json, '$._source.file_unified_data.cover_url_best') AS cover_url,
    json_extract(json, '$._source.search_only_fields.search_record_sources') AS sources,
    (
        json_extract(json, '$._source.file_unified_data.has_aa_downloads') = 'true'
        OR json_extract(json, '$._source.file_unified_data.has_aa_exclusive_downloads') = 'true'
        OR json_extract(json, '$._source.file_unified_data.has_torrent_paths') = 'true'
    ) AS is_archived
FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
LIMIT 10;
"""

# Run the statement, then materialise the result as a pandas DataFrame.
advanced_result = conn.execute(advanced_query)
df_advanced = advanced_result.fetchdf()
display(df_advanced)