# Advanced DuckDB Analysis

This notebook demonstrates more complex analyses with DuckDB and pandas, and visualizes the results with Matplotlib.

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt

# Open a writable DuckDB connection against the special ':memory:' database;
# every query cell below runs through this single connection.
conn = duckdb.connect(
    database=':memory:',
    read_only=False,
)

## 1. ISBN Analysis

Analyze how ISBN-13 values are distributed across fixed-width numeric buckets and plot the 20 most populated buckets:

In [None]:
# Bucket ISBN-13 values and keep the 20 most populated buckets.
#
# The first search_isbn13 entry is cast to BIGINT as the full 13-digit number
# (check digit included), so the offset subtracted from it must be the
# 13-digit base 9780000000000. The earlier 12-digit offset (978000000000)
# grouped rows the same way but shifted every bucket label by 563,328, so the
# x-axis never started at 0.
#
# Each bucket spans 15,625,000 consecutive 13-digit values, which yields
# buckets 0..1279 across the 978 and 979 prefixes.
#
# The range filter drops rows whose value is not a number (TRY_CAST -> NULL)
# or does not start with 978/979; without it those rows form a NULL or
# out-of-range bucket that can end up in the top 20 and reach the plot.
isbn_query = """
WITH isbns AS (
    SELECT
        TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_isbn13[0]') AS BIGINT) AS isbn13
    FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
)
SELECT
    (isbn13 - 9780000000000) // 15625000 AS isbn_bucket,
    COUNT(*) AS count
FROM isbns
WHERE isbn13 BETWEEN 9780000000000 AND 9799999999999
GROUP BY isbn_bucket
ORDER BY count DESC
LIMIT 20;
"""

df_isbn = conn.execute(isbn_query).fetchdf()

# Visualize ISBN distribution (bars are positioned by bucket number).
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(df_isbn['isbn_bucket'], df_isbn['count'])
ax.set_title('ISBN Distribution by Bucket')
ax.set_xlabel('ISBN Bucket')
ax.set_ylabel('Count')
plt.show()

## 2. Publisher Analysis

Analyze top publishers and their publication patterns:

In [None]:
# Per-publisher summary: number of records, number of distinct author strings,
# and the mean of the years that parse as integers (TRY_CAST yields NULL for
# the rest, and AVG ignores NULLs). Only publishers with more than 10 records
# are kept, and the 15 largest are returned.
publisher_query = """
SELECT
    json_extract_string(json, '$._source.search_only_fields.search_publisher') AS publisher,
    COUNT(*) as total_books,
    COUNT(DISTINCT json_extract_string(json, '$._source.search_only_fields.search_author')) as unique_authors,
    AVG(TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER)) as avg_year
FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
WHERE publisher IS NOT NULL
GROUP BY publisher
HAVING total_books > 10
ORDER BY total_books DESC
LIMIT 15;
"""

# Run the query and materialize the result as a pandas DataFrame.
df_publishers = (
    conn
    .execute(publisher_query)
    .fetchdf()
)

# At most 15 rows, so rendering the whole frame is fine.
display(df_publishers)

## 3. Archive Status Analysis

Analyze the archival status of books:

In [None]:
# Per-year share of records flagged as archived, for years 1990 through 2023.
#
# A record counts as archived when any of has_aa_downloads,
# has_aa_exclusive_downloads or has_torrent_paths compares equal to 'true'.
# When the combined flag evaluates to NULL the CASE expressions fall through
# to ELSE 0, so such rows are counted as not archived.
# NOTE(review): this assumes the flags are stored as JSON booleans; if the
# data stores them as 0/1 the comparison never matches -- confirm against a
# sample record.
archive_query = """
WITH status AS (
    SELECT
        TRY_CAST(json_extract_string(json, '$._source.search_only_fields.search_year') AS INTEGER) AS year,
        (
            json_extract(json, '$._source.file_unified_data.has_aa_downloads') = 'true'
            OR json_extract(json, '$._source.file_unified_data.has_aa_exclusive_downloads') = 'true'
            OR json_extract(json, '$._source.file_unified_data.has_torrent_paths') = 'true'
        ) AS is_archived
    FROM read_ndjson_objects('/data/elasticsearch/*.json.gz')
    WHERE year IS NOT NULL
)
SELECT
    year,
    COUNT(*) as total,
    SUM(CASE WHEN is_archived THEN 1 ELSE 0 END) as archived,
    ROUND(100.0 * SUM(CASE WHEN is_archived THEN 1 ELSE 0 END) / COUNT(*), 2) as archived_percentage
FROM status
WHERE year BETWEEN 1990 AND 2023
GROUP BY year
ORDER BY year;
"""

df_archive = (
    conn
    .execute(archive_query)
    .fetchdf()
)

# Line chart of the archived percentage over time, one marker per year.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df_archive['year'], df_archive['archived_percentage'], marker='o')
ax.set_title('Percentage of Archived Books by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Archived Percentage')
ax.grid(True)
plt.show()