In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import duckdb
import pandas as pd
import xarray as xr
import numpy as np
from pyquadkey2 import quadkey

# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# configurations on jupysql to directly output data to Pandas and to simplify the output
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [None]:
# Open the on-disk DuckDB database file; later cells talk to it through `con`.
# (The module-level `duckdb.sql` helper would use the default in-memory connection instead.)
con = duckdb.connect("../data/qudkeyDB.duckdb")
# Install and load DuckDB's "spatial" extension on this connection.
con.install_extension("spatial")
con.load_extension("spatial")
# Create the target table as a copy of data_slice_male_quadkey_2 unless it already exists.
# NOTE(review): presumably data_slice_male_quadkey_2 must already be present in the database file — confirm.
con.sql("CREATE TABLE if not exists data_slice_male_long_lat AS SELECT * FROM data_slice_male_quadkey_2")
# Last expression of the cell: display the column layout of the target table.
con.sql("DESCRIBE data_slice_male_long_lat")

In [25]:
def insertIntoDB(df):
    """Append every row of ``df`` to the ``data_slice_male_long_lat`` table.

    The statement refers to the local variable ``df`` by name inside the SQL
    text, so the parameter must keep that name. Columns are matched by
    position (``SELECT *``), which means ``df`` has to have the same column
    order as the target table.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to append.

    Returns
    -------
    None
    """
    # An empty chunk has nothing to append; skip the database round trip.
    if len(df) == 0:
        return

    # Plain string on purpose: nothing is interpolated on the Python side.
    con.sql("""
        INSERT INTO data_slice_male_long_lat
        SELECT * FROM df;
    """)


In [31]:
# Remove all rows from the target table while keeping the table itself.
# NOTE(review): presumably run before a fresh import so tiles are not inserted twice — confirm.
con.sql("""
        truncate table data_slice_male_long_lat;
    """)

In [3]:
def load_dataset(
    file_path,
    lat_slice,
    lon_slice,
    variable="Basic Demographic Characteristics, v4.10 (2010): Male, Density, 2.5 arc-minutes",
):
    """Load one latitude/longitude window of a single variable into memory.

    Parameters
    ----------
    file_path : str
        Path of the dataset file passed to ``xr.open_dataset``.
    lat_slice : slice
        Label-based selection along the ``latitude`` coordinate.
    lon_slice : slice
        Label-based selection along the ``longitude`` coordinate.
    variable : str, optional
        Name of the data variable to extract. Defaults to the male density
        variable this notebook works with.

    Returns
    -------
    xarray.DataArray
        The selected window, fully computed (no longer lazy).
    """
    # The file is opened lazily in 1000 x 1000 chunks so only the requested
    # window is read. The context manager closes the file handle again; this
    # function is called once per tile, so handles would otherwise pile up.
    with xr.open_dataset(file_path, chunks={'latitude': 1000, 'longitude': 1000}) as dataset:
        data_slice = dataset[variable].sel(latitude=lat_slice, longitude=lon_slice)
        # Materialise while the file is still open.
        return data_slice.compute()

In [26]:
import pandas as pd
import numpy as np

def extract_data_points_vectorized_quadkey(ds, zoom_level=14):
    """Flatten a raster window into one row per grid cell, keyed by quadkey.

    Parameters
    ----------
    ds : xarray.DataArray
        Window with ``latitude`` and ``longitude`` coordinates. Its values are
        indexed as ``[band, latitude index, longitude index]``.
    zoom_level : int, optional
        Zoom level used when converting a (latitude, longitude) pair to a
        quadkey. Defaults to 14.

    Returns
    -------
    pandas.DataFrame
        Column ``quadkey`` followed by ``raster_1`` .. ``raster_N`` (one per
        band). Rows are ordered latitude-major, then longitude. The raster
        columns keep the dtype of the source array.
    """
    longs = ds.longitude.values
    lats = ds.latitude.values

    # Fetch the underlying array once instead of once per grid cell.
    raster = ds.values
    n_bands = ds.shape[0]

    # One quadkey string per grid cell, latitude-major.
    quadkeys = [
        quadkey.from_geo((lat, lon), zoom_level).key
        for lat in lats
        for lon in longs
    ]

    # (band, lat, lon) -> (lat * lon, band). Flattening the two trailing axes
    # in C order is latitude-major, so rows line up with `quadkeys`.
    per_cell = raster.reshape(n_bands, len(lats) * len(longs)).T

    raster_columns = [f'raster_{i + 1}' for i in range(n_bands)]
    df = pd.DataFrame(per_cell, columns=raster_columns)
    df.insert(0, 'quadkey', quadkeys)

    return df


In [58]:
def load_male_dataset(max_chunks=1):
    """Load the male-density raster tile by tile and return it as a DataFrame.

    The globe is walked in 10 x 10 degree tiles: latitude bands start at
    85, 75, ..., -75 (each extending 10 degrees south, so 85 down to -85 is
    covered) and longitude columns start at -180, -170, ..., 170.

    Parameters
    ----------
    max_chunks : int or None, optional
        Number of tiles to load, in iteration order. The default of 1 returns
        only the first tile (lat 85..75, long -180..-170), which is what this
        function has always returned. Pass ``None`` to load every tile; note
        that the combined frame may not fit in memory.

    Returns
    -------
    pandas.DataFrame
        The quadkey-keyed rows of the loaded tiles; an empty frame when no
        tile was requested.
    """
    file_path_male = "../data/gpw_v4_basic_demographic_characteristics_rev11_mt_2010_dens_2pt5_min.nc"

    tiles = [
        (lat_start, lon_start)
        for lat_start in range(85, -85, -10)
        for lon_start in range(-180, 180, 10)
    ]
    if max_chunks is not None:
        tiles = tiles[:max(max_chunks, 0)]

    # Collect the per-tile frames and concatenate once at the end; growing a
    # DataFrame inside the loop would copy all previous rows every iteration.
    frames = []
    for lat_start, lon_start in tiles:
        lat_end = lat_start - 10
        lon_end = lon_start + 10
        print("lat: " + str(lat_start) + " long: " + str(lon_start))
        print("lat: " + str(lat_end) + " long: " + str(lon_end))

        # Load and process the chunk
        data_slice_male = load_dataset(
            file_path_male,
            slice(lat_start, lat_end),
            slice(lon_start, lon_end),
        )
        frames.append(extract_data_points_vectorized_quadkey(data_slice_male))

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        # Single tile: hand back the frame itself, no copy needed.
        return frames[0]
    return pd.concat(frames, ignore_index=True)

In [50]:
def load_male_datasetIntoDB():
    """Load the male-density raster tile by tile and append each tile to the database.

    Tiles are 10 x 10 degrees. Latitude bands start at 85, 75, ..., -75 and
    extend 10 degrees south; longitude columns start at -180, -170, ..., 170
    and extend 10 degrees east. Each tile is loaded, converted to
    quadkey-keyed rows and passed to ``insertIntoDB`` before the next tile is
    read, so only one tile is held at a time. Returns nothing.
    """
    source_nc = "../data/gpw_v4_basic_demographic_characteristics_rev11_mt_2010_dens_2pt5_min.nc"

    for north in range(85, -85, -10):
        south = north - 10

        for west in range(-180, 180, 10):
            east = west + 10

            # Progress output: first the tile's start corner, then its end corner.
            print("lat: " + str(north) + " long: " + str(west))
            print("lat: " + str(south) + " long: " + str(east))

            tile = load_dataset(source_nc, slice(north, south), slice(west, east))
            tile_rows = extract_data_points_vectorized_quadkey(tile)
            insertIntoDB(tile_rows)

In [59]:
# Call load_male_dataset and keep the DataFrame it returns for inspection.
var = load_male_dataset()

# Cell output: (rows, columns) of the returned frame.
var.shape

lat: 85 long: -180
lat: 75 long: -170


(57600, 31)

In [35]:
# Query every row and column of the target table; the result is this cell's output.
con.sql("""
        SELECT * FROM data_slice_male_long_lat;
    """)

┌────────────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│    quadkey     │ raster_1 │ raster_2 │ raster_3 │ raster_4 │ raster_5 │ raster_6 │ raster_7 │ raster_8 │ raster_9 │ raster_10 │ raster_11 │ raster_12 │ raster_13 │ raster_14 │ raster_15 │ raster_16 │ raster_17 │ raster_18 │ raster_19 │ raster_20 │ raster_21 │ raster_22 │ raster_23 │ raster_24 │ raster_25 │ raster_26 │ raster_27 │ raster_28 │ raster_29 │ raster_30 │
│    varchar     │  float   │  float   │  float   │  float   │  float   │  float   │  float   │  float   │  float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   │   float   

In [72]:
# Build two quadkey objects from their string form to use as lookup keys.
q = quadkey.from_str("00000000200202")
q2 = quadkey.from_str("00002300300021")
l = [q, q2]

# Render the keys as a SQL VALUES list of the form ('key1'), ('key2').
# NOTE(review): the loop variable is named `quadkey`, like the imported module;
# the shadowing is confined to the generator expression.
values_clause = ', '.join(f"('{quadkey}')" for quadkey in l)
# The VALUES list becomes a one-column CTE that is joined against the table.
# NOTE(review): the keys are interpolated into the SQL text, not bound as parameters.
query = f"""
WITH quadkey_temp(quadkey) AS (
    VALUES {values_clause}
)
SELECT t.*
FROM data_slice_male_long_lat t
JOIN quadkey_temp q
ON t.quadkey = q.quadkey
"""
# Execute the query and fetch all matching rows as a list of tuples.
con.execute(query).fetchall()

[('00000000200202',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  3.0,
  None,
  None,
  32767.0,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None),
 ('00002300300021',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  3.0,
  None,
  None,
  32767.0,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None)]

In [73]:
def getData(quadkeys):
    """Fetch the rows of ``data_slice_male_long_lat`` matching the given quadkeys.

    Parameters
    ----------
    quadkeys : iterable
        Quadkey objects or strings; each item is converted with ``str()``
        before it is compared against the ``quadkey`` column.

    Returns
    -------
    list of tuple
        One tuple per matching table row (all columns). Empty when
        ``quadkeys`` is empty. A key passed more than once yields its rows
        more than once, because the lookup is a join.
    """
    keys = [str(key) for key in quadkeys]
    # An empty VALUES list is not valid SQL, and nothing could match anyway.
    if not keys:
        return []

    # One bound placeholder per key: the keys travel as parameters instead of
    # being spliced into the SQL text. The cast fixes the column type.
    placeholders = ', '.join(['(CAST(? AS VARCHAR))'] * len(keys))
    query = f"""
    WITH quadkey_temp(quadkey) AS (
        VALUES {placeholders}
    )
    SELECT t.*
    FROM data_slice_male_long_lat t
    JOIN quadkey_temp q
    ON t.quadkey = q.quadkey
    """
    return con.execute(query, keys).fetchall()
    

In [74]:
# Same two lookup keys as in the exploratory query cell, now fetched through getData.
q = quadkey.from_str("00000000200202")
q2 = quadkey.from_str("00002300300021")
l = [q, q2]
# Cell output: the matching table rows as a list of tuples.
getData(l)

[('00000000200202',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  3.0,
  None,
  None,
  32767.0,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None),
 ('00002300300021',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  3.0,
  None,
  None,
  32767.0,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None)]

In [82]:
# Example map tile request: GET https://c.tile.openstreetmap.org/12/678/1594.png
# The path components 12/678/1594 are z/x/y.
# NOTE(review): the example URL uses zoom 12, but z is set to 11 below — confirm which is intended.
z = 11
x = 678
y = 1594
n = 3 # number of quadkey levels to descend per tile; 3 levels give 4^3 = 64 data points

qkey = quadkey.from_tile((x,y), z) # quadkey of this tile
listofQKeys = qkey.children(z+n) # children at level z+n, i.e. n levels deeper -> 64 quadkeys for n = 3

res = getData(listofQKeys) # fetch the table rows for all of these quadkeys from the database



In [None]:
def loadTileMale(z, y, x, n=3):
    """Fetch the stored male-density rows that fall inside one map tile.

    Parameters
    ----------
    z : int
        Zoom level of the tile.
    y : int
        Tile row. Note the argument order: ``y`` comes before ``x``.
    x : int
        Tile column.
    n : int, optional
        Number of quadkey levels to descend below the tile. The default of 3
        gives 4^3 = 64 child quadkeys per tile.

    Returns
    -------
    list of tuple
        Rows returned by ``getData`` for the child quadkeys.

    Notes
    -----
    Rows are matched on the exact quadkey string, so ``z + n`` presumably has
    to equal the zoom level the table was filled with (14 in this notebook)
    — TODO confirm.
    """
    qkey = quadkey.from_tile((x, y), z)  # quadkey of this tile
    # Passed explicitly as `n` instead of being read from a notebook-level global.
    child_keys = qkey.children(z + n)  # children at level z + n
    return getData(child_keys)

10