In [2]:
# Import necessary libraries
%load_ext autoreload
%autoreload 2

import pandas as pd
import sqlite3
import sqlite3
import xarray as xr
import numpy as np
from pyquadkey2 import quadkey
from joblib import Parallel, delayed




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Open the notebook-wide SQLite connection (`con`) that the schema, index,
# PRAGMA and aggregation cells below execute their statements on.
# The path is relative, so it resolves against the kernel's working directory.
db_Path = "../../data/quadkeyDB.sqlite"
con = sqlite3.connect(db_Path)


In [6]:
# Create the target table if it does not exist yet (safe to re-run thanks to
# IF NOT EXISTS). Each row holds one quadkey string plus 30 REAL columns
# named raster_1 .. raster_30.
create_table_query = """
CREATE TABLE IF NOT EXISTS data_slice_male_long_lat (
    quadkey TEXT,
    raster_1 REAL,
    raster_2 REAL,
    raster_3 REAL,
    raster_4 REAL,
    raster_5 REAL,
    raster_6 REAL,
    raster_7 REAL,
    raster_8 REAL,
    raster_9 REAL,
    raster_10 REAL,
    raster_11 REAL,
    raster_12 REAL,
    raster_13 REAL,
    raster_14 REAL,
    raster_15 REAL,
    raster_16 REAL,
    raster_17 REAL,
    raster_18 REAL,
    raster_19 REAL,
    raster_20 REAL,
    raster_21 REAL,
    raster_22 REAL,
    raster_23 REAL,
    raster_24 REAL,
    raster_25 REAL,
    raster_26 REAL,
    raster_27 REAL,
    raster_28 REAL,
    raster_29 REAL,
    raster_30 REAL
);
"""
con.execute(create_table_query)
con.commit()



In [7]:
# Create a UNIQUE index on the quadkey column (no-op if it already exists).
# Besides speeding up lookups by quadkey, the uniqueness constraint is what
# lets INSERT OR IGNORE skip rows whose quadkey is already stored.
con.execute("CREATE UNIQUE INDEX IF NOT EXISTS quadkeyIndex ON data_slice_male_long_lat (quadkey);")
con.commit()



In [8]:
# SQLite performance tweaks for bulk loading.
# - synchronous = OFF: SQLite hands writes to the OS without waiting for them
#   to reach disk. Faster, but the database file can be corrupted if the
#   machine crashes or loses power mid-write.
# - cache_size = -500000: a negative value is interpreted by SQLite as a size
#   in KiB, i.e. roughly 500 MB of page cache.
# NOTE(review): both settings apply to this connection (`con`) only; any other
# connection opened to the same file starts with SQLite's defaults.
con.execute("PRAGMA synchronous = OFF;")
con.execute("PRAGMA cache_size = -500000;  -- 500 MB;")
con.commit()

In [9]:
def get_offset_quadkey(lat, lon, zoom_level, offset_minutes=2.5):
    """Return the quadkey of the point shifted by ``offset_minutes`` arc-minutes.

    The same offset is added to both the latitude and the longitude before the
    quadkey is computed.

    Args:
        lat: Latitude of the starting point, in degrees.
        lon: Longitude of the starting point, in degrees.
        zoom_level: Quadkey zoom level passed to ``quadkey.from_geo``.
        offset_minutes: Shift applied to both axes, in arc-minutes
            (60 arc-minutes = 1 degree).

    Returns:
        Whatever ``quadkey.from_geo`` returns for the shifted coordinate.
    """
    shift_deg = offset_minutes / 60.0
    shifted_point = (lat + shift_deg, lon + shift_deg)
    return quadkey.from_geo(shifted_point, zoom_level)


In [10]:
def extract_data_points_vectorized_quadkey(ds, zoom_level=13):
    """Expand a raster slice into one DataFrame row per covered quadkey.

    For every (latitude, longitude) grid point of ``ds`` the quadkey of the
    point and the quadkey of the point shifted by ``get_offset_quadkey`` are
    computed; every quadkey returned by ``q1.difference(q2)`` receives a copy
    of that grid point's raster values.

    Args:
        ds: Array-like with ``latitude`` / ``longitude`` coordinates whose
            ``.values`` is indexed as ``[raster, latitude, longitude]``
            (NOTE(review): assumed from the indexing used here - confirm the
            dimension order of the loaded variable).
        zoom_level: Quadkey precision level. Defaults to 13.

    Returns:
        pandas.DataFrame with a ``quadkey`` column followed by
        ``raster_1 .. raster_N`` (N = ``ds.shape[0]``), de-duplicated on
        ``quadkey`` (first occurrence wins).
    """
    lats = ds.latitude.values
    longs = ds.longitude.values
    n_rasters = ds.shape[0]

    # Build every (lat, lon) combination; "ij" indexing + ravel() yields the
    # points in row-major order: flat index = lat_idx * len(longs) + lon_idx.
    lat_grid, lon_grid = np.meshgrid(lats, longs, indexing="ij")
    coords = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))

    # Flatten the two spatial axes in the same row-major order so that row k
    # holds the raster values of coords[k]. This replaces a per-point
    # nearest-index search that always resolved to the point's own grid index.
    raster_rows = ds.values.reshape(n_rasters, lats.size * longs.size).T

    data_points = []
    for coord, raster_row in zip(coords, raster_rows):
        try:
            q1 = quadkey.from_geo(coord, zoom_level)
            q2 = get_offset_quadkey(coord[0], coord[1], zoom_level)
            cell_quadkeys = q1.difference(q2)
        except AssertionError:
            # Skip the invalid point together with its raster row. Handling
            # both in one loop keeps quadkeys and values aligned; skipping only
            # the quadkey would shift every later pairing by one.
            print(f"Invalid coordinate causing AssertionError: {coord}")
            continue

        row_values = list(raster_row)
        for qkey in cell_quadkeys:
            data_points.append([qkey.key] + row_values)

    # Convert to DataFrame
    columns = ['quadkey'] + [f'raster_{i + 1}' for i in range(n_rasters)]
    df = pd.DataFrame(data_points, columns=columns)

    # Replace NaN with None to ensure database compatibility.
    # NOTE(review): depending on the pandas version, float columns may keep
    # NaN here - verify that the database layer stores those as NULL.
    df = df.where(pd.notnull(df), None)

    # Remove duplicates based on quadkeys
    df = df.drop_duplicates(subset='quadkey')
    return df

In [17]:


# Name of the SQLite table that receives the extracted rows.
data_table = "data_slice_male_long_lat"
# Path (relative to the kernel's working directory) of the NetCDF input file
# that load_dataset opens.
file_path = "../../data/gpw_v4_basic_demographic_characteristics_rev11_mt_2010_dens_2pt5_min.nc"


def load_dataset(file_path, lat_slice, lon_slice):
    """Load one latitude/longitude window of the male-density variable.

    Args:
        file_path: Path of the NetCDF file to open.
        lat_slice: ``slice`` of latitude labels to select.
        lon_slice: ``slice`` of longitude labels to select.

    Returns:
        The selected window of the male-density variable, as returned by
        ``.compute()`` on the selection.
    """
    # Open the file in a context manager so the handle is released once the
    # window has been read; this function is called once per task, and an
    # unclosed dataset per call would leak file handles.
    with xr.open_dataset(file_path) as dataset:
        # Process dataset chunk
        data_slice = dataset.sel(latitude=lat_slice, longitude=lon_slice)
        data_slice = data_slice[
            "Basic Demographic Characteristics, v4.10 (2010): Male, Density, 2.5 arc-minutes"
        ]
        # compute() must run while the file is still open.
        return data_slice.compute()


def insert_into_db(data_slice):
    """Bulk-insert the rows of a DataFrame into the ``data_table`` table.

    A dedicated connection is opened (and always closed) for each call. Rows
    that conflict with an existing row are skipped by ``INSERT OR IGNORE``.

    Args:
        data_slice: pandas.DataFrame whose columns are in the same order as
            the columns of the target table.

    Returns:
        None. ``sqlite3.OperationalError`` is reported on stdout and swallowed,
        so a failed batch does not abort the caller.
    """
    # Wait up to 300 seconds for a lock held by another writer
    # (the sqlite3 default is 5 seconds).
    # Uses the notebook-level `db_Path`; the previous lowercase `db_path`
    # is not defined anywhere in the notebook and raised NameError.
    conn = sqlite3.connect(db_Path, timeout=300)

    try:
        # Batch insert data
        placeholders = ", ".join(["?"] * len(data_slice.columns))
        insert_query = f"INSERT OR IGNORE INTO {data_table} VALUES ({placeholders})"
        conn.executemany(insert_query, data_slice.values.tolist())

        # Commit the transaction
        conn.commit()
    except sqlite3.OperationalError as e:
        print(f"Error while inserting data: {e}")
    finally:
        # Ensure the connection is closed (uncommitted work is rolled back)
        conn.close()



def process_slice(lat_start, lat_end, lon_start, lon_end):
    """Run the load -> extract -> insert pipeline for one lat/lon window.

    Args:
        lat_start: First latitude label of the window.
        lat_end: Last latitude label of the window.
        lon_start: First longitude label of the window.
        lon_end: Last longitude label of the window.
    """
    # Read the window from the file at the module-level `file_path`.
    raster_chunk = load_dataset(
        file_path,
        slice(lat_start, lat_end),
        slice(lon_start, lon_end),
    )

    # Turn the window into quadkey rows and store them.
    insert_into_db(extract_data_points_vectorized_quadkey(raster_chunk))

def load_male_dataset_into_db(step_deg=1, n_jobs=-1):
    """Split the map into lat/lon windows and process them in parallel.

    The map is tiled into ``step_deg`` x ``step_deg`` degree windows, from
    latitude 85 down to -85.05112878 and longitude -180 up to 180. Each window
    is handed to ``process_slice`` through joblib.

    Args:
        step_deg: Window size and stride in whole degrees (must be a positive
            int because it is used as a ``range`` step). Defaults to 1.
        n_jobs: Passed to ``joblib.Parallel``; -1 uses all cores.
    """
    # Lower latitude bound used to clamp the last row of windows.
    min_lat = -85.05112878
    max_lon = 180

    # The window size equals the stride so windows tile the map without
    # overlapping. With a 10-degree window on a 1-degree stride, every grid
    # cell would be loaded and processed about 100 times.
    tasks = [
        (
            lat_start,
            max(lat_start - step_deg, min_lat),
            lon_start,
            min(lon_start + step_deg, max_lon),
        )
        # full map range
        for lat_start in range(85, -85, -step_deg)
        for lon_start in range(-180, 180, step_deg)
    ]

    # Process all tasks in parallel
    Parallel(n_jobs=n_jobs)(delayed(process_slice)(*task) for task in tasks)


In [18]:
# Function to compute parent levels in quadkeys
def insert_parents_for_level(n, table_name):
    """Insert the parent quadkeys of all level-``n`` rows into ``table_name``.

    Rows whose quadkey has length ``n`` are grouped by their first ``n - 1``
    characters (the parent quadkey); each of the 30 raster columns of the
    parent is the AVG of that column over its children.

    Args:
        n: Quadkey length of the child rows to aggregate (must be >= 1).
        table_name: Table to read the children from and write the parents to.
            It is interpolated into the SQL text, so it must be a trusted
            identifier, never user input.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    # Generate "AVG(raster_i) AS raster_i" for raster_1 .. raster_30 instead
    # of maintaining 30 hand-written lines.
    avg_columns = ",\n            ".join(
        f"AVG(raster_{i}) AS raster_{i}" for i in range(1, 31)
    )

    # OR REPLACE: the table has a UNIQUE index on quadkey, so a plain INSERT
    # raises IntegrityError as soon as this level is aggregated a second time.
    # Replacing makes a re-run refresh the parent rows instead of failing.
    query = f"""
        INSERT OR REPLACE INTO {table_name}
        SELECT
            SUBSTR(quadkey, 1, {n - 1}) AS parent_quadkey,
            {avg_columns}
        FROM {table_name}
        WHERE LENGTH(quadkey) = {n}
        GROUP BY parent_quadkey
        """

    # Connection as context manager: commit on success, roll back on error.
    with con:
        con.execute(query)

# Load all aggregation levels into database
def load_all_aggregation_data_into_db(n):
    """Aggregate every quadkey level from ``n`` down to 1 into its parent level.

    Levels are processed from the deepest to the shallowest so that each
    level's rows exist before they are aggregated in turn.

    Args:
        n: Deepest quadkey level (quadkey length) to start aggregating from.
    """
    for level in reversed(range(1, n + 1)):
        print(f"Processing aggregation for level {level}")
        insert_parents_for_level(level, "data_slice_male_long_lat")


In [None]:
# Ingest the raster file into the DB at the base quadkey zoom level (13).
# This is the long-running step: it fans out over all lat/lon windows in parallel.
load_male_dataset_into_db()
# Aggregation to coarser levels is run separately in the next cell
# (load_all_aggregation_data_into_db requires the starting level as argument).

In [None]:
# Build the coarser quadkey levels: starting from the level-13 rows, each
# level n = 13 .. 1 is averaged into its parent quadkeys of length n - 1.
load_all_aggregation_data_into_db(13)

In [20]:
# Release the notebook-wide SQLite connection; cells that use `con` cannot be
# re-run after this without re-running the connection cell first.
con.close()