In [1]:
# Import necessary libraries
%load_ext autoreload
%autoreload 2

import pandas as pd
import sqlite3
import xarray as xr
import numpy as np
from pyquadkey2 import quadkey




In [14]:
# SQLite connection
con = sqlite3.connect("../data/quadkeyDB.sqlite")



In [3]:
# Create table in SQLite if it does not exist
create_table_query = """
CREATE TABLE IF NOT EXISTS data_slice_male_long_lat (
    quadkey TEXT,
    raster_1 REAL,
    raster_2 REAL,
    raster_3 REAL,
    raster_4 REAL,
    raster_5 REAL,
    raster_6 REAL,
    raster_7 REAL,
    raster_8 REAL,
    raster_9 REAL,
    raster_10 REAL,
    raster_11 REAL,
    raster_12 REAL,
    raster_13 REAL,
    raster_14 REAL,
    raster_15 REAL,
    raster_16 REAL,
    raster_17 REAL,
    raster_18 REAL,
    raster_19 REAL,
    raster_20 REAL,
    raster_21 REAL,
    raster_22 REAL,
    raster_23 REAL,
    raster_24 REAL,
    raster_25 REAL,
    raster_26 REAL,
    raster_27 REAL,
    raster_28 REAL,
    raster_29 REAL,
    raster_30 REAL
);
"""
con.execute(create_table_query)
con.commit()



In [4]:
# Function to insert a DataFrame into the SQLite database
def insert_into_db(df):
    df.to_sql("data_slice_male_long_lat", con, if_exists="append", index=False)

# Function to load dataset

def load_dataset(file_path, lat_slice, lon_slice):
    dataset = xr.open_dataset(file_path)
    # Process dataset chunk
    data_slice = dataset.sel(latitude=lat_slice, longitude=lon_slice)
    data_slice = data_slice[
        "Basic Demographic Characteristics, v4.10 (2010): Male, Density, 2.5 arc-minutes"
    ]
    return data_slice.compute()

# Function to extract data points with quadkeys
def extract_data_points_vectorized_quadkey(ds):
    data_points = []
    longs = ds.longitude.values
    lats = ds.latitude.values

    zoom_level = 14  # Precision level

    for x in range(len(lats)):
        for y in range(len(longs)):
            coord = (lats[x], longs[y])
            tile = quadkey.from_geo(coord, zoom_level)
            quadkey_str = tile.key
            raster_values = ds.values[:, x, y]
            data_points.append([quadkey_str] + list(raster_values))

    columns = ['quadkey'] + [f'raster_{i + 1}' for i in range(ds.shape[0])]
    df = pd.DataFrame(data_points, columns=columns)
    return df

# Function to load dataset chunks into SQLite database
def load_male_dataset_into_db():
    file_path_male = "../data/gpw_v4_basic_demographic_characteristics_rev11_mt_2010_dens_2pt5_min.nc"

    for lat_start in range(85, -85, -10):
        lat_end = lat_start - 10
        lat_slice = slice(lat_start, lat_end)

        for lon_start in range(-180, 180, 10):
            lon_end = lon_start + 10
            lon_slice = slice(lon_start, lon_end)

            print(f"Processing lat: {lat_start} to {lat_end}, lon: {lon_start} to {lon_end}")

            data_slice_male = load_dataset(file_path_male, lat_slice, lon_slice)
            data_slice_male_quadkey = extract_data_points_vectorized_quadkey(data_slice_male)

            insert_into_db(data_slice_male_quadkey)

# Function to compute parent levels in quadkeys
def insert_parents_for_level(n, table_name):
    for group in range(0, 4):
        query = f"""
        INSERT INTO {table_name}
        SELECT
            SUBSTR(quadkey, 1, {n - 1}) AS parent_quadkey,
            AVG(raster_1) AS raster_1,
            AVG(raster_2) AS raster_2,
            AVG(raster_3) AS raster_3,
            AVG(raster_4) AS raster_4,
            AVG(raster_5) AS raster_5,
            AVG(raster_6) AS raster_6,
            AVG(raster_7) AS raster_7,
            AVG(raster_8) AS raster_8,
            AVG(raster_9) AS raster_9,
            AVG(raster_10) AS raster_10,
            AVG(raster_11) AS raster_11,
            AVG(raster_12) AS raster_12,
            AVG(raster_13) AS raster_13,
            AVG(raster_14) AS raster_14,
            AVG(raster_15) AS raster_15,
            AVG(raster_16) AS raster_16,
            AVG(raster_17) AS raster_17,
            AVG(raster_18) AS raster_18,
            AVG(raster_19) AS raster_19,
            AVG(raster_20) AS raster_20,
            AVG(raster_21) AS raster_21,
            AVG(raster_22) AS raster_22,
            AVG(raster_23) AS raster_23,
            AVG(raster_24) AS raster_24,
            AVG(raster_25) AS raster_25,
            AVG(raster_26) AS raster_26,
            AVG(raster_27) AS raster_27,
            AVG(raster_28) AS raster_28,
            AVG(raster_29) AS raster_29,
            AVG(raster_30) AS raster_30
        FROM {table_name}
        WHERE LENGTH(quadkey) = {n}
          AND SUBSTR(quadkey, 1, 1) = '{group}'
        GROUP BY parent_quadkey
        """
        con.execute(query)
        con.commit()

# Load all aggregation levels into database
def load_all_aggregation_data_into_db():
    n = 14
    for n in range(n, 0, -1):
        print(f"Processing aggregation for level {n}")
        insert_parents_for_level(n, "data_slice_male_long_lat")



In [5]:
# Load dataset and process
load_male_dataset_into_db()

Processing lat: 85 to 75, lon: -180 to -170
Processing lat: 85 to 75, lon: -170 to -160
Processing lat: 85 to 75, lon: -160 to -150
Processing lat: 85 to 75, lon: -150 to -140
Processing lat: 85 to 75, lon: -140 to -130
Processing lat: 85 to 75, lon: -130 to -120
Processing lat: 85 to 75, lon: -120 to -110
Processing lat: 85 to 75, lon: -110 to -100
Processing lat: 85 to 75, lon: -100 to -90
Processing lat: 85 to 75, lon: -90 to -80
Processing lat: 85 to 75, lon: -80 to -70
Processing lat: 85 to 75, lon: -70 to -60
Processing lat: 85 to 75, lon: -60 to -50
Processing lat: 85 to 75, lon: -50 to -40
Processing lat: 85 to 75, lon: -40 to -30
Processing lat: 85 to 75, lon: -30 to -20
Processing lat: 85 to 75, lon: -20 to -10
Processing lat: 85 to 75, lon: -10 to 0
Processing lat: 85 to 75, lon: 0 to 10
Processing lat: 85 to 75, lon: 10 to 20
Processing lat: 85 to 75, lon: 20 to 30
Processing lat: 85 to 75, lon: 30 to 40
Processing lat: 85 to 75, lon: 40 to 50
Processing lat: 85 to 75, lon:

In [6]:
load_all_aggregation_data_into_db()

Processing aggregation for level 14
Processing aggregation for level 13
Processing aggregation for level 12
Processing aggregation for level 11
Processing aggregation for level 10
Processing aggregation for level 9
Processing aggregation for level 8
Processing aggregation for level 7
Processing aggregation for level 6
Processing aggregation for level 5
Processing aggregation for level 4
Processing aggregation for level 3
Processing aggregation for level 2
Processing aggregation for level 1


In [7]:
# Create index for SQLite table
con.execute("CREATE UNIQUE INDEX IF NOT EXISTS quadkeyIndex ON data_slice_male_long_lat (quadkey);")
con.commit()



IntegrityError: UNIQUE constraint failed: data_slice_male_long_lat.quadkey

In [8]:
con.execute("CREATE INDEX IF NOT EXISTS quadkeyIndex ON data_slice_male_long_lat (quadkey);")
con.commit()

In [12]:
# Validate data
result = con.execute("SELECT * FROM data_slice_male_long_lat where quadkey = '' LIMIT 10;").fetchall()
print(result)



[]


In [10]:
# Validate data
result = con.execute("Delete FROM data_slice_male_long_lat WHERE quadkey = '';").fetchall()
print(result)

[]


In [23]:
# Close SQLite connection
con.close()

## For Showing the dataset

In [3]:
file_path_male = "../data/gpw_v4_basic_demographic_characteristics_rev11_mt_2010_dens_2pt5_min.nc"
dataset = xr.open_dataset(file_path_male)

dataset