### Import Required Libraries

In [199]:
import getpass
from pathlib import Path
import os, json
import itertools as it
import requests
from datetime import datetime
from tqdm import tqdm
from glob import glob

import shapely
from geopandas import GeoDataFrame
import rioxarray
import stackstac as ss
import numpy as np
import dask
from dask.distributed import Client as ds_client
import warnings

import pystac as ps
import pystac.utils as ps_utils
from pystac_client import Client as ps_client
from pystac import Item, Collection, Catalog
from pystac.extensions.label import LabelExtension, LabelRelType
from pystac.item_collection import ItemCollection

SAMPLE_SIZE = 5

### Input MLHub API Key

In [3]:
MLHUB_API_KEY = getpass.getpass(prompt="MLHub API Key: ")
MLHUB_ROOT_URL = "https://api.radiant.earth/mlhub/v1"

MLHub API Key:  ································································


In [4]:
MLHUB_API_ROOT = "http://api.radiant.earth/mlhub/v1/collections"

In [71]:
# MLHUB_API_KEY

### Authenticate Token with MLHub Session

In [218]:
class MLHubSession(requests.Session):
    def __init__(self, *args, api_key=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.params.update({"key": api_key})

    def request(self, method, url, *args, **kwargs):
        url_prefix = MLHUB_ROOT_URL.rstrip("/") + "/"
        url = urljoin(url_prefix, url)
        return super().request(method, url, *args, **kwargs)

In [219]:
session = MLHubSession(api_key=MLHUB_API_KEY)

### Create MLHub API Client

In [98]:
ps_api_client = ps_client.open(
    MLHUB_ROOT_URL, 
    parameters={"key": MLHUB_API_KEY}, 
    ignore_conformance=True
)

In [99]:
collections = ps_api_client.get_collections()

### Fetch source and label collections

In [100]:
lcn_source_name, lcn_source = "ref_landcovernet_v1_source", None
lcn_labels_name, lcn_labels = "ref_landcovernet_v1_labels", None

In [101]:
for coll in collections:
    if coll.id == lcn_labels_name:
        lcn_labels = coll
    elif coll.id == lcn_source_name:
        lcn_source = coll

In [78]:
# lcn_labels.describe()

In [79]:
import numpy as np
np_rng = np.random.default_rng()

In [80]:
# lcn_label_items = lcn_labels.get_all_items_as_dict()

In [138]:
%%time
lcn_label_items = lcn_labels.get_items()
lcn_source_items = lcn_source.get_items()

CPU times: user 14 µs, sys: 33 µs, total: 47 µs
Wall time: 54.1 µs


In [164]:
lcn_labels.extent.spatial.bboxes[0]

[-15.937860500150009,
 -31.68783755999348,
 46.87392100826879,
 31.339825523515234]

In [127]:
lcn_temporal_extent = lcn_labels.extent.temporal.intervals[0]
lcn_temporal_extent

[datetime.datetime(2018, 1, 1, 0, 0, tzinfo=tzutc()),
 datetime.datetime(2018, 12, 31, 0, 0, tzinfo=tzutc())]

In [129]:
search_start = lcn_temporal_extent[0].strftime('%Y-%m-%d')
search_end = lcn_temporal_extent[1].strftime('%Y-%m-%d')

In [118]:
next(lcn_label_items).to_dict()

{'type': 'Feature',
 'stac_version': '1.0.0',
 'id': 'ref_landcovernet_v1_labels_38PKT_26',
 'properties': {'labels': ['Artificial Bareground',
   'Natural Bareground',
   'Woody Vegetation',
   '(Semi) Natural Vegetation'],
  'datetime': '2018-07-01T00:00:00Z',
  'label:type': 'raster',
  'label:classes': [{'name': 'labels',
    'classes': ['No Data',
     'Water',
     'Artificial Bareground',
     'Natural Bareground',
     'Permanent Snow/Ice',
     'Woody Vegetation',
     'Cultivated Vegetation',
     '(Semi) Natural Vegetation']}],
  'label:properties': ['labels'],
  'label:description': 'Land Cover Type Classification'},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[42.725265852059046, 10.809570377835808],
    [42.7250913255346, 10.832704943166526],
    [42.748493655792764, 10.832876782947013],
    [42.74866639065317, 10.809741842108567],
    [42.725265852059046, 10.809570377835808]]]},
 'links': [{'rel': 'collection',
   'href': 'http://api.radiant.earth/mlhub/v1/collec

In [82]:
i=0
while i < 5:
    first_label = next(lcn_label_items)
    first_source = next(lcn_source_items)
    print(first_label.id, first_source.id)
    i+=1

ref_landcovernet_v1_labels_38PKT_29 ref_landcovernet_v1_source_38NPP_29_20181231
ref_landcovernet_v1_labels_38PKT_28 ref_landcovernet_v1_source_38NPP_28_20181231
ref_landcovernet_v1_labels_38PKT_27 ref_landcovernet_v1_source_38NPP_27_20181231
ref_landcovernet_v1_labels_38PKT_26 ref_landcovernet_v1_source_38NPP_26_20181231
ref_landcovernet_v1_labels_38PKT_25 ref_landcovernet_v1_source_38NPP_25_20181231


#### It is estimated this will take over 30 minutes for all source items

In [83]:
# %%time
# lcn_source_item_list = list(lcn_source_items)

#### Conversion from labels generator to list takes 24 seconds

In [84]:
%%time
lcn_label_item_list = list(lcn_label_items)

CPU times: user 1.33 s, sys: 94.7 ms, total: 1.42 s
Wall time: 20.9 s


In [85]:
%%time
sample_label_items = np_rng.choice(a=lcn_label_item_list, size=SAMPLE_SIZE)

CPU times: user 5 ms, sys: 3.76 ms, total: 8.76 ms
Wall time: 13 ms


In [86]:
# sample_label_items[0].get_links(rel=LabelRelType.SOURCE)[0:10]

In [87]:
sample_label_items

array([<Item id=ref_landcovernet_v1_labels_29PNM_21>,
       <Item id=ref_landcovernet_v1_labels_33PWQ_14>,
       <Item id=ref_landcovernet_v1_labels_37PCM_04>,
       <Item id=ref_landcovernet_v1_labels_32PQQ_09>,
       <Item id=ref_landcovernet_v1_labels_34MBC_20>], dtype=object)

In [42]:
for label_item in sample_label_items[:1]:
    source_item_links = label_item.get_links(rel=LabelRelType.SOURCE)
    print(source_item_links)

[<Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180105>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180110>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180115>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180120>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180125>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180130>, <Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/it

#### .get_item() on n items will be O(n^2) as it walks the entire collection for each item ID

In [46]:
%%time
lcn_source.get_item(os.path.split(source_item_links[0].target)[-1], recursive=False)

KeyboardInterrupt: 

In [107]:
source_item_links[0]

<Link rel=source target=http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_35KNT_04_20180105>

In [106]:
source_item_links[0].resolve_stac_object(stac_io=ps_client._stac_io).target

TypeError: resolve_stac_object() got an unexpected keyword argument 'stac_io'

In [90]:
# lcn_source.get_item('ref_landcovernet_v1_source_35KNT_04_20180105')

In [96]:
%%time
sample_source_items = []
for label in sample_label_items:
    source_item_id = label.id.replace('_labels_', '_source_')
    if source_item_id:
        sample_source_items.append(lcn_source_items.get_item(source_item_id + '_20180902'))
    else:
        print(f'Could not find source items for {label.id}')

ref_landcovernet_v1_labels_34NCG_16 ref_landcovernet_v1_source_34NCG_16
ref_landcovernet_v1_labels_35NLA_16 ref_landcovernet_v1_source_35NLA_16
ref_landcovernet_v1_labels_31PGR_20 ref_landcovernet_v1_source_31PGR_20
ref_landcovernet_v1_labels_32PLS_28 ref_landcovernet_v1_source_32PLS_28
ref_landcovernet_v1_labels_29PNM_27 ref_landcovernet_v1_source_29PNM_27
ref_landcovernet_v1_labels_33MZM_14 ref_landcovernet_v1_source_33MZM_14
ref_landcovernet_v1_labels_37NFD_26 ref_landcovernet_v1_source_37NFD_26
ref_landcovernet_v1_labels_33PUQ_16 ref_landcovernet_v1_source_33PUQ_16
ref_landcovernet_v1_labels_35JKL_24 ref_landcovernet_v1_source_35JKL_24
ref_landcovernet_v1_labels_38PKQ_26 ref_landcovernet_v1_source_38PKQ_26
CPU times: user 1.17 ms, sys: 482 µs, total: 1.65 ms
Wall time: 3.86 ms


In [103]:
sample_source_ids = [label.id.replace('_labels_', '_source_') for label in sample_label_items]
sample_source_ids

['ref_landcovernet_v1_source_34NCG_16',
 'ref_landcovernet_v1_source_35NLA_16',
 'ref_landcovernet_v1_source_31PGR_20',
 'ref_landcovernet_v1_source_32PLS_28',
 'ref_landcovernet_v1_source_29PNM_27',
 'ref_landcovernet_v1_source_33MZM_14',
 'ref_landcovernet_v1_source_37NFD_26',
 'ref_landcovernet_v1_source_33PUQ_16',
 'ref_landcovernet_v1_source_35JKL_24',
 'ref_landcovernet_v1_source_38PKQ_26']

In [105]:
sample_source_items = []

In [108]:
lcn_source_items

<generator object CollectionClient.get_items at 0x175a8e270>

In [126]:
blob_url = 'https://radiantmlhub.blob.core.windows.net/stac/landcovernet-v1/catalog.json'

In [133]:
# blob_cat = ps.read_file(blob_url, stac_io = ps_api_client._stac_io)

In [109]:
# %%time
# for source_item in lcn_source_items:
#     if source_item.id in sample_source_ids:
#         sample_source_items.append(source_item)

In [12]:
lcn_labels.id

'ref_landcovernet_v1_labels'

In [13]:
lcn_labels_collection = Collection(
    id = lcn_labels.id,
    description = lcn_labels.description,
    extent = lcn_labels.extent
)

In [14]:
lcn_labels_collection.add_items(lcn_label_items)

In [73]:
lcn_source_collection = Collection(
    id = lcn_source.id,
    description = lcn_source.description,
    extent = lcn_source.extent
)

In [75]:
lcn_source_collection.add_items(lcn_source_items)

In [83]:
len(lcn_source_collection.to_dict()['links'])

102

In [84]:
lcn_source_collection.to_dict()

{'type': 'Collection',
 'id': 'ref_landcovernet_v1_source',
 'stac_version': '1.0.0',
 'description': 'LandCoverNet Source Imagery',
 'links': [{'rel': <RelType.ROOT: 'root'>,
   'href': None,
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.ITEM: 'item'>,
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_38NPP_29_20181231',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.ITEM: 'item'>,
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_38NPP_28_20181231',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.ITEM: 'item'>,
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_38NPP_27_20181231',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.ITEM: 'item'>,
   'href': 'http://api.radiant.earth/mlhub/v1/collections

### Create template Catalog, add Collections

In [114]:
lcn_catalog = Catalog(
    id="landcovernet_v1",
    description="LandCoverNet",
    href="."
)
lcn_catalog._stac_io = ps_api_client._stac_io

In [115]:
lcn_catalog.add_children([lcn_source, lcn_labels])

In [116]:
lcn_catalog.to_dict()

{'type': 'Catalog',
 'id': 'landcovernet_v1',
 'stac_version': '1.0.0',
 'description': 'LandCoverNet',
 'links': [{'rel': <RelType.ROOT: 'root'>,
   'href': '/Users/kendallsmith/scalable-model-training',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.SELF: 'self'>,
   'href': '/Users/kendallsmith/scalable-model-training',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.CHILD: 'child'>,
   'href': '/Users/kendallsmith/ref_landcovernet_v1_source/collection.json',
   'type': <MediaType.JSON: 'application/json'>},
  {'rel': <RelType.CHILD: 'child'>,
   'href': '/Users/kendallsmith/ref_landcovernet_v1_labels/collection.json',
   'type': <MediaType.JSON: 'application/json'>}],
 'stac_extensions': []}

In [117]:
for collection_link in lcn_catalog.get_child_links():
    print(collection_link)

<Link rel=child target=<CollectionClient id=ref_landcovernet_v1_source>>
<Link rel=child target=<CollectionClient id=ref_landcovernet_v1_labels>>


In [123]:
with open('labels_collection.json', 'w') as out_file:
    json.dump(lcn_catalog.get_child('ref_landcovernet_v1_labels').to_dict(), out_file)

In [124]:
with open('source_collection.json', 'w') as out_file:
    json.dump(lcn_catalog.get_child('ref_landcovernet_v1_source').to_dict(), out_file)

In [104]:
%%time
lcn_catalog.get_child('ref_landcovernet_v1_labels').get_item('ref_landcovernet_v1_labels_38NPP_22').to_dict()

NameError: name 'lcn_catalog' is not defined

In [112]:
next(next(lcn_catalog.get_children()).get_items()).to_dict()

{'type': 'Feature',
 'stac_version': '1.0.0',
 'id': 'ref_landcovernet_v1_source_38NPP_29_20181231',
 'properties': {'gsd': 10,
  'datetime': '2018-12-31T00:00:00Z',
  'eo:bands': [{'name': 'B01',
    'common_name': 'Coastal Aerosol',
    'description': 'Coastal Aerosol'},
   {'name': 'B02', 'common_name': 'Blue', 'description': 'Blue'},
   {'name': 'B03', 'common_name': 'Green', 'description': 'Green'},
   {'name': 'B04', 'common_name': 'Red', 'description': 'Red'},
   {'name': 'B05',
    'common_name': 'Vegetation Red Edge',
    'description': 'Vegetation Red Edge (704.1nm)'},
   {'name': 'B06',
    'common_name': 'Vegetation Red Edge',
    'description': 'Vegetation Red Edge (740.1nm)'},
   {'name': 'B07',
    'common_name': 'Vegetation Red Edge',
    'description': 'Vegetation Red Edge (782.8nm)'},
   {'name': 'B08', 'common_name': 'NIR', 'description': 'NIR'},
   {'name': 'B8A', 'common_name': 'Narrow NIR', 'description': 'Narrow NIR'},
   {'name': 'B09',
    'common_name': 'Water

### Make Catalog child Links relative to Catalog root

In [40]:
for collection_link in lcn_catalog.get_child_links():
    # Get the collection link relative to the original catalog at the broken Azure blob URL
    collection_link.target = ps_utils.make_relative_href(
        collection_link.get_target_str(),
        start_href=MLHUB_API_ROOT,
        start_is_dir=True
    )

In [41]:
for collection_link in lcn_catalog.get_child_links():
    print(collection_link.get_target_str())

/Users/kendallsmith/ref_landcovernet_v1_source/collection.json
/Users/kendallsmith/ref_landcovernet_v1_labels/collection.json


### Make Collections Links relative to Catalog root

In [42]:
%%time
for collection in lcn_catalog.get_collections():
    print(f'Updating relative paths for {collection.id}')
    
    for item_link in collection.get_item_links():
        # First, get the Item's URL relative to the original catalog at the broken Azure URL
        relative_to_root = ps_utils.make_relative_href(
            item_link.get_target_str(),
            start_href=MLHUB_API_ROOT,
            start_is_dir=True
        )
        # print(relative_to_root)
        
        # Next, get the absolute local path and then the path relative to the collection
        absolute_path = ps_utils.make_absolute_href(
            relative_to_root,
            start_href=collection.get_root_link().href
        )
        # print(absolute_path)
        
        relative_to_collection = ps_utils.make_relative_href(
            absolute_path,
            start_href=collection.get_self_href()
        )
        item_link.target = relative_to_collection

Updating relative paths for ref_landcovernet_v1_source
Updating relative paths for ref_landcovernet_v1_labels


In [49]:
# destination_catalog_dir = Path(os.getcwd()) / "output"
destination_catalog_dir = os.path.join(os.getcwd(), 'output')

### Make Item Links relative to Collections

In [50]:
%%time
for current_catalog, sub_catalogs, items in lcn_catalog.walk():
    for item in items:
        # WARNING: NOT IDEMPOTENT
        # Prefix the Item ID with the Collection ID
        item.id = current_catalog.id + "-" + item.id
        item.set_root(lcn_catalog)
        item.set_collection(current_catalog)

        # Fix the asset HREFs
        for asset in item.assets.values():
            relative_to_root = ps_utils.make_relative_href(
                asset.href,
                start_href=MLHUB_API_ROOT,
                start_is_dir=True
            )
            asset.href = ps_utils.make_absolute_href(
                relative_to_root,
                start_href=destination_catalog_dir,
                start_is_dir=True
            )
        
        # Fix the source links, if present
        for source_link in item.get_links(rel=LabelRelType.SOURCE):
            # First, get the path relative to the old root
            relative_to_root = ps_utils.make_relative_href(
                source_link.get_target_str(),
                start_href=MLHUB_API_ROOT,
                start_is_dir=True
            )
            # Next get the absolute local path and then the path relative to THIS ITEM
            absolute_path = ps_utils.make_absolute_href(
                relative_to_root,
                start_href=item.get_root_link().href
            )
            source_link.target = ps_utils.make_relative_href(
                absolute_path,
                start_href=item.get_self_href()
            )

KeyboardInterrupt: 

In [72]:
for current_catalog, sub_catalogs, items in lcn_catalog.walk():
    for catalog in sub_catalogs:
        print(catalog.get_self_href())

http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source
http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_labels


### Instantiate Dask Client

In [8]:
client = ds_client()
client.run(lambda: warnings.filterwarnings("ignore", "Creating an ndarray from ragged"))
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 55635 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:55635/status,

0,1
Dashboard: http://127.0.0.1:55635/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:55636,Workers: 4
Dashboard: http://127.0.0.1:55635/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:55647,Total threads: 2
Dashboard: http://127.0.0.1:55651/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55641,
Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-xpd1hf6j,Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-xpd1hf6j

0,1
Comm: tcp://127.0.0.1:55650,Total threads: 2
Dashboard: http://127.0.0.1:55656/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55642,
Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-ye326_5_,Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-ye326_5_

0,1
Comm: tcp://127.0.0.1:55648,Total threads: 2
Dashboard: http://127.0.0.1:55652/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55639,
Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-_a_cr9tp,Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-_a_cr9tp

0,1
Comm: tcp://127.0.0.1:55649,Total threads: 2
Dashboard: http://127.0.0.1:55653/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55640,
Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-r14jqhy8,Local directory: /Users/kendallsmith/scalable-model-training/dask-worker-space/worker-r14jqhy8


#### Load Label Items from raw JSON collection

In [146]:
item_files = glob('/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/**/*.json', recursive=True)

In [147]:
item_files = [file for file in item_files if not file.__contains__('collection.json') and not file.__contains__('catalog.json')]

In [148]:
item_files[0:5]

['/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/landcovernet_v1_labels_38NPP_05/landcovernet_v1_labels_38NPP_05.json',
 '/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/landcovernet_v1_labels_30PWT_01/landcovernet_v1_labels_30PWT_01.json',
 '/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/landcovernet_v1_labels_35MNT_01/landcovernet_v1_labels_35MNT_01.json',
 '/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/landcovernet_v1_labels_32PPA_05/landcovernet_v1_labels_32PPA_05.json',
 '/Users/kendallsmith/scalable-model-training/landcovernet-v1/landcovernet_v1_labels/landcovernet_v1_labels_31PGS_02/landcovernet_v1_labels_31PGS_02.json']

In [149]:
len(item_files)

1980

In [170]:
features = []
last_idx = 0

In [171]:
%%time
for idx, item_filepath in enumerate(item_files):
    
    with open(item_filepath, 'r') as in_file:
        label_data = json.load(in_file)
        
    label_feature = {
        "id": str(last_idx),
        "type": label_data['type'],
        "properties": {
            "name": os.path.split(item_filepath)[-1]
        },
        "geometry": label_data['geometry'],
        "bbox": label_data['bbox'],
        
    }
    
    features.append(label_feature)
    last_idx = last_idx + idx + 1

CPU times: user 157 ms, sys: 67.2 ms, total: 224 ms
Wall time: 572 ms


In [172]:
features[0:5]

[{'id': '0',
  'type': 'Feature',
  'properties': {'name': 'landcovernet_v1_labels_38NPP_05.json'},
  'geometry': {'type': 'Polygon',
   'coordinates': [[[46.22372650315114, 7.637727360104802],
     [46.223792488589126, 7.66087904636043],
     [46.24699893998356, 7.660812489646023],
     [46.24693170406418, 7.637661006840825],
     [46.22372650315114, 7.637727360104802]]]},
  'bbox': [46.22372650315114,
   7.637661006840825,
   46.24699893998356,
   7.66087904636043]},
 {'id': '1',
  'type': 'Feature',
  'properties': {'name': 'landcovernet_v1_labels_30PWT_01.json'},
  'geometry': {'type': 'Polygon',
   'coordinates': [[[-2.110483678027241, 11.084611605508577],
     [-2.110413637624084, 11.107761306888117],
     [-2.086974187855211, 11.107691143828443],
     [-2.08704607311562, 11.08454159225002],
     [-2.110483678027241, 11.084611605508577]]]},
  'bbox': [-2.110483678027241,
   11.08454159225002,
   -2.086974187855211,
   11.107761306888117]},
 {'id': '3',
  'type': 'Feature',
  'pro

In [173]:
feature_collection = {
    "type": "FeatureCollection",
    "features": features,
    "bbox": lcn_labels.extent.spatial.bboxes[0]
}

In [174]:
lcn_gdf = GeoDataFrame.from_features(feature_collection)

In [175]:
lcn_gdf.head()

Unnamed: 0,geometry,name
0,"POLYGON ((46.22373 7.63773, 46.22379 7.66088, ...",landcovernet_v1_labels_38NPP_05.json
1,"POLYGON ((-2.11048 11.08461, -2.11041 11.10776...",landcovernet_v1_labels_30PWT_01.json
2,"POLYGON ((27.33485 -2.70273, 27.33485 -2.67957...",landcovernet_v1_labels_35MNT_01.json
3,"POLYGON ((10.26125 14.04015, 10.26138 14.06329...",landcovernet_v1_labels_32PPA_05.json
4,"POLYGON ((5.26197 15.25760, 5.26222 15.28072, ...",landcovernet_v1_labels_31PGS_02.json


In [204]:
json.dump(shapely.geometry.mapping(lcn_gdf.unary_union.convex_hull), open('unary_geometry.json', 'w'))

In [210]:
json.dump(shapely.geometry.mapping(lcn_gdf.iloc[0:20].unary_union), open('unary_geometry2.json', 'w'))

In [205]:
os.getcwd()

'/Users/kendallsmith/scalable-model-training'

In [177]:
# !pip install dask_geopandas

In [179]:
import dask_geopandas

In [211]:
NPARTITIONS = 1980
lcn_dgdf = dask_geopandas.from_geopandas(lcn_gdf, npartitions=NPARTITIONS, sort=False)

In [None]:
def query(labels):
    """
    Find STAC items for points in the `points` DataFrame

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        A GeoDataFrame

    Returns
    -------
    geopandas.GeoDataFrame
        A new geopandas.GeoDataFrame with a `stac_item` column containing the STAC
        item that covers each point.
    """
    intersects = shapely.geometry.mapping(labels.unary_union.convex_hull)

    # search_start = "2018-01-01"
    # search_end = "2019-12-31"
    catalog = pystac_client.Client.open(MLHUB_ROOT_URL)

    # The time frame in which we search for non-cloudy imagery
    search = catalog.search(
        collections=[lcn_labels_name],
        intersects=intersects,
        datetime=[search_start, search_end],
        query={}, # "eo:cloud_cover": {"lt": 10}
        limit=100,
    )
    ic = search.get_all_items_as_dict()

    features = ic["features"]
    features_d = {item["id"]: item for item in features}

    data = {
        #"eo:cloud_cover": [],
        "geometry": [],
    }

    index = []

    for item in features:
        #data["eo:cloud_cover"].append(item["properties"]["eo:cloud_cover"])
        data["geometry"].append(shapely.geometry.shape(item["geometry"]))
        index.append(item["id"])

    items = geopandas.GeoDataFrame(data, index=index, geometry="geometry")#.sort_values("eo:cloud_cover")
    point_list = points.geometry.tolist()

    point_items = []
    for point in point_list:
        covered_by = items[items.covers(point)]
        if len(covered_by):
            point_items.append(features_d[covered_by.index[0]])
        else:
            # There weren't any scenes matching our conditions for this point (too cloudy)
            point_items.append(None)

    return labels.assign(stac_item=point_items)

In [186]:
type(lcn_dgdf)

dask_geopandas.core.GeoDataFrame

In [188]:
lcn_dgdf

Unnamed: 0_level_0,geometry,name
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1
,geometry,object
,...,...
...,...,...
,...,...
,...,...


In [193]:
def partitioned_geom_extract(labels):
    intersects = shapely.geometry.mapping(labels.unary_union.convex_hull)
    
    # print(intersects)

In [201]:
%%time

with ds_client(n_workers=16) as client:
    print(client.dashboard_link)
    df2 = lcn_dgdf.map_partitions(query).compute()

http://127.0.0.1:8787/status
{'type': 'Polygon', 'coordinates': (((0.0, 0.0), (0.0, 1.0), (1.0, 2.0), (1.0, 1.0), (0.0, 0.0)),)}


distributed.protocol.core - CRITICAL - Failed to deserialize
Traceback (most recent call last):
  File "/Users/kendallsmith/opt/anaconda3/envs/mlhub/lib/python3.9/site-packages/distributed/protocol/core.py", line 111, in loads
    return msgpack.loads(
  File "msgpack/_unpacker.pyx", line 194, in msgpack._cmsgpack.unpackb
  File "/Users/kendallsmith/opt/anaconda3/envs/mlhub/lib/python3.9/site-packages/distributed/protocol/core.py", line 103, in _decode_default
    return merge_and_deserialize(
  File "/Users/kendallsmith/opt/anaconda3/envs/mlhub/lib/python3.9/site-packages/distributed/protocol/serialize.py", line 488, in merge_and_deserialize
    return deserialize(header, merged_frames, deserializers=deserializers)
  File "/Users/kendallsmith/opt/anaconda3/envs/mlhub/lib/python3.9/site-packages/distributed/protocol/serialize.py", line 417, in deserialize
    return loads(header, frames)
  File "/Users/kendallsmith/opt/anaconda3/envs/mlhub/lib/python3.9/site-packages/distributed/protoc

KilledWorker: ("('from_pandas-374f41ae8f1baa2f218821dc52b86b9a', 12)", <WorkerState 'tcp://127.0.0.1:52221', name: 5, status: closed, memory: 0, processing: 4>)

In [215]:
shapely.geometry.mapping(lcn_gdf.iloc[0:1].unary_union)

{'type': 'Polygon',
 'coordinates': (((46.22372650315114, 7.637727360104802),
   (46.223792488589126, 7.66087904636043),
   (46.24699893998356, 7.660812489646023),
   (46.24693170406418, 7.637661006840825),
   (46.22372650315114, 7.637727360104802)),)}

In [216]:
search_start, search_end

('2018-01-01', '2018-12-31')

In [220]:
mlhub_catalog = ps_client.open(
    url=MLHUB_ROOT_URL,
    parameters={"key": MLHUB_API_KEY}, 
    ignore_conformance=True
)

In [231]:
%%time
# The time frame in which we search for non-cloudy imagery
search = mlhub_catalog.search(
    collections=[lcn_source_name],
    intersects=shapely.geometry.mapping(lcn_gdf.iloc[0:1].unary_union),
    datetime=[search_start, search_end],
    query={}, # "eo:cloud_cover": {"lt": 10}
    limit=10,
)

CPU times: user 5.13 ms, sys: 7.46 ms, total: 12.6 ms
Wall time: 10.8 ms


In [232]:
%%time
item_collection = search.get_all_items()

CPU times: user 160 ms, sys: 17.1 ms, total: 177 ms
Wall time: 3.32 s


In [230]:
type(item_collection)

pystac.item_collection.ItemCollection

In [None]:
ic = search.get_all_items_as_dict()

In [233]:
ic.keys()

dict_keys(['type', 'features'])

In [223]:
len(ic['features'])

73

In [227]:
ic['features'][0]

{'id': 'ref_landcovernet_v1_source_38NPP_05_20181231',
 'bbox': [46.22372650315114,
  7.637661006840825,
  46.24699893998356,
  7.66087904636043],
 'type': 'Feature',
 'links': [{'rel': 'collection',
   'type': 'application/json',
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source'},
  {'rel': 'parent',
   'type': 'application/json',
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source'},
  {'rel': 'root',
   'type': 'application/json',
   'href': 'http://api.radiant.earth/mlhub/v1/'},
  {'rel': 'self',
   'type': 'application/geo+json',
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_38NPP_05_20181231'},
  {'rel': 'alternate',
   'type': 'application/json',
   'title': 'tiles',
   'href': 'http://api.radiant.earth/mlhub/v1/collections/ref_landcovernet_v1_source/items/ref_landcovernet_v1_source_38NPP_05_20181231/tiles'}],
 'assets': {'B01': {'href': 'h