In [1]:
import leafmap
import geopandas as gpd
from geospatial_tools import DATA_DIR
from geospatial_tools.utils import get_yaml_config, download_url, unzip_file


## Base data

The USA polygon is based on the 2018 `cb_2018_us_nation_20m` shapefile, taken from here: 
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

It was then processed using QGIS to keep only the contiguous states, without any islands.

The Sentinel 2 grid was taken from the kml file found here: 
https://sentiwiki.copernicus.eu/web/s2-products

Below is some code to help with the download part.

### Downloading data
Let's download our source data

In [2]:
file_configs = get_yaml_config("data_file_links")

# Despite the `_path` suffix, these two values are the download URLs read from the config.
raw_usa_polygon_path = file_configs["united_states_polygon"]["url"]
raw_s2_tiling_grid_path = file_configs["sentinel_2_tiling_grid"]["url"]
download_list = {
    "raw_usa_polygon": raw_usa_polygon_path,
    "raw_s2_tiling_grid": raw_s2_tiling_grid_path,
}

# Each archive is requested as `<DATA_DIR>/<key>.zip`; the value returned by
# `download_url` for each one is collected for the extraction step.
file_list = []
for key, url in download_list.items():
    file_list.append(download_url(url=url, filename=f"{DATA_DIR}/{key}.zip"))

file_list



[2024-09-05 13:52:28] INFO       [MainThread][geospatial_tools.utils] Yaml config file [/home/dev/projects/geospatial-tools/configs/data_file_links.yaml] found.
[2024-09-05 13:52:28] INFO       [MainThread][geospatial_tools.utils] Loading YAML config file [/home/dev/projects/geospatial-tools/configs/data_file_links.yaml].
[2024-09-05 13:52:28] INFO       [MainThread][geospatial_tools.utils] Downloaded /home/dev/projects/geospatial-tools/data/raw_usa_polygon.zip successfully.
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Downloaded /home/dev/projects/geospatial-tools/data/raw_s2_tiling_grid.zip successfully.


[PosixPath('/home/dev/projects/geospatial-tools/data/raw_usa_polygon.zip'),
 PosixPath('/home/dev/projects/geospatial-tools/data/raw_s2_tiling_grid.zip')]

In [3]:
# Extract every downloaded archive into DATA_DIR.
# `unzip_file` is called for its side effect only, so a plain loop is used rather than
# a list comprehension (which would build and display a list of `None` values).
for zip_file in file_list:
    unzip_file(zip_path=zip_file, extract_to=DATA_DIR)

[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.shp.ea.iso.xml]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.shp.iso.xml]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.shp]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.shx]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.dbf]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.prj]
[2024-09-05 13:52:31] INFO       [MainThread][geospatial_tools.utils] Extracted: [cb_2018_us_nation_20m.cpg]
[2024-09-05 13:52:32] INFO       [MainThread][geospatial_tools.utils] Extracted: [S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml]


[None, None]

### Initial pre-processing

The above layers were processed using QGIS.

For the purpose of this analysis, only the contiguous lower 48 states have been conserved; smaller islands/land masses 
have also been stripped.

The S2 tiling grid has been trimmed to keep only the grid cells that overlap with the 
contiguous states.

Since our area of study is quite large, the `EPSG:5070` projection was chosen, as it
covers the whole area and introduces minimal distortion while preserving area.

The files below have also been saved in this repository.

In [4]:
# Pre-processed USA polygon layer
USA_POLYGON_FILE = DATA_DIR / "usa_polygon_5070.gpkg"
usa_polygon = gpd.read_file(USA_POLYGON_FILE)

# Pre-processed Sentinel 2 tiling grid layer
S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070.gpkg"
s2_grid = gpd.read_file(S2_USA_GRID_FILE)

In [5]:
# Inspect the USA polygon layer loaded above
usa_polygon

Unnamed: 0,AFFGEOID,GEOID,NAME,geometry
0,0100000US,US,United States,"MULTIPOLYGON (((-2116048.733 3142966.552, -211..."


In [6]:
# Inspect the Sentinel 2 tiling grid; the `name` column holds the tile ids
s2_grid

Unnamed: 0,name,folders,description,geometry
0,12TUP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1386334.944 2487548.77 0, -..."
1,12TYQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-976300.478 2523767.452 0, -..."
2,12TYR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-960099.705 2622374.255 0, -..."
3,12TYN,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1008622.024 2325748.358 0, ..."
4,12TYP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-992478.385 2424861.34 0, -8..."
...,...,...,...,...
977,12TTM,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1515431.586 2304192.826 0, ..."
978,12TUK,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1448525.813 2089886.667 0, ..."
979,12TUQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1371006.917 2586590.133 0, ..."
980,12TUR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1355793.563 2685354.08 0, -..."


In [7]:
# Base map; both layers are added on top of it
m = leafmap.Map(center=[40, -98], zoom=4)

# USA polygon (shown in blue)
m.add_gdf(usa_polygon, layer_name="usa")

# Sentinel 2 grid (shown in red)
m.add_gdf(s2_grid, layer_name="s2_grid", style={"color": "red"})

m

Map(center=[40, -98], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_te…

## Creating our grid

From this, we want to create a grid of square polygons with which we will later on
query the [Planetary Computer](https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a)
Sentinel 2 dataset and clip the selected Sentinel 2 images.

For the purpose of this notebook, the grid that will be created will use 5km by 5km squares 
(`grid_size = 5000` below).

In [8]:
from geospatial_tools.vector import create_vector_grid_parallel, to_geopackage, select_polygons_by_location

In [9]:
# Side length of each square grid cell, in CRS units
# (the "5km" in the file name below assumes EPSG:5070 units are metres — TODO confirm)
grid_size = 5000
# Extent of the USA polygon, used as the bounding box of the grid
bbox = usa_polygon.total_bounds
# Output file for the full (unfiltered) grid
grid_5km_filename = DATA_DIR / "polygon_grid_5km.gpkg"

In [10]:
print("Starting processing for [create_vector_grid_parallel]")

# Build the square grid over the bounding box of the USA polygon
grid_5km = create_vector_grid_parallel(bounding_box=bbox, grid_size=grid_size, crs="EPSG:5070")

# Report the size of the grid that was actually created
print(f"Number of polygons in grid_5km : {len(grid_5km)}")

to_geopackage(gdf=grid_5km, filename=grid_5km_filename)
grid_5km

Starting processing for [create_vector_grid_parallel]
[2024-09-05 13:52:34] INFO       [MainThread][geospatial_tools.vector] Creating grid coordinates for bounding box [[-2356113.74289801   301469.31619713  2258154.44089948  3165721.6501298 ]]
[2024-09-05 13:52:34] INFO       [MainThread][geospatial_tools.vector] Creating flattened grid coordinates
[2024-09-05 13:52:34] INFO       [MainThread][geospatial_tools.vector] Number of workers used: 4
[2024-09-05 13:52:34] INFO       [MainThread][geospatial_tools.vector] Allocating polygon array for [528879] polygons
[2024-09-05 13:52:34] INFO       [MainThread][geospatial_tools.vector] Creating polygons from chunks
[2024-09-05 13:52:42] INFO       [MainThread][geospatial_tools.vector] Managing properties
[2024-09-05 13:52:43] INFO       [MainThread][geospatial_tools.vector] Creating spatial index
[2024-09-05 13:52:43] INFO       [MainThread][geospatial_tools.vector] Generating polygon UUIDs
Printing len(grid_parallel) to check if grid contain

Unnamed: 0,geometry,feature_id
0,"POLYGON ((-2356113.743 301469.316, -2351113.74...",7c514565-eabb-4157-8bb5-46db81b684b1
1,"POLYGON ((-2351113.743 301469.316, -2346113.74...",4b2f235d-1a03-423c-b2ec-301ced3d1fba
2,"POLYGON ((-2346113.743 301469.316, -2341113.74...",902fc667-1222-420c-869d-f716c9429537
3,"POLYGON ((-2341113.743 301469.316, -2336113.74...",5e77f4a4-362e-4657-9314-270f84adb4ab
4,"POLYGON ((-2336113.743 301469.316, -2331113.74...",65ecbc35-d71b-421f-8d84-fefe0f20f8ff
...,...,...
528874,"POLYGON ((2233886.257 3161469.316, 2238886.257...",b7b65182-4be3-42af-8de1-98326839cb82
528875,"POLYGON ((2238886.257 3161469.316, 2243886.257...",621af9a4-88ef-44c5-9b11-0525a38d9d8f
528876,"POLYGON ((2243886.257 3161469.316, 2248886.257...",888100c4-4186-4c84-beb3-463472d6fa20
528877,"POLYGON ((2248886.257 3161469.316, 2253886.257...",fa76df8a-51f8-4711-b6ac-91fc194a3403


### Selecting the useful polygons

Now, since our grid was created using the extent of our input polygon (continental USA), we need to filter out the polygons that do not intersect with it.

Doing this in Python is not the most efficient way to do things, but since it's a step that shouldn't be done over and over, it's not that critical.

If you ever need to do this step more efficiently because the data is too big or too complex, it would be better to go through QGIS, PyQGIS, GDAL or 
some other more efficient tool for this operation. 

In [11]:
print("Starting intersect selection")

# Select the grid cells by their location relative to the USA polygon
intersecting_polygons = select_polygons_by_location(grid_5km, usa_polygon, num_of_workers=4)

# Optional: write the selection to its own file
intersecting_polygons_filename = DATA_DIR / "intersecting_polygon_grid_5km.gpkg"
to_geopackage(gdf=intersecting_polygons, filename=intersecting_polygons_filename)

intersecting_polygons

Starting intersect selection
[2024-09-05 13:52:48] INFO       [MainThread][geospatial_tools.vector] Number of workers used: 4
[2024-09-05 13:53:09] INFO       [MainThread][geospatial_tools.vector] Concatenating results
[2024-09-05 13:53:09] INFO       [MainThread][geospatial_tools.vector] Creating spatial index
[2024-09-05 13:53:09] INFO       [MainThread][geospatial_tools.vector] Filtering columns of the results
[2024-09-05 13:53:09] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-05 13:53:11] INFO       [MainThread][geospatial_tools.vector] File [/home/dev/projects/geospatial-tools/data/intersecting_polygon_grid_5km.gpkg] took 2.000399112701416 seconds to write.


Unnamed: 0,geometry,feature_id
0,"POLYGON ((1513886.257 301469.316, 1518886.257 ...",7fe4f24c-b1cd-4011-b589-c501d0f65d82
1,"POLYGON ((1518886.257 301469.316, 1523886.257 ...",e96627be-6bfd-4bd0-b559-58a43cd224e4
2,"POLYGON ((1523886.257 301469.316, 1528886.257 ...",a7264cb8-6b5e-42fb-9502-77c1bb80c8f6
3,"POLYGON ((1528886.257 301469.316, 1533886.257 ...",b872b76d-20d8-429a-a40c-eafcdaaf1ec0
4,"POLYGON ((-146113.743 306469.316, -141113.743 ...",32de0909-5b49-4d0c-8ad5-cd263645ea9c
...,...,...
316126,"POLYGON ((-1966113.743 3161469.316, -1961113.7...",a3e74ac0-215b-4c55-9119-b0f2e8e377ec
316127,"POLYGON ((-1961113.743 3161469.316, -1956113.7...",f4716cbd-77f0-4872-aea2-04f9e1c8d58f
316128,"POLYGON ((-1956113.743 3161469.316, -1951113.7...",6d93dade-a115-425f-b0c6-21f00a5ad31b
316129,"POLYGON ((-1951113.743 3161469.316, -1946113.7...",9e40261c-dc6d-49d5-8cf5-9f84891bd19b


### Visualizing the selected polygons

This will take more or less time, depending on the number of polygons.

(Do not try this with a grid size smaller than 10000m)

In [12]:
# Rendering takes a few minutes and map navigation will be slow.
# The map is zoomed in to help with processing.
m_intersecting_polygons = leafmap.Map(center=[39.7, -123], zoom=10)
m_intersecting_polygons.add_gdf(
    intersecting_polygons,
    layer_name="intersecting_polygons",
    style={"color": "blue"},
)
m_intersecting_polygons

Map(center=[39.7, -123], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out…

## Data processing pipeline prototype

### Finding the best image for each S2 tiling grid

In [13]:
from geospatial_tools.vector import to_geopackage, select_polygons_by_location


In [14]:
# This is the full list of S2 tile ids, taken from the `name` column of the S2 grid
s2_tile_grid_list = s2_grid["name"].to_list()
s2_tile_grid_list

['12TUP',
 '12TYQ',
 '12TYR',
 '12TYN',
 '12TYP',
 '12TYS',
 '12TYT',
 '11SMB',
 '11SMC',
 '11SLV',
 '11SMA',
 '11SMS',
 '12UUV',
 '11SMT',
 '11SMD',
 '11SMR',
 '12UUU',
 '11SNA',
 '12TWS',
 '11SNB',
 '12TWT',
 '11SMU',
 '12TWQ',
 '11SMV',
 '12TWR',
 '12TXM',
 '11SNS',
 '12TXN',
 '11SNC',
 '12TXK',
 '11SND',
 '12TXL',
 '11SKD',
 '12TXR',
 '12TXS',
 '11SKB',
 '12TXP',
 '11SKC',
 '12TXQ',
 '11SKU',
 '12TYL',
 '11SKV',
 '12TYM',
 '12TXT',
 '11SKT',
 '12TYK',
 '11SLC',
 '11SLD',
 '11SLA',
 '11SLB',
 '11SLT',
 '11SLU',
 '19TBF',
 '19TBG',
 '19TDM',
 '19TDN',
 '19TEJ',
 '19TEK',
 '19TEN',
 '19TEL',
 '19TEM',
 '18STE',
 '18STF',
 '19TCG',
 '18STC',
 '19TCH',
 '18STD',
 '18STJ',
 '19TCF',
 '19TCL',
 '18STG',
 '19TCM',
 '18STH',
 '19TCJ',
 '19TCK',
 '19TDF',
 '19TDG',
 '19TDK',
 '19TDL',
 '19TDJ',
 '16TCQ',
 '16TCR',
 '16TCN',
 '16TCP',
 '16TDK',
 '16TDL',
 '16TCS',
 '16TCT',
 '16TDP',
 '15RWQ',
 '16TDQ',
 '16TDM',
 '15RXN',
 '16TDN',
 '15RXP',
 '16TDT',
 '16TEK',
 '16TDR',
 '16TDS',
 '16SGB',


In [15]:
# The full list is a bit long, so the rest of this notebook works with the following
# subset of four S2 tile ids
s2_tile_grid_subset_list = ["10TDK", "10TEK", "10SEJ", "10SDJ"]

### Building our processing list

First, let's create a subselection of our dataset

In [16]:
import geopandas as gpd
from geospatial_tools import DATA_DIR

In [17]:
# Same file and variable names as in the initial pre-processing section: the S2 grid
# is simply read again here (presumably so this section can be run on its own).
S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070.gpkg"
s2_grid = gpd.read_file(S2_USA_GRID_FILE)
s2_grid

Unnamed: 0,name,folders,description,geometry
0,12TUP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1386334.944 2487548.77 0, -..."
1,12TYQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-976300.478 2523767.452 0, -..."
2,12TYR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-960099.705 2622374.255 0, -..."
3,12TYN,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1008622.024 2325748.358 0, ..."
4,12TYP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-992478.385 2424861.34 0, -8..."
...,...,...,...,...
977,12TTM,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1515431.586 2304192.826 0, ..."
978,12TUK,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1448525.813 2089886.667 0, ..."
979,12TUQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1371006.917 2586590.133 0, ..."
980,12TUR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-1355793.563 2685354.08 0, -..."


In [18]:
# Creating our S2 grid tile subset

S2_USA_GRID_SUBSET_FILE = DATA_DIR / "s2_grid_usa_polygon_5070_subset.gpkg"

# Keep the rows of the S2 grid whose tile name belongs to the chosen subset
is_in_subset = s2_grid["name"].isin(s2_tile_grid_subset_list)
s2_grid_subset = s2_grid[is_in_subset]

# Optional: write the subset to its own geopackage
to_geopackage(gdf=s2_grid_subset, filename=S2_USA_GRID_SUBSET_FILE)

s2_grid_subset

[2024-09-05 13:54:12] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-05 13:54:12] INFO       [MainThread][geospatial_tools.vector] File [/home/dev/projects/geospatial-tools/data/s2_grid_usa_polygon_5070_subset.gpkg] took 0.060523271560668945 seconds to write.


Unnamed: 0,name,folders,description,geometry
823,10SEJ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-2262059.689 2182514.584 0, ..."
841,10SDJ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-2357236.175 2210256.163 0, ..."
846,10TDK,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-2329019.777 2307113.875 0, ..."
857,10TEK,Features,TILE PROPERTIES<br><table border=0 cellpadding...,"MULTIPOLYGON Z (((-2233778.019 2279366.081 0, ..."


In [19]:
# Creating our polygon grid subset

print("Starting intersect selection")

# Grid polygons selected against the S2 tile subset with the "within" predicate
intersect_polygons_subset = select_polygons_by_location(
    intersecting_polygons,
    s2_grid_subset,
    predicate="within",
    num_of_workers=4,
)

# Optional: write the subset to its own file
intersect_polygons_subset_filename = DATA_DIR / "intersecting_polygon_grid_5km_subset.gpkg"
to_geopackage(gdf=intersect_polygons_subset, filename=intersect_polygons_subset_filename)

# Alternative to the selection above, once the file has been written:
# intersect_polygons_subset = gpd.read_file(intersect_polygons_subset_filename)

intersect_polygons_subset

Starting intersect selection
[2024-09-05 13:54:12] INFO       [MainThread][geospatial_tools.vector] Number of workers used: 4
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] Concatenating results
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] Creating spatial index
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] Dropping duplicates
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] Filtering columns of the results
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-05 13:54:13] INFO       [MainThread][geospatial_tools.vector] File [/home/dev/projects/geospatial-tools/data/intersecting_polygon_grid_5km_subset.gpkg] took 0.07275915145874023 seconds to write.


Unnamed: 0,geometry,feature_id
0,"POLYGON ((-2206113.743 2051469.316, -2201113.7...",bb4e47a5-7c45-4df8-9345-05e97aeaab3b
1,"POLYGON ((-2201113.743 2051469.316, -2196113.7...",2b092e12-da4a-4de6-aba3-fd7408d303db
2,"POLYGON ((-2196113.743 2051469.316, -2191113.7...",ffb57d63-01a2-498a-94e4-9f86e749dafd
3,"POLYGON ((-2221113.743 2056469.316, -2216113.7...",995e0a6d-16ed-480b-9bf9-5b592aa20289
4,"POLYGON ((-2216113.743 2056469.316, -2211113.7...",f5008c92-bd9d-40b5-a117-a8ad43d8fed6
...,...,...
1571,"POLYGON ((-2306113.743 2291469.316, -2301113.7...",4fe266a0-3548-4c19-98a7-cf43c52dc80c
1572,"POLYGON ((-2301113.743 2291469.316, -2296113.7...",4451dc44-5fdb-4bb4-b8c0-909185d5d3fc
1573,"POLYGON ((-2326113.743 2296469.316, -2321113.7...",704ffed9-5e22-4784-9bf1-8c05cfc3fc22
1574,"POLYGON ((-2321113.743 2296469.316, -2316113.7...",fede4d74-8f19-4acf-ba80-61b5e58676e4


### Finding the best products for our subset use case

In [20]:
from geospatial_tools.planetary_computer.sentinel_2 import BestProductsForFeatures, download_and_process_sentinel2_asset

In [21]:
# Column of `s2_grid_subset` in which the id of each S2 tile is found
s2_feature_name_columns = "name"

# Column of the vector features in which the results will be stored
vector_column_name = "s2_tiles"

# Initiating our client
best_products_client = BestProductsForFeatures(
    sentinel2_tiling_grid=s2_grid_subset,
    sentinel2_tiling_grid_column=s2_feature_name_columns,
    vector_features=intersect_polygons_subset,
    vector_features_column=vector_column_name,
    max_cloud_cover=15,
)

In [22]:
# Executing the search
#
# This search looks only for complete products, meaning products with less than
# 5 percent of nodata.

# With these values, the search covers June 1st to July 31st of both 2023 and 2024
start_year = 2023
end_year = 2024
start_month = 6
end_month = 7

best_products_client.create_date_ranges(start_year, end_year, start_month, end_month)
# Mapping of S2 tile id -> best product found (id, cloud cover, nodata)
products = best_products_client.find_best_complete_products()
products

[2024-09-05 13:54:14] INFO       [ThreadPoolExecutor-0_2][geospatial_tools.stac] Initiating STAC API search for the following date ranges : 
	[['2023-06-01T00:00:00Z/2023-07-31T23:59:59Z', '2024-06-01T00:00:00Z/2024-07-31T23:59:59Z'] 
	Query : [{'eo:cloud_cover': {'lt': 15}, 's2:mgrs_tile': {'in': ['10TDK']}}]
[2024-09-05 13:54:14] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] Initiating STAC API search for the following date ranges : 
	[['2023-06-01T00:00:00Z/2023-07-31T23:59:59Z', '2024-06-01T00:00:00Z/2024-07-31T23:59:59Z'] 
	Query : [{'eo:cloud_cover': {'lt': 15}, 's2:mgrs_tile': {'in': ['10SEJ']}}]
[2024-09-05 13:54:14] INFO       [ThreadPoolExecutor-0_3][geospatial_tools.stac] Initiating STAC API search for the following date ranges : 
	[['2023-06-01T00:00:00Z/2023-07-31T23:59:59Z', '2024-06-01T00:00:00Z/2024-07-31T23:59:59Z'] 
	Query : [{'eo:cloud_cover': {'lt': 15}, 's2:mgrs_tile': {'in': ['10TEK']}}]
[2024-09-05 13:54:14] INFO       [ThreadPoolExecutor-0_1][geospa

{'10SEJ': {'id': 'S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337',
  'cloud_cover': 0.000209,
  'no_data': 0.221346},
 '10SDJ': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346',
  'cloud_cover': 0.006778,
  'no_data': 1.7e-05},
 '10TDK': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412',
  'cloud_cover': 0.000707,
  'no_data': 1e-05}}

In [23]:
# Selecting the best products for each vector tile
# This step is necessary as some of our vector polygons can be within multiple S2 tiles.
# The best available S2 tile is therefore selected for each vector polygon.

best_results_path = DATA_DIR / "vector_tiles_with_s2tiles_subset.gpkg"
best_results = best_products_client.select_best_products_per_feature()
to_geopackage(best_results, best_results_path)

# NOTE: the reprojection is done in place and after the file has been written,
# so it only affects the in-memory GeoDataFrame used by the cells below.
best_results.to_crs(5070, inplace=True)
best_results


[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Creating temporary UUID field for join operations
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Starting process to find and identify contained features using spatial 'within' join operation
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Grouping results
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Cleaning and merging results
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Spatial join operation is completed
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.planetary_computer.sentinel_2] Writing best product IDs to dataframe
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-05 13:54:32] INFO       [MainThread][geospatial_tools.vector] File [/home/dev/projects/geospatial-tools/data/vector_tiles_with_s2tiles_subset.gpkg] took 0.06428122520446777 seconds 

Unnamed: 0,geometry,feature_id,s2_tiles,best_s2_product_id
0,"POLYGON ((-2206113.743 2051469.316, -2201113.7...",bb4e47a5-7c45-4df8-9345-05e97aeaab3b,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
1,"POLYGON ((-2201113.743 2051469.316, -2196113.7...",2b092e12-da4a-4de6-aba3-fd7408d303db,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
2,"POLYGON ((-2196113.743 2051469.316, -2191113.7...",ffb57d63-01a2-498a-94e4-9f86e749dafd,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
3,"POLYGON ((-2221113.743 2056469.316, -2216113.7...",995e0a6d-16ed-480b-9bf9-5b592aa20289,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
4,"POLYGON ((-2216113.743 2056469.316, -2211113.7...",f5008c92-bd9d-40b5-a117-a8ad43d8fed6,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
...,...,...,...,...
1515,"POLYGON ((-2306113.743 2291469.316, -2301113.7...",4fe266a0-3548-4c19-98a7-cf43c52dc80c,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1516,"POLYGON ((-2301113.743 2291469.316, -2296113.7...",4451dc44-5fdb-4bb4-b8c0-909185d5d3fc,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1517,"POLYGON ((-2326113.743 2296469.316, -2321113.7...",704ffed9-5e22-4784-9bf1-8c05cfc3fc22,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1518,"POLYGON ((-2321113.743 2296469.316, -2316113.7...",fede4d74-8f19-4acf-ba80-61b5e58676e4,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...


### Visualizing the results


In [24]:
# Tiles for which a complete product was found, with the selected product for each
best_products_client.successful_results

{'10SEJ': {'id': 'S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337',
  'cloud_cover': 0.000209,
  'no_data': 0.221346},
 '10SDJ': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346',
  'cloud_cover': 0.006778,
  'no_data': 1.7e-05},
 '10TDK': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412',
  'cloud_cover': 0.000707,
  'no_data': 1e-05}}

In [25]:
# We do, however, have one S2 grid tile missing.
# No complete products were found for that tile.
# Therefore, it will have to be processed separately later,
# by mosaicking different products together.
#
# The consequence of this is that all vector polygons that are
# within this tile will not be processed at this time.

best_products_client.incomplete_results

['10TEK']

In [26]:
m_best_results = leafmap.Map(center=[39.7, -123], zoom=8)

# S2 tile subset, in red
m_best_results.add_gdf(
    s2_grid_subset,
    layer_name="s2_tiles",
    style={"color": "red"},
)

# Vector polygons returned by the best-product selection, in green
m_best_results.add_gdf(
    best_results,
    layer_name="vector_tiles_s2_grid",
    style={"color": "green"},
)

m_best_results

Map(center=[39.7, -123], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out…

In [27]:
# One row per selected S2 product, with the list of `feature_id`s assigned to it
group_by_product = best_results.groupby("best_s2_product_id")["feature_id"].agg(list).reset_index()
group_by_product

Unnamed: 0,best_s2_product_id,feature_id
0,S2A_MSIL2A_20240705T185921_R013_T10SDJ_2024070...,"[3965df09-34af-455c-bcb7-8faa56bdcee2, bd96455..."
1,S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...,"[428bed4f-05ac-4357-8bda-2317e50337dc, 2517904..."
2,S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...,"[bb4e47a5-7c45-4df8-9345-05e97aeaab3b, 2b092e1..."


In [28]:
# Ids of the S2 products that need to be downloaded (one per row of `group_by_product`)
product_list = group_by_product["best_s2_product_id"].tolist()
product_list

['S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346',
 'S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412',
 'S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337']

### Downloading and processing Sentinel 2 products

#### Downloading and preparing Sentinel 2 products

In [29]:
bands = ["B02", "B03", "B04", "B08", "visual"]
download_directory = DATA_DIR / "example_s2_download_and_processing"

# One processed asset per selected product; 5070 is the same EPSG code used
# for the vector layers of this notebook.
product_asset_list = [
    download_and_process_sentinel2_asset(
        product_id=product_id,
        product_bands=bands,
        base_directory=download_directory,
        target_projection=5070,
    )
    for product_id in product_list
]

In [30]:
from geospatial_tools.raster import clip_raster_with_polygon
from geospatial_tools.stac import Asset

In [31]:
# List the id and reprojected file path of every processed asset
for asset in product_asset_list:
    print(f"Asset ID : [{asset.asset_id}]")
    print(f"Reprojected ID path : \n[{asset.reprojected_asset_path}]\n")


In [32]:
# For convenience, a new Asset object is built by hand from the id and
# reprojected file printed by the previous cell.

product = Asset(
    asset_id="S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346",
    reprojected_asset=download_directory / "S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_reprojected.tif",
)

#### Creating a new geodataframe of all the vector polygons that are within our selected product

In [33]:
s2_product_id = product.asset_id
product_path = product.reprojected_asset_path

# Feature ids grouped under the selected product.
# `.iloc[0]` takes the first matching row by position: after the boolean filter the
# row keeps its original index label, so a label-based `[0]` lookup would raise a
# KeyError for any product that is not stored at index label 0 of `group_by_product`.
matching_products = group_by_product[group_by_product["best_s2_product_id"] == s2_product_id]
feature_ids = matching_products["feature_id"].iloc[0]

# Vector polygons assigned to the selected product, written to their own file
vector_features = best_results[best_results["feature_id"].isin(feature_ids)]
vector_features_path = DATA_DIR / "vector_features.gpkg"
to_geopackage(vector_features, vector_features_path)

vector_features

#### Creating our Sentinel 2 "chips" by clipping main products with our vector polygon grid

In [34]:
# Clip the reprojected product with the vector polygons written to `vector_features_path`.
# NOTE(review): `s2_tile_id` receives the full product id, not a tile id such as "10SDJ" —
# confirm this is what `clip_raster_with_polygon` expects.
clip_raster_with_polygon(raster_image=product_path, 
                         polygon_layer=vector_features_path, 
                         s2_tile_id=s2_product_id,
                         output_path=download_directory / "test_sentinel2_clip")