In [2]:
import leafmap
import geopandas as gpd
from geospatial_tools import DATA_DIR

## Base data

The USA polygon is based on the 2018 `cb_2018_us_nation_5m` shapefile, taken from here: 
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

It was then processed using QGIS to keep only the contiguous states, without any islands.

The Sentinel 2 grid was taken from the kml file found here: 
https://sentiwiki.copernicus.eu/web/s2-products

It was then processed using QGIS to keep only the grid cells that overlap with the 
contiguous states, meaning the polygon layer which is described just above.

Since our area of study is quite large, the `EPSG:5070` projection was chosen, as it
covers the whole area and introduces minimal distortion while preserving area.

In [3]:
# GeoPackage inputs stored under DATA_DIR/usa; the "_5070" suffix matches the
# EPSG:5070 projection chosen for this study.
USA_POLYGON_FILE = DATA_DIR.joinpath("usa/usa_polygon_5070.gpkg")
S2_USA_GRID_FILE = DATA_DIR.joinpath("usa/s2_grid_usa_polygon_5070.gpkg")

In [4]:
# Load both vector layers from their GeoPackage files
usa_polygon = gpd.read_file(USA_POLYGON_FILE)  # contiguous USA outline
s2_grid = gpd.read_file(S2_USA_GRID_FILE)  # Sentinel 2 tiling grid cells overlapping the outline

In [4]:
# Display the USA polygon layer to check that it loaded as expected
usa_polygon

Unnamed: 0,AFFGEOID,GEOID,NAME,geometry
0,0100000US,US,United States,"MULTIPOLYGON (((-2123555.702 3120381.564, -212..."


In [5]:
# Display the Sentinel 2 grid layer; the "name" column holds the tile identifiers used further down
s2_grid

Unnamed: 0,name,folders,description,altitude,alt_mode,time_begin,time_end,time_when,geometry
0,12TUP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1386334.944 2487548.770 0.0...
1,12TYQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-976300.478 2523767.452 0.00...
2,12TYR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-960099.705 2622374.255 0.00...
3,12TYN,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1008622.024 2325748.358 0.0...
4,12TYP,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-992478.385 2424861.340 0.00...
...,...,...,...,...,...,...,...,...,...
977,12TTM,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1515431.586 2304192.826 0.0...
978,12TUK,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1448525.813 2089886.667 0.0...
979,12TUQ,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1371006.917 2586590.133 0.0...
980,12TUR,Features,TILE PROPERTIES<br><table border=0 cellpadding...,0.0,,,,,MULTIPOLYGON Z (((-1355793.563 2685354.080 0.0...


In [6]:
# Interactive map centred on the contiguous USA
m = leafmap.Map(center=[40, -98], zoom=4)

# `layer_name` is the add_gdf keyword that names a layer; the previous `layer=`
# keyword is not part of its signature, so both layers kept the default name.

# In blue, the USA polygon
m.add_gdf(usa_polygon, layer_name='usa')
# In red, the Sentinel 2 grid
m.add_gdf(s2_grid, layer_name='s2_grid', style={"color": "red"})

m

Map(center=[40, -98], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out_te…

## Creating our inference grid

From this, we want to create a grid of square polygons with which we will later on
query the [Planetary Computer](https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a)
Sentinel 2 dataset and clip the selected Sentinel 2 images.

In [7]:
import time
from geospatial_tools.vector import create_vector_grid_parallel, to_geopackage, select_polygons_by_location
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor

In [8]:
# Size passed to the grid builder for each cell, expressed in CRS units
# (presumably metres for EPSG:5070 — TODO confirm)
grid_size = 5000
# Extent of the USA polygon, used as the bounding box of the grid
bbox = usa_polygon.total_bounds

In [9]:
# NOTE(review): this cell is left commented out — presumably because building the grid
# is slow and only needs to happen once; confirm before re-enabling.
# When enabled it builds the grid over `bbox` with `grid_size` cells in EPSG:5070,
# reports the elapsed time and saves the result to "polygon_grid.gpkg".
# start = time.time()
# print("Starting processing for [create_vector_grid_parallel]")
# grid_parallel = create_vector_grid_parallel(bounding_box=bbox, grid_size=grid_size, crs="EPSG:5070")
# stop = time.time()
# print(f"Printing len(grid_parallel) to check if grid contains same amount of polygons : {len(grid_parallel)}")
# print(f"Time taken to create parallel grid: {stop - start}")
# to_geopackage(gdf=grid_parallel, filename="polygon_grid.gpkg")

### Selecting the useful polygons

Now, since our grid was created using the extent of our input polygon (continental USA), we need to filter out the polygons that do not intersect with it.

Doing this in Python is not the most efficient way to do things, but since it's a step that shouldn't be done over and over, it's not that critical.

If you ever need to do this step efficiently because the data is too big or too complex, it would be better to go through QGIS, PyQGIS, GDAL or 
some other more efficient tool for this operation. 

In [10]:
# NOTE(review): this cell is left commented out and needs `grid_parallel` from the
# (also commented) grid creation cell above; enable both together.
# start = time.time()
# print("Starting intersect selection using for loop")
# intersecting_polygons = select_polygons_by_location(grid_parallel, usa_polygon)
# stop = time.time()
# print(f"Time taken to intersect using for loop: {stop - start}")
# # Optionally, save to a new file
# TODO(review): the optional save step announced on the line above was never written.

### Visualizing the selected polygons

This will take more or less time, depending on the number of polygons. 

In [11]:
# NOTE(review): left commented out because `intersecting_polygons` only exists when the
# selection cell above is enabled.
# m.add_gdf(intersecting_polygons, layer='intersecting_polygons', style={"color": "blue"})
# m

## Making a list of all the S2 tiling grids we will be using

Now, we need to build our S2 tile grid list

In [5]:
# Collect the tile identifier of every grid cell; these values are later matched
# against the `s2:mgrs_tile` property when querying the STAC catalog.
s2_tile_grid_list = s2_grid["name"].to_list()
s2_tile_grid_list

['12TUP',
 '12TYQ',
 '12TYR',
 '12TYN',
 '12TYP',
 '12TYS',
 '12TYT',
 '11SMB',
 '11SMC',
 '11SLV',
 '11SMA',
 '11SMS',
 '12UUV',
 '11SMT',
 '11SMD',
 '11SMR',
 '12UUU',
 '11SNA',
 '12TWS',
 '11SNB',
 '12TWT',
 '11SMU',
 '12TWQ',
 '11SMV',
 '12TWR',
 '12TXM',
 '11SNS',
 '12TXN',
 '11SNC',
 '12TXK',
 '11SND',
 '12TXL',
 '11SKD',
 '12TXR',
 '12TXS',
 '11SKB',
 '12TXP',
 '11SKC',
 '12TXQ',
 '11SKU',
 '12TYL',
 '11SKV',
 '12TYM',
 '12TXT',
 '11SKT',
 '12TYK',
 '11SLC',
 '11SLD',
 '11SLA',
 '11SLB',
 '11SLT',
 '11SLU',
 '19TBF',
 '19TBG',
 '19TDM',
 '19TDN',
 '19TEJ',
 '19TEK',
 '19TEN',
 '19TEL',
 '19TEM',
 '18STE',
 '18STF',
 '19TCG',
 '18STC',
 '19TCH',
 '18STD',
 '18STJ',
 '19TCF',
 '19TCL',
 '18STG',
 '19TCM',
 '18STH',
 '19TCJ',
 '19TCK',
 '19TDF',
 '19TDG',
 '19TDK',
 '19TDL',
 '19TDJ',
 '16TCQ',
 '16TCR',
 '16TCN',
 '16TCP',
 '16TDK',
 '16TDL',
 '16TCS',
 '16TCT',
 '16TDP',
 '15RWQ',
 '16TDQ',
 '16TDM',
 '15RXN',
 '16TDN',
 '15RXP',
 '16TDT',
 '16TEK',
 '16TDR',
 '16TDS',
 '16SGB',


## Exploring S2 STAC catalog tools

In [2]:
from pathlib import Path
from geospatial_tools import DATA_DIR
from geospatial_tools.stac import StacSearch, PLANETARY_COMPUTER
from geospatial_tools.utils import create_date_range_for_specific_period
from geospatial_tools.raster import reproject_raster

In [3]:

# Search window handed to the date-range helper (presumably months 6 to 8 of each
# year from 2020 to 2024 — confirm against create_date_range_for_specific_period)
start_year = 2020
end_year = 2024
start_month = 6
end_month = 8
date_ranges = create_date_range_for_specific_period(
    start_year=start_year,
    end_year=end_year,
    start_month_range=start_month,
    end_month_range=end_month,
)

search_client = StacSearch(PLANETARY_COMPUTER)

# Restrict the search to a single tile and to items whose `eo:cloud_cover` is below 1,
# asking the API to return the lowest cloud cover first
collection = "sentinel-2-l2a"
tile_ids = ["10SGE"]
query = {
    "eo:cloud_cover": {"lt": 1},
    "s2:mgrs_tile": {"in": tile_ids},
}
sortby = [{"field": "properties.eo:cloud_cover", "direction": "asc"}]

results = search_client.stac_api_search_for_date_ranges(
    date_ranges=date_ranges,
    collections=collection,
    query=query,
    sortby=sortby,
    max_items=20,
)

# The first entry of the cloud-coverage ranking is kept as the best candidate
sorted_items = search_client.sort_results_by_cloud_coverage()
optimal_result = sorted_items[0]

print(f"Optimal result: {optimal_result.id}, {optimal_result.datetime}, {optimal_result.properties['eo:cloud_cover']}")

[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200827T184919_R113_T10SGE_20200907T081032, 2020-08-27 18:49:19.024000+00:00, 0.00465
[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200814T183919_R070_T10SGE_20200815T163624, 2020-08-14 18:39:19.024000+00:00, 0.090959
[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200807T184919_R113_T10SGE_20200815T094940, 2020-08-07 18:49:19.024000+00:00, 0.016416
[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200728T184919_R113_T10SGE_20200817T225951, 2020-07-28 18:49:19.024000+00:00, 0.014629
[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200718T184919_R113_T10SGE_20200816T163002, 2020-07-18 18:49:19.024000+00:00, 0.025615
[2024-08-02 12:19:42] INFO       [MainThread][geospatial_tools.stac] S2B_MSIL2A_20200715T183919_R070_T10SGE_20200912T054044, 2020-07-15 18:39:19.024000+00:00, 0.

In [4]:
# Bands to download for the selected item, and the directory that receives the files
bands = ["B02", "B03", "B04", "B08", "visual"]
file_base_path = Path(f"{DATA_DIR}/sentinel-2/test")

best_result = search_client.download_best_cloud_cover_results(
    bands=bands,
    base_directory=file_base_path,
)
best_result

[2024-08-02 12:19:46] INFO       [MainThread][geospatial_tools.stac] Downloading [S2B_MSIL2A_20220827T184919_R113_T10SGE_20220829T092449] ...
[2024-08-02 12:19:46] INFO       [MainThread][geospatial_tools.stac] Downloading B02 from https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/10/S/GE/2022/08/27/S2B_MSIL2A_20220827T184919_N0400_R113_T10SGE_20220829T092449.SAFE/GRANULE/L2A_T10SGE_A028596_20220827T190158/IMG_DATA/R10m/T10SGE_20220827T184919_B02_10m.tif?st=2024-08-01T16%3A19%3A42Z&se=2024-08-02T17%3A04%3A42Z&sp=rl&sv=2024-05-04&sr=c&skoid=9c8ff44a-6a2c-4dfb-b298-1c9212f64d9a&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2024-08-02T05%3A16%3A20Z&ske=2024-08-09T05%3A16%3A20Z&sks=b&skv=2024-05-04&sig=qFzOMTRZ2sI2MFptwzQUW9fG7iczz2K9zXpvb8CBcns%3D
[2024-08-02 12:19:49] INFO       [MainThread][geospatial_tools.utils] Downloaded /home/dev/projects/geospatial-tools/data/sentinel-2/test/S2B_MSIL2A_20220827T184919_R113_T10SGE_20220829T092449_B02.tif successfully.
[2024-08-02 12:19:49]

<geospatial_tools.stac.Asset at 0x7d9fe8d1cc90>

In [16]:
# Log the downloaded files of the asset (ID, band and filename per entry, as seen in the output)
best_result.show_asset_items()

[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] Asset list for asset [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858] : 
['ID: [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858], Band: [B02], filename: [/home/dev/projects/geospatial-tools/data/sentinel-2/test/S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_B02.tif]', 'ID: [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858], Band: [B03], filename: [/home/dev/projects/geospatial-tools/data/sentinel-2/test/S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_B03.tif]', 'ID: [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858], Band: [B04], filename: [/home/dev/projects/geospatial-tools/data/sentinel-2/test/S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_B04.tif]', 'ID: [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858], Band: [B08], filename: [/home/dev/projects/geospatial-tools/data/sentinel-2/test/S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_B08.tif]', 'ID: [S2A

In [17]:
# Merge the downloaded band files into a single raster; the call returns the path of the merged file.
# `delete_sub_items=True` presumably removes the per-band files afterwards — confirm in geospatial_tools.stac.
merged = best_result.merge_asset(delete_sub_items=True)
merged

[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] Calculated a total of [7] bands
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] 7
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] Creating merged asset metadata
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] Merging asset [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858] ...
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] Writing band image: S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] writing asset sub item band 1
[2024-08-01 16:59:18] INFO       [MainThread][geospatial_tools.stac] writing merged index band 1
[2024-08-01 16:59:19] INFO       [MainThread][geospatial_tools.stac] Writing band image: S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858
[2024-08-01 16:59:19] INFO       [MainThread][geospatial_tools.stac] writing asset sub item band 1
[

PosixPath('S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_merged.tif')

In [18]:
# Reproject the merged raster to EPSG:5070, the projection used by the vector layers.
# With `delete_merged_asset=True` the intermediate merged file is deleted (see the log output).
reprojected = best_result.reproject_merged_asset(target_projection=5070, delete_merged_asset=True)
reprojected

[2024-08-01 16:59:24] INFO       [MainThread][geospatial_tools.stac] Reprojecting asset [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858] ...
[2024-08-01 16:59:24] INFO       [MainThread][geospatial_tools.stac] Creating EPSG code from following input : [5070]
[2024-08-01 16:59:45] INFO       [MainThread][geospatial_tools.stac] Reprojected file created at S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_reprojected.tif
[2024-08-01 16:59:45] INFO       [MainThread][geospatial_tools.stac] Asset location : [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_reprojected.tif]
[2024-08-01 16:59:45] INFO       [MainThread][geospatial_tools.stac] Deleting merged asset file for [S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_merged.tif]


PosixPath('S2A_MSIL2A_20220723T184931_R113_T10SGE_20220725T180858_reprojected.tif')

In [7]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

# Finding the best image for each S2 tiling grid

In [8]:
def process_tile(
    tile,
    start_year=2021,
    end_year=2023,
    start_month=6,
    end_month=8,
    max_cloud_cover=10,
    max_items=30,
):
    """
    Look up the best Sentinel-2 L2A item for one tile on the Planetary Computer.

    The function never raises: it is meant to be submitted to an executor, where an
    exception escaping from a single tile would abort the loop collecting the results.

    Parameters
    ----------
    tile
        Tile identifier matched against the `s2:mgrs_tile` property (e.g. "12TUP").
    start_year, end_year
        Year range forwarded to `create_date_range_for_specific_period`.
    start_month, end_month
        Month range forwarded to `create_date_range_for_specific_period`.
    max_cloud_cover
        Exclusive upper bound applied to the `eo:cloud_cover` property.
    max_items
        Value forwarded as `max_items` to the STAC search.

    Returns
    -------
    tuple
        `(tile, item_id)` on success, where `item_id` is the id of the first item
        returned by `sort_results_by_cloud_coverage()` (presumably the least cloudy
        one — confirm in geospatial_tools.stac), or `(tile, "error: <reason>")` when
        the search fails or finds nothing.
    """
    collection = "sentinel-2-l2a"
    query = {"eo:cloud_cover": {"lt": max_cloud_cover}, "s2:mgrs_tile": {"in": [tile]}}
    sortby = [{"field": "properties.eo:cloud_cover", "direction": "asc"}]

    try:
        date_ranges = create_date_range_for_specific_period(
            start_year=start_year, end_year=end_year, start_month_range=start_month, end_month_range=end_month
        )

        # One client per call: the sorting step below takes no arguments, so the search
        # results appear to be kept on the client itself.
        search_client = StacSearch(PLANETARY_COMPUTER)
        search_client.stac_api_search_for_date_ranges(
            date_ranges=date_ranges, collections=collection, query=query, sortby=sortby, max_items=max_items
        )

        sorted_items = search_client.sort_results_by_cloud_coverage()
        if not sorted_items:
            # Report an empty search explicitly instead of an opaque indexing error
            raise LookupError(f"no item found for tile {tile}")
        return tile, sorted_items[0].id
    except Exception as error:
        # Deliberately broad: any failure is reported through the return value so the
        # remaining tiles can still be processed.
        print(f"[{tile}] {error}")
        return tile, f"error: {error}"
    

# File receiving the tile -> item id mapping
output_json_file = 'data_lt10cv.json'

# Dictionary to store results: every tile starts with an empty value so that a tile
# that was never processed stays visible in the output
tile_dict = dict.fromkeys(s2_tile_grid_list, "")
tile_dict["errors"] = []

# Use ThreadPoolExecutor to process tiles in parallel
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_tile = {executor.submit(process_tile, tile): tile for tile in s2_tile_grid_list}

    for future in as_completed(future_to_tile):
        tile = future_to_tile[future]
        try:
            _, optimal_result_id = future.result()
        except Exception as error:
            # future.result() re-raises whatever the worker raised; record it as an error
            # for this tile so the results already collected still get written
            optimal_result_id = f"error: {error}"
        tile_dict[tile] = optimal_result_id
        if optimal_result_id.startswith("error:"):
            tile_dict["errors"].append(tile)

with open(output_json_file, 'w', encoding="utf-8") as json_file:
    json.dump(tile_dict, json_file, indent=4)

# Report the file that was actually written
print(f"Dictionary has been written to {output_json_file}")


[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210823T181919_R127_T12TUP_20210824T050919, 2021-08-23 18:19:19.024000+00:00, 0.205155
[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210813T181919_R127_T12TUP_20210814T072758, 2021-08-13 18:19:19.024000+00:00, 0.018086
[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210803T181919_R127_T12TUP_20210804T084440, 2021-08-03 18:19:19.024000+00:00, 0.060573
[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210724T181919_R127_T12TUP_20210725T073443, 2021-07-24 18:19:19.024000+00:00, 0.021798
[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210717T182919_R027_T12TUP_20210718T043012, 2021-07-17 18:29:19.024000+00:00, 0.036725
[2024-08-02 09:17:29] INFO       [ThreadPoolExecutor-0_0][geospatial_tools.stac] S2B_MSIL2A_20210714