In [None]:
import leafmap
import geopandas as gpd
from geospatial_tools import DATA_DIR
from geospatial_tools.planetary_computer.sentinel_2 import BestProductsForFeatures, download_and_process_sentinel2_asset
from geospatial_tools.raster import clip_raster_with_polygon
from geospatial_tools.stac import Asset
from geospatial_tools.utils import get_yaml_config, download_url, unzip_file
from geospatial_tools.vector import create_vector_grid_parallel, to_geopackage, select_polygons_by_location

## Base data

The USA polygon is base off 2018's `cb_2018_us_nation_20m` shapefile, taken from here: 
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

It was then processed using QGIS to keep only the contiguous states, without any islands.

The Sentinel 2 grid was taken from the kml file found here: 
https://sentiwiki.copernicus.eu/web/s2-products

Below is some code to help with the download part.

### Downloading data
Let's download our source data

In [None]:
file_configs = get_yaml_config("data_file_links")
raw_usa_polygon_path = file_configs["united_states_polygon"]["url"]
raw_s2_tiling_grid_path = file_configs["sentinel_2_tiling_grid"]["url"]
download_list = {"raw_usa_polygon": raw_usa_polygon_path, "raw_s2_tiling_grid": raw_s2_tiling_grid_path}
file_list = [download_url(url=url, filename=f"{DATA_DIR}/{key}.zip") for key, url in download_list.items()]

file_list



In [None]:
[unzip_file(zip_path=f, extract_to=DATA_DIR) for f in file_list]

### Initial pre-processing

The above layers were processed using QGIS.

For the purpose of this analysis, only the contiguous lower 48 states have been conserved; smaller islands/land masses 
have also been striped.

The S2 tiling grid has been trimmed to keep only the grid cells that overlap with the 
contiguous states.

Since our area of study is quite large, the `EPSG:5070` projection was chosen, as it
covers the whole area, introduces minimal distortion while preserving area.

The files below have also been saved in this repository.

In [None]:
USA_POLYGON_FILE = DATA_DIR / "usa_polygon_5070.gpkg"
S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070.gpkg"
usa_polygon = gpd.read_file(USA_POLYGON_FILE)
s2_grid = gpd.read_file(S2_USA_GRID_FILE)

In [None]:
usa_polygon

In [None]:
s2_grid

In [None]:
m = leafmap.Map(center=[40, -98], zoom=4)

# In blue, the USA polygon
m.add_gdf(usa_polygon, layer_name='usa')
# In red, the Sentinel 2 grid
m.add_gdf(s2_grid, layer_name='s2_grid', style={"color": "red"})

m

## Creating our grid

From this, we want to create a grid of square polygons with which we will later on
query the [Planetary Computer](https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a)
Sentinel 2 dataset and clip the selected Sentinel 2 images.

For the purpose of this notebook, the grid that will be created will use 10km by 10km squares to speed up 
processing.

In [None]:
grid_size = 5000
bbox = usa_polygon.total_bounds
grid_5km_filename = DATA_DIR / "polygon_grid_5km.gpkg"

In [None]:
print("Starting processing for [create_vector_grid_parallel]")
grid_5km = create_vector_grid_parallel(bounding_box=bbox, grid_size=grid_size, crs="EPSG:5070")
print(f"Printing len(grid_parallel) to check if grid contains same amount of polygons : {len(grid_5km)}")
to_geopackage(gdf=grid_5km, filename=grid_5km_filename)

In [None]:
grid_5km

### Selecting the useful polygons

Now, since our grid was created using the extent of our input polygon (continental USA), we need to filter out the polygons that do not intersect with it.

Doing this in Python is not the most efficient way to do things, but since it's a step that shouldn't be done over and over, it's not that critical.

If ever you need to do this step in an efficient way because the data is just too big or too complex, it would be better off going through QGIS, PyGQIS, GDAL or 
some other more efficient way to do this operation. 

In [None]:
intersecting_polygons_filename = DATA_DIR / "intersecting_polygon_grid_5km.gpkg"
print("Starting intersect selection")
intersecting_polygons = select_polygons_by_location(grid_5km, usa_polygon, num_of_workers=4)

# Optionally save to a new file
to_geopackage(intersecting_polygons, intersecting_polygons_filename)

In [None]:
intersecting_polygons

### Visualizing the selected polygons

This will take more or less time, depending on the number on polygons.

(Do not try this with a grid size smaller than 10000m)

In [None]:
### This takes a few minutes and navigation will be slow.
# Map is zoomed in to help with processing

m_intersecting_polygons = leafmap.Map(center=[39.7, -123], zoom=10)
m_intersecting_polygons.add_gdf(intersecting_polygons, layer_name='intersecting_polygons', style={"color": "blue"})
m_intersecting_polygons

## Data processing pipeline prototype

### Finding the best image for each S2 tiling grid

In [None]:
# This is the full list of S2 grids
s2_tile_grid_list = s2_grid["name"].to_list()
s2_tile_grid_list

In [None]:
# The list is a bit long, so we'll be continuing this notebook with the following subset
s2_tile_grid_subset_list = ["10TDK", "10TEK", "10SEJ", "10SDJ"]

### Building our processing list

First, let's make create a subselection of our dataset

In [None]:
S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070.gpkg"
s2_grid = gpd.read_file(S2_USA_GRID_FILE)
s2_grid

In [None]:
# Creating our S2 grid tile subset

S2_USA_GRID_SUBSET_FILE = DATA_DIR / "s2_grid_usa_polygon_5070_subset.gpkg"
s2_grid_subset = s2_grid[s2_grid["name"].isin(s2_tile_grid_subset_list)]

# Optionally save to geopackage
to_geopackage(gdf=s2_grid_subset, filename=S2_USA_GRID_SUBSET_FILE)

In [None]:
s2_grid_subset

In [None]:
# Creating our polygon grid subset

intersect_polygons_subset_filename = DATA_DIR / "intersecting_polygon_grid_5km_subset.gpkg"
print("Starting intersect selection")
intersect_polygons_subset = select_polygons_by_location(intersecting_polygons, s2_grid_subset, predicate="within", num_of_workers=4)
# intersect_polygons_subset = gpd.read_file(intersect_polygons_subset_filename)

# Optionally save to a new file
to_geopackage(intersect_polygons_subset, intersect_polygons_subset_filename)

In [None]:
intersect_polygons_subset

### Finding the best products for our subset use case

In [None]:
# `s2_feature_name_columns` is the name of the column in `s2_grid_subset` where the id of
# the different tiles is found.
#
# `vector_column_name` is the name of the column in which the best results will be stored

s2_feature_name_columns = "name"
vector_column_name = "s2_tiles"

# Initiating our client
best_products_client = BestProductsForFeatures(sentinel2_tiling_grid=s2_grid_subset,
                                               sentinel2_tiling_grid_column=s2_feature_name_columns,
                                               vector_features=intersect_polygons_subset,
                                               vector_features_column=vector_column_name,
                                               max_cloud_cover=15)

In [None]:
# Executing the search
#
# This search look only for complete products, meaning products with less than
# 5 percent of nodata.

start_year = 2023
end_year = 2024
start_month = 6
end_month = 7

best_products_client.create_date_ranges(start_year, end_year, start_month, end_month)
products = best_products_client.find_best_complete_products()
products

In [23]:
# Selecting the best products for each vector tile
# This step is necessary as some of our vector polygons can be withing multiple S2 tiles.
# The best available S2 tile is therefore selected for each vector polygon.

best_results_path = DATA_DIR / "vector_tiles_with_s2tiles_subset.gpkg"
best_results = best_products_client.select_best_products_per_feature()
to_geopackage(best_results, best_results_path)

[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Creating temporary UUID field for join operations
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Starting process to find and identify contained features using spatial 'within' join operation
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Grouping results
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Cleaning and merging results
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Spatial join operation is completed
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.planetary_computer.sentinel_2] Writing best product IDs to dataframe
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-18 11:11:54] INFO       [MainThread][geospatial_tools.vector] File [/home/francispelletier/projects/geospatial_tools/data/vector_tiles_with_s2tiles_subset.gpkg] took 0.0313265323638

Unnamed: 0,geometry,feature_id,s2_tiles,best_s2_product_id
0,"POLYGON ((-2206113.743 2051469.316, -2201113.7...",5050b648-9b80-4d80-bd14-0b97a7e85e62,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
1,"POLYGON ((-2201113.743 2051469.316, -2196113.7...",a6f061b9-d63d-42fb-bc6a-b369aa7d6b84,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
2,"POLYGON ((-2196113.743 2051469.316, -2191113.7...",ee539417-f1f7-477c-917d-5d368956d0dc,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
3,"POLYGON ((-2221113.743 2056469.316, -2216113.7...",9cba3c1c-b226-422f-a771-2f9652cd2533,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
4,"POLYGON ((-2216113.743 2056469.316, -2211113.7...",aace4287-54da-48e5-84b2-b0d50aaae94d,[10SEJ],S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...
...,...,...,...,...
1515,"POLYGON ((-2306113.743 2291469.316, -2301113.7...",4625b360-b817-41f2-89c4-adc2ccfdfb05,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1516,"POLYGON ((-2301113.743 2291469.316, -2296113.7...",bbfcc13a-09af-4f70-bf2a-d47329c9ebdc,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1517,"POLYGON ((-2326113.743 2296469.316, -2321113.7...",0d2b91c0-65a6-476c-88b5-9f5ebbcda181,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...
1518,"POLYGON ((-2321113.743 2296469.316, -2316113.7...",5dd0b974-b735-4986-9c35-f52429eb8037,[10TDK],S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...


In [None]:
best_results

### Visualizing the results


In [24]:
best_products_client.successful_results

{'10SEJ': {'id': 'S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337',
  'cloud_cover': 0.000209,
  'no_data': 0.221346},
 '10SDJ': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346',
  'cloud_cover': 0.006778,
  'no_data': 1.7e-05},
 '10TDK': {'id': 'S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412',
  'cloud_cover': 0.000707,
  'no_data': 1e-05}}

In [25]:
# We do, however, have one S2 grid missing
# No complete products where found for that tile.
# Therefore, it will have to be processed separately later, 
# by mosaicing different products together
#
# The consequence of this is that all vector polygons that are
# within this tile will not be processed at this time.

best_products_client.incomplete_results

['10TEK']

In [26]:
m_best_results = leafmap.Map(center=[39.7, -123], zoom=8)
m_best_results.add_gdf(s2_grid_subset, layer_name='s2_tiles', style={"color": "red"})
m_best_results.add_gdf(best_results, layer_name='vector_tiles_s2_grid', style={"color": "green"})
m_best_results

Map(center=[39.7, -123], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_out…

In [27]:
group_by_product = best_results.groupby("best_s2_product_id")["feature_id"].agg(list).reset_index()
group_by_product

Unnamed: 0,best_s2_product_id,feature_id
0,S2A_MSIL2A_20240705T185921_R013_T10SDJ_2024070...,"[6ee9ef65-f3ef-482a-89bd-04d4831adacb, 6b07adf..."
1,S2A_MSIL2A_20240705T185921_R013_T10TDK_2024070...,"[30285c16-5a1c-4441-a969-5bf1ea75af6b, 0d8d105..."
2,S2B_MSIL2A_20230703T184919_R113_T10SEJ_2023070...,"[5050b648-9b80-4d80-bd14-0b97a7e85e62, a6f061b..."


In [28]:
product_list = group_by_product["best_s2_product_id"].tolist()
product_list

['S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346',
 'S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412',
 'S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337']

### Downloading and processing Sentinel 2 products

#### Downloading and preparing Sentinel 2 products

In [29]:
product_asset_list = []
bands = ["B02", "B03", "B04", "B08", "visual"]
download_directory = DATA_DIR / "example_s2_download_and_processing"

for p in product_list:
    processed_product = download_and_process_sentinel2_asset(product_id=p, 
                                                             product_bands=bands,
                                                             base_directory=download_directory,
                                                             target_projection=5070)
                                                    
    product_asset_list.append(processed_product)

[2024-09-18 11:11:58] INFO       [MainThread][geospatial_tools.planetary_computer.sentinel_2] Reprojected file [/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_reprojected.tif] already exists
[2024-09-18 11:11:58] INFO       [MainThread][geospatial_tools.planetary_computer.sentinel_2] Reprojected file [/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412_reprojected.tif] already exists
[2024-09-18 11:11:58] INFO       [MainThread][geospatial_tools.planetary_computer.sentinel_2] Reprojected file [/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337_reprojected.tif] already exists


In [31]:
for p in product_asset_list:
    print(f"Asset ID : [{p.asset_id}]")
    print(f"Reprojected ID path : \n[{p.reprojected_asset_path}]\n")


Asset ID : [S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346]
Reprojected ID path : 
[/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_reprojected.tif]

Asset ID : [S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412]
Reprojected ID path : 
[/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2A_MSIL2A_20240705T185921_R013_T10TDK_20240706T050412_reprojected.tif]

Asset ID : [S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337]
Reprojected ID path : 
[/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/S2B_MSIL2A_20230703T184919_R113_T10SEJ_20230704T010337_reprojected.tif]



In [32]:
# Here, we are creating a new Asset object simply for convenience, from the printed outputs above

product = Asset(asset_id="S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346",
                reprojected_asset=download_directory / "S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_reprojected.tif", )

#### Creating a new geodataframe of all the vector polygons that are within our selected product

In [33]:
s2_product_id = product.asset_id
product_path = product.reprojected_asset_path
product_id_series = group_by_product[group_by_product["best_s2_product_id"] == s2_product_id]
# Since it's grouped by product id, there should always be only one row in the series
feature_ids = product_id_series["feature_id"].iloc[0]
vector_features = best_results[best_results["feature_id"].isin(feature_ids)]
vector_features_path = DATA_DIR / "vector_features.gpkg"
to_geopackage(vector_features, DATA_DIR / "vector_features.gpkg")

print(vector_features)

[2024-09-18 11:12:09] INFO       [MainThread][geospatial_tools.vector] Starting writing process
[2024-09-18 11:12:09] INFO       [MainThread][geospatial_tools.vector] File [/home/francispelletier/projects/geospatial_tools/data/vector_features.gpkg] took 0.023821353912353516 seconds to write.
                                              geometry  \
75   POLYGON ((-2311113.743 2081469.316, -2306113.7...   
76   POLYGON ((-2306113.743 2081469.316, -2301113.7...   
77   POLYGON ((-2301113.743 2081469.316, -2296113.7...   
78   POLYGON ((-2296113.743 2081469.316, -2291113.7...   
79   POLYGON ((-2291113.743 2081469.316, -2286113.7...   
..                                                 ...   
814  POLYGON ((-2331113.743 2186469.316, -2326113.7...   
815  POLYGON ((-2326113.743 2186469.316, -2321113.7...   
816  POLYGON ((-2321113.743 2186469.316, -2316113.7...   
817  POLYGON ((-2316113.743 2186469.316, -2311113.7...   
850  POLYGON ((-2331113.743 2191469.316, -2326113.7...   

          

#### Creating our Sentinel 2 "chips" by clipping main products with our vector polygon grid

In [34]:
clip_raster_with_polygon(raster_image=product_path,
                         polygon_layer=vector_features_path,
                         base_output_filename=s2_product_id,
                         output_dir=download_directory / "test_sentinel2_clip")

[2024-09-18 11:12:13] INFO       [MainThread][geospatial_tools.raster] Number of workers used: 16
[2024-09-18 11:12:13] INFO       [MainThread][geospatial_tools.raster] Output path : [/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip]
[2024-09-18 11:12:13] INFO       [MainThread][geospatial_tools.raster] Clipping raster image with 285 polygons
[2024-09-18 11:12:19] INFO       [MainThread][geospatial_tools.raster] Clipping process finished


[PosixPath('/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_clipped_3.tif'),
 PosixPath('/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_clipped_1.tif'),
 PosixPath('/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_clipped_0.tif'),
 PosixPath('/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_clipped_10.tif'),
 PosixPath('/home/francispelletier/projects/geospatial_tools/data/example_s2_download_and_processing/test_sentinel2_clip/S2A_MSIL2A_20240705T185921_R013_T10SDJ_20240706T050346_clipped_11.tif'),
 PosixPath('/home/francispelletie