In [3]:
import pandas as pd
import pyarrow.parquet as pq
from majortom import *

In [5]:
# Load the upstream Major-TOM metadata table and parse the acquisition timestamps.
df = pq.read_table('../data/Major-TOM/metadata.parquet').to_pandas()
df['timestamp'] = pd.to_datetime(df.timestamp)
# centre_lon / centre_lat are geographic degrees (e.g. -178.2, -82.77), so the
# point geometry is tagged as WGS84 lon/lat. The per-row `crs` column differs
# between rows (EPSG:32701, EPSG:32617, ...); stamping row 0's value on every
# point would mislabel the whole geometry column.
# NOTE(review): `gpd` is not imported explicitly in this notebook — presumably it
# is re-exported by `from majortom import *`; confirm.
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.centre_lon, df.centre_lat), crs="EPSG:4326"
)

In [6]:
gdf['grid_cell'].duplicated().sum() # rows whose grid_cell repeats an earlier row: the original metadata already contains duplicates

7197

In [7]:
# Show the original metadata frame; the rich repr truncates it to the first and last rows.
gdf

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,geometry
0,922D_249L,-922,-249,S2A_MSIL2A_20230119T161811_N0509_R111_T01CDJ_2...,2023-01-19 16:18:11,18.941737,0.000000,-82.770666,-178.200331,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,0,POINT (-178.2 -82.771)
1,922D_245L,-922,-245,S2B_MSIL2A_20181219T162339_N9999_R011_T01CEJ_2...,2018-12-19 16:23:39,22.742201,0.000000,-82.768451,-175.349546,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,1,POINT (-175.35 -82.768)
2,922D_244L,-922,-244,S2A_MSIL2A_20200119T155811_N9999_R025_T01CEJ_2...,2020-01-19 15:58:11,0.000000,0.000000,-82.767914,-174.636985,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,2,POINT (-174.637 -82.768)
3,922D_243L,-922,-243,S2A_MSIL2A_20210103T155811_N9999_R025_T01CEJ_2...,2021-01-03 15:58:11,3.769691,0.000000,-82.767385,-173.924477,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,3,POINT (-173.924 -82.767)
4,922D_242L,-922,-242,S2B_MSIL2A_20181220T155319_N9999_R025_T01CEJ_2...,2018-12-20 15:53:19,0.000000,0.000000,-82.766864,-173.212021,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,4,POINT (-173.212 -82.767)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245881,920U_112L,920,-112,S2B_MSIL2A_20220904T213039_N0400_R086_T17XNM_2...,2022-09-04 21:30:39,0.000000,0.000000,82.677347,-78.077006,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,495,POINT (-78.077 82.677)
2245882,920U_111L,920,-111,S2A_MSIL2A_20230417T213041_N0509_R086_T17XNM_2...,2023-04-17 21:30:41,0.000000,0.000000,82.676773,-77.372609,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,496,POINT (-77.373 82.677)
2245883,920U_110L,920,-110,S2A_MSIL2A_20200416T211031_N0500_R143_T17XNM_2...,2020-04-16 21:10:31,0.130192,0.000000,82.676194,-76.668268,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,497,POINT (-76.668 82.676)
2245884,920U_109L,920,-109,S2B_MSIL2A_20200816T210029_N0500_R100_T17XNM_2...,2020-08-16 21:00:29,0.002016,0.000252,82.675608,-75.963985,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,498,POINT (-75.964 82.676)


In [24]:
def combine_feathers(name: str, n: int, directory: str = ".")->pd.DataFrame:
    """Concatenate the ``n`` feather shards written by one worker into a single frame.

    Shards are read from
    ``{directory}/metadata_with_forest_ratio_{name}_{i}.feather`` for every
    ``i`` in ``range(n)`` and stacked in that order.

    Args:
        name: Worker tag embedded in the shard file names (e.g. ``'colab'``).
        n: Number of shards to read; must be at least 1.
        directory: Folder holding the shards. Defaults to the current working
            directory, which reproduces the previously hard-coded ``./`` prefix.

    Returns:
        The stacked shards without the ``geometry`` column and with
        ``timestamp`` rendered as ``%Y%m%dT%H%M%S`` strings.

    Raises:
        ValueError: If ``n`` is smaller than 1 (there would be nothing to combine).
    """
    # Fail early with a readable message instead of pandas' generic
    # "No objects to concatenate" error.
    if n < 1:
        raise ValueError(f"n must be at least 1 to combine shards, got {n}")

    shards = [
        pd.read_feather(f"{directory}/metadata_with_forest_ratio_{name}_{i}.feather")
        for i in range(n)
    ]

    # The geometry column is dropped here; the loading cells rebuild it from
    # centre_lon / centre_lat.
    combined = pd.concat(shards).drop(columns=['geometry'])
    # Timestamps are stored as compact strings; the loading cells parse them
    # back with pd.to_datetime.
    combined['timestamp'] = combined['timestamp'].dt.strftime('%Y%m%dT%H%M%S')
    return combined

In [26]:
# Shard counts per worker: 101 for colab, 101 for kaggle, 501 for local.
colab_df, kaggle_df, local_df = (
    combine_feathers(worker, shard_count)
    for worker, shard_count in (('colab', 101), ('kaggle', 101), ('local', 501))
)
# Stack the three workers' results in the order colab, local, kaggle.
final_df = pd.concat([colab_df, local_df, kaggle_df])

In [27]:
# Sanity check on the combined frame: overall shape and number of repeated grid cells.
duplicate_cells = final_df['grid_cell'].duplicated().sum()
print(final_df.shape)
print(duplicate_cells)  # matches the original!
final_df.head()

(2245886, 13)
7197


Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio
0,922D_249L,-922,-249,S2A_MSIL2A_20230119T161811_N0509_R111_T01CDJ_2...,20230119T161811,18.941737,0.0,-82.770666,-178.200331,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,0,0.0
1,922D_245L,-922,-245,S2B_MSIL2A_20181219T162339_N9999_R011_T01CEJ_2...,20181219T162339,22.742201,0.0,-82.768451,-175.349546,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,1,0.0
2,922D_244L,-922,-244,S2A_MSIL2A_20200119T155811_N9999_R025_T01CEJ_2...,20200119T155811,0.0,0.0,-82.767914,-174.636985,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,2,0.0
3,922D_243L,-922,-243,S2A_MSIL2A_20210103T155811_N9999_R025_T01CEJ_2...,20210103T155811,3.769691,0.0,-82.767385,-173.924477,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,3,0.0
4,922D_242L,-922,-242,S2B_MSIL2A_20181220T155319_N9999_R025_T01CEJ_2...,20181220T155319,0.0,0.0,-82.766864,-173.212021,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,4,0.0


In [28]:
# Inspect the last rows of the combined frame.
final_df.tail()

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio
2245881,920U_112L,920,-112,S2B_MSIL2A_20220904T213039_N0400_R086_T17XNM_2...,20220904T213039,0.0,0.0,82.677347,-78.077006,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,495,0.0
2245882,920U_111L,920,-111,S2A_MSIL2A_20230417T213041_N0509_R086_T17XNM_2...,20230417T213041,0.0,0.0,82.676773,-77.372609,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,496,0.0
2245883,920U_110L,920,-110,S2A_MSIL2A_20200416T211031_N0500_R143_T17XNM_2...,20200416T211031,0.130192,0.0,82.676194,-76.668268,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,497,0.0
2245884,920U_109L,920,-109,S2B_MSIL2A_20200816T210029_N0500_R100_T17XNM_2...,20200816T210029,0.002016,0.000252,82.675608,-75.963985,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,498,0.0
2245885,920U_108L,920,-108,S2A_MSIL2A_20200722T210031_N9999_R100_T17XNM_2...,20200722T210031,0.0,0.0,82.675016,-75.259759,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,499,0.0


In [30]:
# Persist the combined metadata (including forest_ratio) to the current working directory.
# The timestamp column is written as the string form produced by combine_feathers.
final_df.to_parquet('metadata_with_forest_ratio.parquet')

# Test: reload the exported parquet and run it through the Major-TOM workflow

In [9]:
# Local parquet written by the export cell; loaded below in place of the upstream metadata file.
LOCAL_URL = 'metadata_with_forest_ratio.parquet'

In [11]:
# Reload the exported metadata and parse the string timestamps back into datetimes.
df = pq.read_table(LOCAL_URL).to_pandas()
df['timestamp'] = pd.to_datetime(df.timestamp)
# centre_lon / centre_lat are geographic degrees, so the point geometry is
# tagged as WGS84 lon/lat. The per-row `crs` column differs between rows
# (EPSG:32701, EPSG:32617, ...); stamping row 0's value on every point would
# mislabel the whole geometry column.
# NOTE(review): `gpd` is not imported explicitly in this notebook — presumably it
# is re-exported by `from majortom import *`; confirm.
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.centre_lon, df.centre_lat), crs="EPSG:4326"
)

In [12]:
gdf['grid_cell'].duplicated().sum() # same check as on the original metadata, which also contains duplicate grid cells

7197

In [13]:
# Show the reloaded frame: the original metadata columns plus forest_ratio and the rebuilt geometry.
gdf

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio,geometry
0,922D_249L,-922,-249,S2A_MSIL2A_20230119T161811_N0509_R111_T01CDJ_2...,2023-01-19 16:18:11,18.941737,0.000000,-82.770666,-178.200331,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,0,0.0,POINT (-178.2 -82.771)
1,922D_245L,-922,-245,S2B_MSIL2A_20181219T162339_N9999_R011_T01CEJ_2...,2018-12-19 16:23:39,22.742201,0.000000,-82.768451,-175.349546,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,1,0.0,POINT (-175.35 -82.768)
2,922D_244L,-922,-244,S2A_MSIL2A_20200119T155811_N9999_R025_T01CEJ_2...,2020-01-19 15:58:11,0.000000,0.000000,-82.767914,-174.636985,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,2,0.0,POINT (-174.637 -82.768)
3,922D_243L,-922,-243,S2A_MSIL2A_20210103T155811_N9999_R025_T01CEJ_2...,2021-01-03 15:58:11,3.769691,0.000000,-82.767385,-173.924477,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,3,0.0,POINT (-173.924 -82.767)
4,922D_242L,-922,-242,S2B_MSIL2A_20181220T155319_N9999_R025_T01CEJ_2...,2018-12-20 15:53:19,0.000000,0.000000,-82.766864,-173.212021,EPSG:32701,https://huggingface.co/datasets/Major-TOM/Core...,4,0.0,POINT (-173.212 -82.767)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245881,920U_112L,920,-112,S2B_MSIL2A_20220904T213039_N0400_R086_T17XNM_2...,2022-09-04 21:30:39,0.000000,0.000000,82.677347,-78.077006,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,495,0.0,POINT (-78.077 82.677)
2245882,920U_111L,920,-111,S2A_MSIL2A_20230417T213041_N0509_R086_T17XNM_2...,2023-04-17 21:30:41,0.000000,0.000000,82.676773,-77.372609,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,496,0.0,POINT (-77.373 82.677)
2245883,920U_110L,920,-110,S2A_MSIL2A_20200416T211031_N0500_R143_T17XNM_2...,2020-04-16 21:10:31,0.130192,0.000000,82.676194,-76.668268,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,497,0.0,POINT (-76.668 82.676)
2245884,920U_109L,920,-109,S2B_MSIL2A_20200816T210029_N0500_R100_T17XNM_2...,2020-08-16 21:00:29,0.002016,0.000252,82.675608,-75.963985,EPSG:32617,https://huggingface.co/datasets/Major-TOM/Core...,498,0.0,POINT (-75.964 82.676)


In [14]:
# Look up a single grid cell by its identifier.
cell_mask = gdf['grid_cell'].eq('28D_270R')
gdf[cell_mask]

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio,geometry
700626,28D_270R,-28,270,S2A_MSIL2A_20190203T083141_N0213_R021_T35MKT_2...,2019-02-03 08:31:41,0.545228,0.308989,-2.469878,24.320742,EPSG:32735,https://huggingface.co/datasets/Major-TOM/Core...,172,0.902647,POINT (24.321 -2.47)


In [15]:
from shapely.geometry import box

# Example lon/lat bounding boxes used for filtering (x = longitude, y = latitude).
switzerland = box(minx=5.9559111595, miny=45.8179931641, maxx=10.4920501709, maxy=47.808380127)
gabon = box(minx=8.1283659854, miny=-4.9213919841, maxx=15.1618722208, maxy=2.7923006325)
napoli = box(minx=14.091710578, miny=40.7915558593, maxx=14.3723765416, maxy=40.9819258062)
# A remote patch over the Pacific - no data.
pacific = box(minx=-153.3922893485, miny=39.6170415622, maxx=-152.0423077748, maxy=40.7090892316)

In [19]:
# Filter settings for the Napoli example; swap `region` for any of the
# bounding boxes defined in the cell above.
napoli_filters = dict(
    cloud_cover=(0, 10),                    # cloud cover between 0% and 10%
    region=napoli,
    daterange=('2020-01-01', '2025-01-01'),  # temporal range
    nodata=(0.0, 0.0),                       # only 0% of no data allowed
)
filtered_df = filter_metadata(gdf, **napoli_filters)
filtered_df.head()

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio,geometry
1593374,454U_120R,454,120,S2B_MSIL2A_20220719T095559_N0400_R122_T33TVF_2...,2022-07-19 09:55:59,0.0,0.0,40.823861,14.292709,EPSG:32633,https://huggingface.co/datasets/Major-TOM/Core...,455,0.003593,POINT (14.293 40.824)
1595266,455U_120R,455,120,S2B_MSIL2A_20220719T095559_N0400_R122_T33TVF_2...,2022-07-19 09:55:59,0.0,0.0,40.913671,14.311585,EPSG:32633,https://huggingface.co/datasets/Major-TOM/Core...,347,0.009702,POINT (14.312 40.914)
1595265,455U_119R,455,119,S2A_MSIL2A_20200113T095351_N0500_R079_T33TVF_2...,2020-01-13 09:53:51,0.0,0.0,40.913731,14.19273,EPSG:32633,https://huggingface.co/datasets/Major-TOM/Core...,346,0.051371,POINT (14.193 40.914)


In [22]:
# Pass the first filtered metadata row to read_row.
# NOTE(review): read_row is not defined in this notebook — presumably it comes from
# `from majortom import *`; `out` is not used by any later cell shown here.
out = read_row(filtered_df.iloc[0]) 

In [30]:
# Keep only the rows whose forest_ratio exceeds 0.5.
# Note: this rebinds `filtered_df`, replacing the Napoli selection made earlier.
forest_mask = gdf['forest_ratio'].gt(0.5)
filtered_df = gdf[forest_mask]
filtered_df

Unnamed: 0,grid_cell,grid_row_u,grid_col_r,product_id,timestamp,cloud_cover,nodata,centre_lat,centre_lon,crs,parquet_url,parquet_row,forest_ratio,geometry
104161,617D_434L,-617,-434,S2A_MSIL2A_20230910T140731_N0509_R024_T19FEU_2...,2023-09-10 14:07:31,0.975957,0.0,-55.374000,-68.598392,EPSG:32719,https://huggingface.co/datasets/Major-TOM/Core...,174,0.540606,POINT (-68.598 -55.374)
104162,617D_433L,-617,-433,S2B_MSIL2A_20210401T140049_N0500_R067_T19FEU_2...,2021-04-01 14:00:49,3.975368,0.0,-55.373898,-68.440331,EPSG:32719,https://huggingface.co/datasets/Major-TOM/Core...,175,0.551271,POINT (-68.44 -55.374)
104285,616D_436L,-616,-436,S2B_MSIL2A_20200403T140729_N0500_R024_T19FEU_2...,2020-04-03 14:07:29,0.337710,0.0,-55.284285,-68.763563,EPSG:32719,https://huggingface.co/datasets/Major-TOM/Core...,298,0.540483,POINT (-68.764 -55.284)
104286,616D_435L,-616,-435,S2B_MSIL2A_20190820T140059_N9999_R067_T19FEU_2...,2019-08-20 14:00:59,0.000000,0.0,-55.284183,-68.605848,EPSG:32719,https://huggingface.co/datasets/Major-TOM/Core...,299,0.521604,POINT (-68.606 -55.284)
104287,616D_434L,-616,-434,S2A_MSIL2A_20230910T140731_N0509_R024_T19FEU_2...,2023-09-10 14:07:31,6.704225,0.0,-55.284083,-68.448133,EPSG:32719,https://huggingface.co/datasets/Major-TOM/Core...,300,0.503588,POINT (-68.448 -55.284)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172517,799U_432R,799,432,S2B_MSIL2A_20200518T041549_N0500_R090_T51WWV_2...,2020-05-18 04:15:49,0.000000,0.0,71.810475,124.165049,EPSG:32651,https://huggingface.co/datasets/Major-TOM/Core...,130,0.696450,POINT (124.165 71.81)
2173401,800U_348R,800,348,S2B_MSIL2A_20220614T054639_N0400_R048_T47WNV_2...,2022-06-14 05:46:39,0.000000,0.0,71.900016,100.532078,EPSG:32647,https://huggingface.co/datasets/Major-TOM/Core...,14,0.603783,POINT (100.532 71.9)
2173476,800U_426R,800,426,S2A_MSIL2A_20220910T041601_N0400_R090_T51WVV_2...,2022-09-10 04:16:01,0.000000,1.0,71.901147,123.028542,EPSG:32651,https://huggingface.co/datasets/Major-TOM/Core...,89,0.616443,POINT (123.029 71.901)
2173477,800U_427R,800,427,S2B_MSIL2A_20210226T035719_N0500_R004_T51WWV_2...,2021-02-26 03:57:19,0.000000,0.0,71.900931,123.317691,EPSG:32651,https://huggingface.co/datasets/Major-TOM/Core...,90,0.710350,POINT (123.318 71.901)


In [35]:
# `.name` of a single row is its index label, i.e. the row's label in the full gdf.
filtered_df.iloc[0].name

104161

In [26]:
# Time the download of only the first filtered row into ../data/.
# NOTE(review): %timeit executes the statement on every timing run, so the
# download is repeated (one progress bar per run); `%time` would run it once.
%timeit filter_download(filtered_df.iloc[0:1], by_row=True, local_dir='../data/', source_name='L2A')

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading and unpacking...:   0%|          | 0/1 [00:00<?, ?it/s]

7.14 s ± 478 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Bands requested from the dataset (passed as tif_bands); note the list has no 'B10'.
requested_bands = [
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
    'B07', 'B08', 'B8A', 'B09', 'B11', 'B12',
]
ds = MajorTOM(filtered_df, '../data/L2A', tif_bands=requested_bands)

In [75]:
# 'B10' is not among the tif_bands requested when `ds` was built, so this lookup
# is expected to fail. Catch the error and show it, so the notebook still runs
# to completion under Restart & Run All.
try:
    b10 = ds[9]['B10']
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(b10)

KeyError: 'B10'