# Join the extractions DataFrames

In [3]:
from pathlib import Path
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
import warnings

# Group the per-dataset extraction files into larger geoparquet files.
#
# Every "*.geoparquet" file in `dataset_folder` is read (only the columns in
# `required_columns`), brought to EPSG:4326, filtered on `ewoc_code`, and
# accumulated in memory. Once the accumulated in-memory size exceeds
# `maximum_size_mb`, or the last input file has been processed, the
# accumulated frames are concatenated and written to `target_folder` as
# "group_<idx>_updated_flag.geoparquet".

# Silence all warnings so the progress log below stays readable.
# NOTE(review): this also hides potentially useful library warnings.
warnings.simplefilter('ignore')

required_columns = ["sample_id", "ref_id", "ewoc_code", "valid_time", "extract", "h3_l3_cell", "geometry"]
dataset_folder = Path("/vitodata/worldcereal_data/EXTRACTIONS/all_datasets/all_datasets_updated_flag/")
target_folder = Path("/vitodata/worldcereal_data/EXTRACTIONS/all_datasets/grouped_datasets")
# The output folder may not exist on a fresh run; create it up front so the
# first write does not fail after minutes of reading.
target_folder.mkdir(parents=True, exist_ok=True)

# Sort by file stem so the grouping is reproducible from run to run.
all_datasets = sorted(dataset_folder.glob("*.geoparquet"), key=lambda x: x.stem)

# In-memory size (MB) above which the current group is flushed to disk.
# A group can overshoot this value by the size of the last dataset added.
maximum_size_mb = 500

dataframe_groups = []  # not used in this cell; kept in case other cells rely on it
group_memory_usage_mb = 0
group = []
group_idx = 1

for f_path in tqdm(all_datasets):
    # Only read the columns necessary for the extractions.
    df = gpd.read_parquet(f_path, columns=required_columns)

    # Make sure all the datasets are in the same CRS: files without CRS
    # metadata are declared to be EPSG:4326, the others are reprojected.
    if df.crs is None:
        df = df.set_crs("epsg:4326")
    else:
        df = df.to_crs(epsg=4326)

    # Keep only rows whose ewoc_code lies strictly inside the
    # (1100000000, 1200000000) range; per the original author's note this
    # removes land-cover samples and keeps crop-type ones.
    df = df[(df.ewoc_code > 1100000000) & (df.ewoc_code < 1200000000)]

    group.append(df)
    group_memory_usage_mb += df.memory_usage(deep=True).sum() / 1e6
    print('After adding dataset: {} group size is: {:.1f} MB'.format(f_path.stem, group_memory_usage_mb))

    # Flush when the size budget is exceeded, and always after the last
    # file so the trailing (smaller) group is not lost.
    is_last_dataset = f_path == all_datasets[-1]
    if (group_memory_usage_mb > maximum_size_mb) or is_last_dataset:
        print('Writing group with size: ', group_memory_usage_mb, 'MB to disk with ', len(group), 'datasets')
        merged_group = pd.concat(group, axis=0)
        merged_group.to_parquet(
            target_folder / f"group_{group_idx}_updated_flag.geoparquet"
        )
        # Release the merged frame before the next read and reset the
        # accumulators for the next group.
        del merged_group
        group = []
        group_memory_usage_mb = 0
        group_idx += 1

  1%|          | 3/254 [00:00<00:29,  8.52it/s]

After adding dataset: 00_2019_KEN_NHI-CROP-HARVEST_POINT_100 group size is: 0.0 MB
After adding dataset: 00_2021_KEN_COPERNICUS-GEOGLAM-LR_POINT_111 group size is: 0.5 MB
After adding dataset: 01_2019_AF_One-Acre-Fund-MEL_POINT_110 group size is: 1.0 MB


  2%|▏         | 5/254 [00:00<00:27,  9.16it/s]

After adding dataset: 01_2019_KEN_RadiantEarth-01_POLY_101 group size is: 1.0 MB
After adding dataset: 01_2020_RWA_FAO-WAPOR-1_POINT_111 group size is: 1.3 MB
After adding dataset: 01_2021_KEN_COPERNICUS-GEOGLAM-SR_POINT_111 group size is: 1.8 MB


  4%|▎         | 9/254 [00:00<00:19, 12.72it/s]

After adding dataset: 02_2020_GO_NHI-CROP-HARVEST_POINT_100 group size is: 1.8 MB
After adding dataset: 03_2018_TZA_OSF-AFSIS_POINT_110 group size is: 2.0 MB
After adding dataset: 03_2019_TZA_OSF-AFSIS_POINT_110 group size is: 2.3 MB
After adding dataset: 03_2021_UGA_COPERNICUS-GEOGLAM-LR_POINT_111 group size is: 2.9 MB


  5%|▌         | 13/254 [00:01<00:16, 14.48it/s]

After adding dataset: 03_2021_UGA_COPERNICUS-GEOGLAM-SR_POINT_111 group size is: 3.7 MB
After adding dataset: 05_2021_TZA_COPERNICUS-GEOGLAM_POINT_110 group size is: 4.0 MB
After adding dataset: 07_2017_TZA_OSF-AFSIS_POINT_110 group size is: 4.1 MB
After adding dataset: 07_2022_IDN_vito-manual-points_POINT_100 group size is: 4.1 MB


  7%|▋         | 18/254 [00:01<00:13, 17.90it/s]

After adding dataset: 07_2023_IDN_vito-manual-points_POINT_100 group size is: 4.1 MB
After adding dataset: 08_2017_SSD_ESA-project-Sen2Agri_POINT_100 group size is: 4.1 MB
After adding dataset: 08_2017_SSD_ESA-project-Sen2Agri_POLY_100 group size is: 4.1 MB
After adding dataset: 08_2019_AF_DE-WA-VAL1_POINT_100 group size is: 4.1 MB
After adding dataset: 08_2019_AF_NHI-CROP-HARVEST_POLY_100 group size is: 4.1 MB


  9%|▉         | 24/254 [00:01<00:11, 20.63it/s]

After adding dataset: 08_2021_AF_DE-WA-TRAIN1_POLY_100 group size is: 4.1 MB
After adding dataset: 08_2022_AF_DE-WA-TRAIN1_POLY_100 group size is: 4.1 MB
After adding dataset: 09_2019_AF_DE-WA-TRAIN1_POLY_100 group size is: 4.1 MB
After adding dataset: 09_2019_AF_DE-WA-TRAIN2_POLY_100 group size is: 4.1 MB
After adding dataset: 09_2019_TZA_One-Acre-Fund-MEL_POINT_110 group size is: 4.3 MB


 11%|█         | 27/254 [00:01<00:10, 21.39it/s]

After adding dataset: 11_2019_AF_DE-WA-VAL2_POINT_100 group size is: 4.3 MB
After adding dataset: 11_2020_ETH_NHI-CROP-HARVEST_POLY_100 group size is: 4.3 MB
After adding dataset: 12_2021_AF_DE-WA-VAL1_POINT_100 group size is: 4.3 MB
After adding dataset: 12_2022_AF_DE-WA-VAL1_POINT_100 group size is: 4.3 MB
After adding dataset: 19_2021_MOZ_FAO-WAPOR-1_POLY_111 group size is: 4.3 MB


 12%|█▏        | 30/254 [00:02<00:16, 13.53it/s]

After adding dataset: 31_2018_GLO_EWOCO_POINT_100 group size is: 4.3 MB
After adding dataset: 31_2019_GLO_EWOCO_POINT_100 group size is: 4.3 MB


 13%|█▎        | 32/254 [00:02<00:28,  7.84it/s]

After adding dataset: 31_2020_GLO_EWOCO_POINT_100 group size is: 4.3 MB
After adding dataset: 41_2019_ESP_SIGPAC-Catalunya_POLY_111_part4 group size is: 15.5 MB


 13%|█▎        | 34/254 [00:09<03:39,  1.00it/s]

After adding dataset: 42_2019_ESP_SIGPAC-Catalunya_POLY_111_part1 group size is: 55.5 MB


 14%|█▍        | 35/254 [00:13<05:01,  1.38s/it]

After adding dataset: 42_2019_ESP_SIGPAC-Catalunya_POLY_111_part2 group size is: 81.2 MB


 14%|█▍        | 36/254 [00:17<06:50,  1.89s/it]

After adding dataset: 42_2019_ESP_SIGPAC-Catalunya_POLY_111_part3 group size is: 112.3 MB


 15%|█▍        | 37/254 [00:21<08:35,  2.38s/it]

After adding dataset: 43_2017_FRA_LPIS_POLY_110_part13 group size is: 141.9 MB


 15%|█▍        | 38/254 [00:25<09:22,  2.61s/it]

After adding dataset: 43_2017_FRA_LPIS_POLY_110_part26 group size is: 190.8 MB


 15%|█▌        | 39/254 [00:28<09:42,  2.71s/it]

After adding dataset: 43_2018_FRA_LPIS_POLY_110_part15 group size is: 225.2 MB


 16%|█▌        | 40/254 [00:31<10:03,  2.82s/it]

After adding dataset: 43_2018_FRA_LPIS_POLY_110_part20 group size is: 257.8 MB


 16%|█▌        | 41/254 [00:35<10:58,  3.09s/it]

After adding dataset: 43_2019_FRA_LPIS_POLY_110_part18 group size is: 308.0 MB


 17%|█▋        | 42/254 [00:41<14:12,  4.02s/it]

After adding dataset: 43_2020_ESP_Eurocrops-Navarre_POLY_110_part1 group size is: 308.7 MB


 17%|█▋        | 43/254 [00:51<19:40,  5.59s/it]

After adding dataset: 43_2020_ESP_Eurocrops-Navarre_POLY_110_part2 group size is: 309.5 MB


 17%|█▋        | 44/254 [00:54<17:02,  4.87s/it]

After adding dataset: 43_2020_FRA_LPIS_POLY_110_part12 group size is: 333.1 MB


 18%|█▊        | 45/254 [00:57<14:37,  4.20s/it]

After adding dataset: 43_2020_FRA_LPIS_POLY_110_part25 group size is: 370.6 MB


 18%|█▊        | 46/254 [01:01<14:57,  4.31s/it]

After adding dataset: 44_2017_FRA_LPIS_POLY_110_part14 group size is: 397.4 MB


 19%|█▊        | 47/254 [01:06<15:28,  4.48s/it]

After adding dataset: 44_2017_FRA_LPIS_POLY_110_part20 group size is: 432.4 MB
After adding dataset: 44_2017_FRA_LPIS_POLY_110_part22 group size is: 502.3 MB
Writing group with size:  502.325652 MB to disk with  48 datasets


 19%|█▉        | 49/254 [01:46<37:20, 10.93s/it]

After adding dataset: 44_2017_FRA_LPIS_POLY_110_part25 group size is: 47.9 MB


 20%|█▉        | 50/254 [01:50<29:29,  8.67s/it]

After adding dataset: 44_2018_FRA_LPIS_POLY_110_part13 group size is: 98.9 MB


 20%|██        | 51/254 [01:53<23:59,  7.09s/it]

After adding dataset: 44_2018_FRA_LPIS_POLY_110_part14 group size is: 144.4 MB


 20%|██        | 52/254 [01:56<20:03,  5.96s/it]

After adding dataset: 44_2018_FRA_LPIS_POLY_110_part18 group size is: 167.5 MB


 21%|██        | 53/254 [02:00<17:42,  5.28s/it]

After adding dataset: 44_2018_FRA_LPIS_POLY_110_part26 group size is: 210.6 MB


 21%|██▏       | 54/254 [02:03<15:43,  4.72s/it]

After adding dataset: 44_2018_FRA_LPIS_POLY_110_part29 group size is: 251.2 MB


 22%|██▏       | 55/254 [02:07<14:16,  4.31s/it]

After adding dataset: 44_2019_FRA_LPIS_POLY_110_part15 group size is: 302.1 MB


 22%|██▏       | 56/254 [02:13<15:56,  4.83s/it]

After adding dataset: 44_2019_FRA_LPIS_POLY_110_part17 group size is: 329.5 MB


 22%|██▏       | 57/254 [02:16<14:03,  4.28s/it]

After adding dataset: 44_2019_FRA_LPIS_POLY_110_part19 group size is: 363.4 MB


 23%|██▎       | 58/254 [02:19<13:25,  4.11s/it]

After adding dataset: 44_2019_FRA_LPIS_POLY_110_part20 group size is: 409.1 MB


 23%|██▎       | 59/254 [02:23<13:13,  4.07s/it]

After adding dataset: 44_2019_FRA_LPIS_POLY_110_part22 group size is: 432.5 MB
After adding dataset: 44_2020_FRA_LPIS_POLY_110_part15 group size is: 507.3 MB
Writing group with size:  507.34825599999994 MB to disk with  12 datasets


 24%|██▍       | 61/254 [02:59<31:38,  9.83s/it]

After adding dataset: 44_2020_FRA_LPIS_POLY_110_part24 group size is: 5.9 MB


 24%|██▍       | 62/254 [03:02<24:20,  7.61s/it]

After adding dataset: 44_2020_FRA_LPIS_POLY_110_part29 group size is: 44.8 MB


 25%|██▍       | 63/254 [03:05<20:29,  6.44s/it]

After adding dataset: 44_2020_FRA_LPIS_POLY_110_part5 group size is: 90.5 MB


 25%|██▌       | 64/254 [03:08<17:17,  5.46s/it]

After adding dataset: 44_2020_FRA_LPIS_POLY_110_part7 group size is: 117.3 MB


 26%|██▌       | 65/254 [03:12<14:53,  4.73s/it]

After adding dataset: 45_2017_FRA_LPIS_POLY_110_part15 group size is: 159.8 MB


 26%|██▌       | 66/254 [03:17<15:11,  4.85s/it]

After adding dataset: 45_2017_FRA_LPIS_POLY_110_part3 group size is: 179.1 MB


 26%|██▋       | 67/254 [03:20<13:21,  4.29s/it]

After adding dataset: 45_2018_FRA_LPIS_POLY_110_part16 group size is: 226.5 MB


 27%|██▋       | 68/254 [03:24<13:06,  4.23s/it]

After adding dataset: 45_2018_FRA_LPIS_POLY_110_part19 group size is: 246.1 MB


 27%|██▋       | 69/254 [03:27<12:30,  4.06s/it]

After adding dataset: 45_2018_FRA_LPIS_POLY_110_part3 group size is: 267.5 MB


 28%|██▊       | 70/254 [03:30<11:19,  3.69s/it]

After adding dataset: 45_2019_FRA_LPIS_POLY_110_part24 group size is: 284.5 MB


 28%|██▊       | 71/254 [03:33<10:06,  3.32s/it]

After adding dataset: 45_2019_FRA_LPIS_POLY_110_part6 group size is: 319.0 MB


 28%|██▊       | 72/254 [03:35<09:33,  3.15s/it]

After adding dataset: 45_2020_FRA_LPIS_POLY_110_part11 group size is: 347.0 MB


 29%|██▊       | 73/254 [03:41<12:03,  4.00s/it]

After adding dataset: 45_2020_FRA_LPIS_POLY_110_part17 group size is: 392.7 MB


 29%|██▉       | 74/254 [03:44<11:05,  3.70s/it]

After adding dataset: 45_2020_FRA_LPIS_POLY_110_part23 group size is: 404.8 MB


 30%|██▉       | 75/254 [03:48<11:10,  3.75s/it]

After adding dataset: 46_2017_FRA_LPIS_POLY_110_part16 group size is: 437.1 MB


 30%|██▉       | 76/254 [03:51<10:14,  3.45s/it]

After adding dataset: 46_2017_FRA_LPIS_POLY_110_part2 group size is: 466.0 MB


 30%|███       | 77/254 [03:55<10:32,  3.57s/it]

After adding dataset: 46_2017_FRA_LPIS_POLY_110_part23 group size is: 499.9 MB
After adding dataset: 46_2017_FRA_LPIS_POLY_110_part27 group size is: 529.8 MB
Writing group with size:  529.803806 MB to disk with  18 datasets


 31%|███       | 79/254 [04:26<25:07,  8.61s/it]

After adding dataset: 46_2017_FRA_LPIS_POLY_110_part6 group size is: 69.7 MB


 31%|███▏      | 80/254 [04:30<20:53,  7.20s/it]

After adding dataset: 46_2017_FRA_LPIS_POLY_110_part7 group size is: 100.9 MB


 32%|███▏      | 81/254 [04:34<18:09,  6.30s/it]

After adding dataset: 46_2018_FRA_LPIS_POLY_110_part12 group size is: 133.7 MB


 32%|███▏      | 82/254 [04:38<15:45,  5.49s/it]

After adding dataset: 46_2018_FRA_LPIS_POLY_110_part21 group size is: 167.6 MB


 33%|███▎      | 83/254 [04:42<14:23,  5.05s/it]

After adding dataset: 46_2018_FRA_LPIS_POLY_110_part23 group size is: 195.7 MB


 33%|███▎      | 84/254 [04:46<13:53,  4.91s/it]

After adding dataset: 46_2018_FRA_LPIS_POLY_110_part6 group size is: 257.7 MB


 33%|███▎      | 85/254 [04:50<13:04,  4.64s/it]

After adding dataset: 46_2018_FRA_LPIS_POLY_110_part7 group size is: 287.1 MB


 34%|███▍      | 86/254 [04:53<11:43,  4.19s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part23 group size is: 308.5 MB


 34%|███▍      | 87/254 [04:57<11:13,  4.03s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part26 group size is: 332.4 MB


 35%|███▍      | 88/254 [05:01<11:24,  4.12s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part29 group size is: 359.8 MB


 35%|███▌      | 89/254 [05:04<10:31,  3.83s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part4 group size is: 382.1 MB


 35%|███▌      | 90/254 [05:09<11:16,  4.13s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part8 group size is: 411.7 MB


 36%|███▌      | 91/254 [05:15<12:14,  4.50s/it]

After adding dataset: 46_2019_FRA_LPIS_POLY_110_part9 group size is: 468.5 MB
After adding dataset: 46_2020_FRA_LPIS_POLY_110_part2 group size is: 512.2 MB
Writing group with size:  512.1939349999999 MB to disk with  14 datasets


 37%|███▋      | 93/254 [05:54<28:52, 10.76s/it]

After adding dataset: 46_2020_FRA_LPIS_POLY_110_part26 group size is: 39.3 MB


 37%|███▋      | 94/254 [05:57<22:42,  8.52s/it]

After adding dataset: 46_2020_FRA_LPIS_POLY_110_part28 group size is: 95.9 MB


 37%|███▋      | 95/254 [06:02<19:37,  7.41s/it]

After adding dataset: 46_2020_FRA_LPIS_POLY_110_part3 group size is: 143.8 MB


 38%|███▊      | 96/254 [06:05<16:10,  6.14s/it]

After adding dataset: 46_2020_FRA_LPIS_POLY_110_part4 group size is: 205.5 MB


 38%|███▊      | 97/254 [06:11<15:59,  6.11s/it]

After adding dataset: 46_2020_SVN_LPIS_POLY_110_part1 group size is: 250.7 MB


 39%|███▊      | 98/254 [06:16<15:15,  5.87s/it]

After adding dataset: 46_2020_SVN_LPIS_POLY_110_part2 group size is: 296.9 MB


 39%|███▉      | 99/254 [06:21<14:38,  5.67s/it]

After adding dataset: 47_2017_FRA_LPIS_POLY_110_part19 group size is: 352.5 MB


 39%|███▉      | 100/254 [06:26<13:33,  5.28s/it]

After adding dataset: 47_2017_FRA_LPIS_POLY_110_part4 group size is: 389.5 MB


 40%|███▉      | 101/254 [06:30<12:35,  4.94s/it]

After adding dataset: 47_2017_FRA_LPIS_POLY_110_part9 group size is: 427.1 MB


 40%|████      | 102/254 [06:34<11:33,  4.56s/it]

After adding dataset: 47_2018_FRA_LPIS_POLY_110_part17 group size is: 491.7 MB
After adding dataset: 47_2018_FRA_LPIS_POLY_110_part25 group size is: 547.2 MB
Writing group with size:  547.238462 MB to disk with  11 datasets


 41%|████      | 104/254 [07:02<21:24,  8.57s/it]

After adding dataset: 47_2018_FRA_LPIS_POLY_110_part28 group size is: 50.8 MB


 41%|████▏     | 105/254 [07:07<18:01,  7.26s/it]

After adding dataset: 47_2018_FRA_LPIS_POLY_110_part4 group size is: 78.9 MB


 42%|████▏     | 106/254 [07:10<14:43,  5.97s/it]

After adding dataset: 47_2019_FRA_LPIS_POLY_110_part10 group size is: 116.2 MB


 42%|████▏     | 107/254 [07:13<12:43,  5.19s/it]

After adding dataset: 47_2019_FRA_LPIS_POLY_110_part13 group size is: 170.0 MB


 43%|████▎     | 108/254 [07:20<13:59,  5.75s/it]

After adding dataset: 47_2019_FRA_LPIS_POLY_110_part2 group size is: 203.5 MB


 43%|████▎     | 109/254 [07:24<12:19,  5.10s/it]

After adding dataset: 47_2019_FRA_LPIS_POLY_110_part27 group size is: 245.3 MB


 43%|████▎     | 110/254 [07:27<10:47,  4.50s/it]

After adding dataset: 47_2019_FRA_LPIS_POLY_110_part28 group size is: 314.0 MB


 44%|████▎     | 111/254 [07:32<11:37,  4.88s/it]

After adding dataset: 47_2020_AUT_LPIS_POLY_110_part7 group size is: 338.7 MB


 44%|████▍     | 112/254 [07:37<11:38,  4.92s/it]

After adding dataset: 47_2020_FRA_LPIS_POLY_110_part1 group size is: 369.8 MB


 44%|████▍     | 113/254 [07:41<10:23,  4.42s/it]

After adding dataset: 47_2020_FRA_LPIS_POLY_110_part13 group size is: 418.7 MB


 45%|████▍     | 114/254 [07:43<08:48,  3.77s/it]

After adding dataset: 47_2020_FRA_LPIS_POLY_110_part18 group size is: 453.3 MB


 45%|████▌     | 115/254 [07:46<08:11,  3.54s/it]

After adding dataset: 47_2020_FRA_LPIS_POLY_110_part20 group size is: 478.0 MB


 46%|████▌     | 116/254 [07:51<09:08,  3.98s/it]

After adding dataset: 47_2021_AUT_LPIS_POLY_110_part5 group size is: 482.1 MB
After adding dataset: 47_2021_AUT_LPIS_POLY_110_part7 group size is: 525.3 MB
Writing group with size:  525.321588 MB to disk with  14 datasets


 46%|████▋     | 118/254 [08:23<20:30,  9.05s/it]

After adding dataset: 47_2021_AUT_LPIS_POLY_110_part8 group size is: 14.0 MB


 47%|████▋     | 119/254 [08:28<17:40,  7.86s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part1 group size is: 54.3 MB


 47%|████▋     | 120/254 [08:33<15:55,  7.13s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part2 group size is: 88.3 MB


 48%|████▊     | 121/254 [08:38<14:27,  6.53s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part3 group size is: 120.2 MB


 48%|████▊     | 122/254 [08:43<13:24,  6.09s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part4 group size is: 151.7 MB


 48%|████▊     | 123/254 [08:49<12:43,  5.83s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part5 group size is: 184.7 MB


 49%|████▉     | 124/254 [08:55<12:50,  5.93s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part6 group size is: 217.8 MB


 49%|████▉     | 125/254 [09:02<13:28,  6.27s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part7 group size is: 256.5 MB


 50%|████▉     | 126/254 [09:08<13:40,  6.41s/it]

After adding dataset: 48_2017_AUT_LPIS_POLY_110_part8 group size is: 290.2 MB


 50%|█████     | 127/254 [09:19<16:24,  7.76s/it]

After adding dataset: 48_2017_FRA_LPIS_POLY_110_part1 group size is: 370.5 MB


 50%|█████     | 128/254 [09:26<15:19,  7.30s/it]

After adding dataset: 48_2017_FRA_LPIS_POLY_110_part10 group size is: 421.6 MB
After adding dataset: 48_2017_FRA_LPIS_POLY_110_part18 group size is: 506.8 MB
Writing group with size:  506.817807 MB to disk with  12 datasets


 51%|█████     | 130/254 [09:57<22:02, 10.67s/it]

After adding dataset: 48_2017_FRA_LPIS_POLY_110_part28 group size is: 51.1 MB


 52%|█████▏    | 131/254 [10:02<18:08,  8.85s/it]

After adding dataset: 48_2017_FRA_LPIS_POLY_110_part5 group size is: 143.9 MB


 52%|█████▏    | 132/254 [10:06<14:43,  7.24s/it]

After adding dataset: 48_2017_FRA_LPIS_POLY_110_part8 group size is: 195.5 MB


 52%|█████▏    | 133/254 [10:11<13:28,  6.68s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part1 group size is: 227.3 MB


 53%|█████▎    | 134/254 [10:16<12:17,  6.15s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part2 group size is: 258.6 MB


 53%|█████▎    | 135/254 [10:26<14:25,  7.27s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part3 group size is: 306.5 MB


 54%|█████▎    | 136/254 [10:34<14:43,  7.49s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part4 group size is: 353.9 MB


 54%|█████▍    | 137/254 [10:40<14:06,  7.23s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part5 group size is: 406.2 MB


 54%|█████▍    | 138/254 [10:46<12:55,  6.69s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part6 group size is: 437.8 MB


 55%|█████▍    | 139/254 [10:51<12:11,  6.36s/it]

After adding dataset: 48_2018_AUT_LPIS_POLY_110_part7 group size is: 469.7 MB


 55%|█████▌    | 140/254 [10:52<08:40,  4.57s/it]

After adding dataset: 48_2018_EU_LUCAS_POINT_110 group size is: 475.1 MB
After adding dataset: 48_2018_FRA_LPIS_POLY_110_part1 group size is: 529.6 MB
Writing group with size:  529.6264679999999 MB to disk with  12 datasets


 56%|█████▌    | 142/254 [11:21<15:52,  8.51s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part11 group size is: 24.1 MB


 56%|█████▋    | 143/254 [11:25<13:32,  7.32s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part2 group size is: 75.7 MB


 57%|█████▋    | 144/254 [11:27<10:33,  5.76s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part22 group size is: 105.2 MB


 57%|█████▋    | 145/254 [11:31<09:09,  5.04s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part24 group size is: 177.2 MB


 57%|█████▋    | 146/254 [11:34<07:56,  4.41s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part30 group size is: 220.5 MB


 58%|█████▊    | 147/254 [11:37<07:33,  4.24s/it]

After adding dataset: 48_2018_FRA_LPIS_POLY_110_part9 group size is: 266.7 MB


 58%|█████▊    | 148/254 [11:42<07:44,  4.38s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part1 group size is: 297.5 MB


 59%|█████▊    | 149/254 [11:46<07:31,  4.30s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part2 group size is: 328.8 MB


 59%|█████▉    | 150/254 [11:53<08:46,  5.06s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part3 group size is: 373.5 MB


 59%|█████▉    | 151/254 [12:01<10:03,  5.86s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part4 group size is: 420.5 MB


 60%|█████▉    | 152/254 [12:08<10:22,  6.10s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part5 group size is: 465.9 MB


 60%|██████    | 153/254 [12:13<09:48,  5.82s/it]

After adding dataset: 48_2019_AUT_LPIS_POLY_110_part6 group size is: 496.9 MB
After adding dataset: 48_2019_AUT_LPIS_POLY_110_part7 group size is: 528.9 MB
Writing group with size:  528.874094 MB to disk with  13 datasets


 61%|██████    | 155/254 [12:53<19:55, 12.08s/it]

After adding dataset: 48_2019_FRA_LPIS_POLY_110_part1 group size is: 70.9 MB


 61%|██████▏   | 156/254 [12:57<15:49,  9.69s/it]

After adding dataset: 48_2019_FRA_LPIS_POLY_110_part12 group size is: 115.6 MB


 62%|██████▏   | 157/254 [13:00<12:38,  7.82s/it]

After adding dataset: 48_2019_FRA_LPIS_POLY_110_part21 group size is: 154.1 MB


 62%|██████▏   | 158/254 [13:04<10:26,  6.53s/it]

After adding dataset: 48_2019_FRA_LPIS_POLY_110_part3 group size is: 193.9 MB


 63%|██████▎   | 159/254 [13:09<09:54,  6.25s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part1 group size is: 225.2 MB


 63%|██████▎   | 160/254 [13:15<09:36,  6.13s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part2 group size is: 252.8 MB


 63%|██████▎   | 161/254 [13:24<10:29,  6.76s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part3 group size is: 298.2 MB


 64%|██████▍   | 162/254 [13:31<10:52,  7.09s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part4 group size is: 342.7 MB


 64%|██████▍   | 163/254 [13:37<10:15,  6.76s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part5 group size is: 371.2 MB


 65%|██████▍   | 164/254 [13:42<09:23,  6.26s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part6 group size is: 399.8 MB


 65%|██████▍   | 165/254 [13:48<08:49,  5.95s/it]

After adding dataset: 48_2020_AUT_LPIS_POLY_110_part8 group size is: 429.0 MB


 65%|██████▌   | 166/254 [13:51<07:37,  5.19s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part10 group size is: 469.9 MB
After adding dataset: 48_2020_FRA_LPIS_POLY_110_part14 group size is: 581.1 MB
Writing group with size:  581.149679 MB to disk with  13 datasets


 66%|██████▌   | 168/254 [14:31<16:09, 11.28s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part21 group size is: 45.5 MB


 67%|██████▋   | 169/254 [14:34<12:45,  9.01s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part22 group size is: 81.2 MB


 67%|██████▋   | 170/254 [14:39<10:51,  7.76s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part27 group size is: 113.2 MB


 67%|██████▋   | 171/254 [14:44<09:35,  6.93s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part6 group size is: 180.2 MB


 68%|██████▊   | 172/254 [14:55<11:14,  8.22s/it]

After adding dataset: 48_2020_FRA_LPIS_POLY_110_part8 group size is: 255.8 MB


 68%|██████▊   | 173/254 [14:58<08:49,  6.54s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part1 group size is: 319.4 MB


 69%|██████▊   | 174/254 [15:01<07:26,  5.58s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part2 group size is: 336.4 MB


 69%|██████▉   | 175/254 [15:04<06:19,  4.81s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part3 group size is: 369.6 MB


 69%|██████▉   | 176/254 [15:09<06:14,  4.80s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part4 group size is: 405.2 MB


 70%|██████▉   | 177/254 [15:15<06:23,  4.99s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part6 group size is: 430.9 MB


 70%|███████   | 178/254 [15:19<06:13,  4.91s/it]

After adding dataset: 48_2021_AUT_LPIS_POLY_110_part9 group size is: 452.5 MB
After adding dataset: 49_2017_FRA_LPIS_POLY_110_part11 group size is: 502.1 MB
Writing group with size:  502.07694599999996 MB to disk with  12 datasets


 71%|███████   | 180/254 [16:43<25:39, 20.80s/it]

After adding dataset: 49_2017_FRA_LPIS_POLY_110_part12 group size is: 31.6 MB


 71%|███████▏  | 181/254 [16:54<21:38, 17.79s/it]

After adding dataset: 49_2017_FRA_LPIS_POLY_110_part17 group size is: 118.0 MB


 72%|███████▏  | 182/254 [17:03<18:22, 15.31s/it]

After adding dataset: 49_2017_FRA_LPIS_POLY_110_part21 group size is: 153.9 MB


 72%|███████▏  | 183/254 [17:08<14:21, 12.13s/it]

After adding dataset: 49_2018_FRA_LPIS_POLY_110_part10 group size is: 189.6 MB


 72%|███████▏  | 184/254 [17:13<11:36,  9.96s/it]

After adding dataset: 49_2018_FRA_LPIS_POLY_110_part5 group size is: 274.1 MB


 73%|███████▎  | 185/254 [17:19<09:58,  8.67s/it]

After adding dataset: 49_2018_FRA_LPIS_POLY_110_part8 group size is: 355.5 MB


 73%|███████▎  | 186/254 [17:19<06:59,  6.17s/it]

After adding dataset: 49_2018_UKR_NHI-01_POINT_110 group size is: 356.6 MB


 74%|███████▎  | 187/254 [17:25<06:50,  6.12s/it]

After adding dataset: 49_2019_FRA_LPIS_POLY_110_part11 group size is: 440.0 MB
After adding dataset: 49_2019_FRA_LPIS_POLY_110_part14 group size is: 539.7 MB
Writing group with size:  539.717101 MB to disk with  9 datasets


 74%|███████▍  | 189/254 [18:00<11:35, 10.70s/it]

After adding dataset: 49_2019_FRA_LPIS_POLY_110_part25 group size is: 45.3 MB


 75%|███████▍  | 190/254 [18:07<10:04,  9.45s/it]

After adding dataset: 49_2019_FRA_LPIS_POLY_110_part5 group size is: 167.3 MB


 76%|███████▌  | 192/254 [18:10<05:35,  5.42s/it]

After adding dataset: 49_2019_FRA_LPIS_POLY_110_part7 group size is: 205.3 MB
After adding dataset: 49_2019_UKR_NHI-01_POINT_110 group size is: 206.4 MB


 76%|███████▌  | 193/254 [18:13<04:47,  4.72s/it]

After adding dataset: 49_2020_FRA_LPIS_POLY_110_part16 group size is: 271.6 MB


 76%|███████▋  | 194/254 [18:18<04:45,  4.77s/it]

After adding dataset: 49_2020_FRA_LPIS_POLY_110_part19 group size is: 388.3 MB


 77%|███████▋  | 195/254 [18:24<04:50,  4.93s/it]

After adding dataset: 49_2020_FRA_LPIS_POLY_110_part9 group size is: 436.7 MB


 77%|███████▋  | 196/254 [18:28<04:37,  4.79s/it]

After adding dataset: 49_2021_SVK_Eurocrops_POLY_110_part1 group size is: 453.4 MB


 78%|███████▊  | 197/254 [18:35<05:05,  5.36s/it]

After adding dataset: 49_2021_SVK_Eurocrops_POLY_110_part2 group size is: 471.9 MB
After adding dataset: 50_2017_FRA_LPIS_POLY_110_part24 group size is: 510.4 MB
Writing group with size:  510.38604900000007 MB to disk with  10 datasets


 78%|███████▊  | 199/254 [19:13<10:09, 11.09s/it]

After adding dataset: 50_2018_FRA_LPIS_POLY_110_part31 group size is: 72.4 MB


 79%|███████▊  | 200/254 [19:14<07:13,  8.02s/it]

After adding dataset: 50_2018_LUX_LPIS_POLY_110 group size is: 79.6 MB


 79%|███████▉  | 201/254 [19:18<05:52,  6.66s/it]

After adding dataset: 50_2019_FRA_LPIS_POLY_110_part16 group size is: 140.3 MB


 80%|███████▉  | 203/254 [19:19<02:59,  3.53s/it]

After adding dataset: 50_2019_LUX_LPIS_POLY_110 group size is: 147.3 MB
After adding dataset: 50_2019_UKR_JECAM-1_POLY_110 group size is: 147.5 MB


 80%|████████  | 204/254 [19:19<02:07,  2.56s/it]

After adding dataset: 50_2019_UKR_JECAM-2_POLY_100 group size is: 147.5 MB


 81%|████████  | 205/254 [19:20<01:40,  2.06s/it]

After adding dataset: 50_2020_LUX_LPIS_POLY_110 group size is: 154.3 MB


 81%|████████  | 206/254 [19:21<01:22,  1.73s/it]

After adding dataset: 50_2021_LUX_LPIS_POLY_110 group size is: 160.9 MB


 81%|████████▏ | 207/254 [19:30<03:02,  3.87s/it]

After adding dataset: 51_2017_BEL_LPIS-Flanders_POLY_110 group size is: 233.1 MB


 82%|████████▏ | 208/254 [19:38<03:56,  5.14s/it]

After adding dataset: 51_2018_BEL_LPIS-Flanders_POLY_110 group size is: 305.5 MB


 82%|████████▏ | 209/254 [19:46<04:33,  6.07s/it]

After adding dataset: 51_2019_BEL_LPIS-Flanders_POLY_110 group size is: 378.7 MB


 83%|████████▎ | 210/254 [19:52<04:20,  5.91s/it]

After adding dataset: 51_2020_BEL_LPIS-Flanders_POLY_110 group size is: 453.0 MB
After adding dataset: 51_2021_BEL_LPIS-Flanders_POLY_110 group size is: 526.0 MB
Writing group with size:  525.967342 MB to disk with  13 datasets


 83%|████████▎ | 212/254 [20:23<06:42,  9.59s/it]

After adding dataset: 51_2021_EUR_EXTRACROPS_POLY_110 group size is: 4.3 MB


 84%|████████▍ | 213/254 [20:32<06:21,  9.30s/it]

After adding dataset: 51_2022_BEL_LPIS-Flanders_POLY_110 group size is: 77.5 MB


 84%|████████▍ | 214/254 [20:40<05:59,  8.99s/it]

After adding dataset: 52_2018_NLD_LPIS_POLY_110_part1 group size is: 115.3 MB


 85%|████████▍ | 215/254 [20:45<05:05,  7.82s/it]

After adding dataset: 52_2018_NLD_LPIS_POLY_110_part2 group size is: 140.6 MB


 85%|████████▌ | 216/254 [20:52<04:46,  7.54s/it]

After adding dataset: 52_2019_NLD_LPIS_POLY_110_part1 group size is: 172.9 MB


 85%|████████▌ | 217/254 [20:59<04:38,  7.53s/it]

After adding dataset: 52_2019_NLD_LPIS_POLY_110_part2 group size is: 204.2 MB


 86%|████████▌ | 218/254 [21:07<04:30,  7.52s/it]

After adding dataset: 52_2020_NLD_LPIS_POLY_110_part1 group size is: 236.4 MB


 86%|████████▌ | 219/254 [21:13<04:13,  7.23s/it]

After adding dataset: 52_2020_NLD_LPIS_POLY_110_part2 group size is: 268.6 MB


 87%|████████▋ | 220/254 [21:23<04:25,  7.81s/it]

After adding dataset: 52_2021_DEU_Eurocrops-NRW_POLY_110_part1 group size is: 331.4 MB


 87%|████████▋ | 221/254 [21:29<04:02,  7.36s/it]

After adding dataset: 52_2021_DEU_Eurocrops-NRW_POLY_110_part2 group size is: 375.5 MB


 87%|████████▋ | 222/254 [21:36<03:55,  7.36s/it]

After adding dataset: 52_2021_NLD_LPIS_POLY_110_part1 group size is: 407.5 MB


 88%|████████▊ | 223/254 [21:44<03:50,  7.43s/it]

After adding dataset: 52_2021_NLD_LPIS_POLY_110_part2 group size is: 439.4 MB


 88%|████████▊ | 224/254 [21:50<03:27,  6.92s/it]

After adding dataset: 52_2022_NLD_LPIS_POLY_110_part1 group size is: 455.3 MB


 89%|████████▊ | 225/254 [21:55<03:07,  6.46s/it]

After adding dataset: 52_2022_NLD_LPIS_POLY_110_part2 group size is: 471.0 MB


 89%|████████▉ | 226/254 [22:02<03:04,  6.60s/it]

After adding dataset: 52_2022_NLD_LPIS_POLY_110_part3 group size is: 486.5 MB
After adding dataset: 52_2022_NLD_LPIS_POLY_110_part4 group size is: 501.9 MB
Writing group with size:  501.89452700000004 MB to disk with  16 datasets


 90%|████████▉ | 228/254 [22:45<05:47, 13.35s/it]

After adding dataset: 53_2021_DEU_Eurocrops-LS_POLY_110 group size is: 135.7 MB


 90%|█████████ | 229/254 [23:00<05:42, 13.71s/it]

After adding dataset: 55_2021_LTU_Eurocrops_POLY_110_part1 group size is: 205.3 MB


 91%|█████████ | 230/254 [23:07<04:45, 11.88s/it]

After adding dataset: 55_2021_LTU_Eurocrops_POLY_110_part2 group size is: 246.6 MB


 91%|█████████ | 231/254 [23:15<04:05, 10.68s/it]

After adding dataset: 55_2021_LTU_Eurocrops_POLY_110_part3 group size is: 287.1 MB


 91%|█████████▏| 232/254 [23:22<03:29,  9.52s/it]

After adding dataset: 56_2019_DNK_Eurocrops_POLY_110_part1 group size is: 334.4 MB


 92%|█████████▏| 233/254 [23:30<03:12,  9.14s/it]

After adding dataset: 56_2019_DNK_Eurocrops_POLY_110_part2 group size is: 381.6 MB


 92%|█████████▏| 234/254 [23:34<02:30,  7.52s/it]

After adding dataset: 57_2019_LVA_LPIS_POLY_110_part1 group size is: 397.5 MB


 93%|█████████▎| 235/254 [23:37<02:00,  6.36s/it]

After adding dataset: 57_2019_LVA_LPIS_POLY_110_part2 group size is: 414.2 MB


 93%|█████████▎| 236/254 [23:42<01:44,  5.80s/it]

After adding dataset: 57_2019_LVA_LPIS_POLY_110_part3 group size is: 429.9 MB


 93%|█████████▎| 237/254 [23:47<01:33,  5.48s/it]

After adding dataset: 57_2019_LVA_LPIS_POLY_110_part4 group size is: 445.5 MB


 94%|█████████▎| 238/254 [23:51<01:21,  5.08s/it]

After adding dataset: 57_2021_LVA_LPIS_POLY_110_part1 group size is: 457.1 MB


 94%|█████████▍| 239/254 [23:54<01:08,  4.59s/it]

After adding dataset: 57_2021_LVA_LPIS_POLY_110_part2 group size is: 472.3 MB


 94%|█████████▍| 240/254 [23:58<01:00,  4.33s/it]

After adding dataset: 57_2021_LVA_LPIS_POLY_110_part3 group size is: 487.1 MB
After adding dataset: 57_2021_LVA_LPIS_POLY_110_part4 group size is: 502.9 MB
Writing group with size:  502.8587 MB to disk with  14 datasets


 95%|█████████▌| 242/254 [24:52<02:47, 13.99s/it]

After adding dataset: 57_2021_SWE_Eurocrops_POLY_110_part4 group size is: 22.4 MB


 96%|█████████▌| 243/254 [24:56<02:02, 11.13s/it]

After adding dataset: 58_2021_SWE_Eurocrops_POLY_110_part2 group size is: 47.5 MB


 96%|█████████▌| 244/254 [24:59<01:25,  8.56s/it]

After adding dataset: 58_2021_SWE_Eurocrops_POLY_110_part3 group size is: 63.6 MB


 96%|█████████▋| 245/254 [25:04<01:07,  7.45s/it]

After adding dataset: 58_2021_SWE_Eurocrops_POLY_110_part5 group size is: 84.1 MB


 97%|█████████▋| 246/254 [25:08<00:50,  6.36s/it]

After adding dataset: 59_2021_EST_Eurocrops_POLY_110_part1 group size is: 96.6 MB


 97%|█████████▋| 247/254 [25:13<00:42,  6.07s/it]

After adding dataset: 59_2021_EST_Eurocrops_POLY_110_part2 group size is: 110.2 MB


 98%|█████████▊| 248/254 [25:17<00:33,  5.51s/it]

After adding dataset: 59_2021_SWE_Eurocrops_POLY_110_part1 group size is: 126.8 MB


 98%|█████████▊| 249/254 [25:25<00:30,  6.13s/it]

After adding dataset: 62_2020_FIN_LPIS_POLY_110_part1 group size is: 179.7 MB


 98%|█████████▊| 250/254 [25:32<00:26,  6.53s/it]

After adding dataset: 62_2020_FIN_LPIS_POLY_110_part2 group size is: 234.1 MB


 99%|█████████▉| 251/254 [25:38<00:18,  6.23s/it]

After adding dataset: 62_2020_FIN_LPIS_POLY_110_part3 group size is: 268.1 MB


 99%|█████████▉| 252/254 [25:47<00:14,  7.11s/it]

After adding dataset: 62_2021_FIN_LPIS_POLY_110_part1 group size is: 324.8 MB


100%|█████████▉| 253/254 [25:57<00:07,  7.94s/it]

After adding dataset: 62_2021_FIN_LPIS_POLY_110_part2 group size is: 377.6 MB
After adding dataset: 62_2021_FIN_LPIS_POLY_110_part3 group size is: 407.9 MB
Writing group with size:  407.855672 MB to disk with  13 datasets


100%|██████████| 254/254 [26:26<00:00,  6.25s/it]


In [None]:
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
from pathlib import Path

# Merge the grouped geoparquet files into a single sampling file.
#
# Each "*.geoparquet" file in `dataset_folder` is read, only the rows with
# `extract == 1` are kept, and the selections are concatenated and written to
# one parquet file.
dataset_folder = Path("/vitodata/worldcereal_data/EXTRACTIONS/all_datasets/grouped_datasets/")

# Use a sorted glob instead of iterdir(): iterdir() returns entries in
# arbitrary order (making the output row order non-reproducible) and also
# yields non-parquet entries that would crash read_parquet.
group_files = sorted(dataset_folder.glob("*.geoparquet"))
if not group_files:
    # Fail early with a clear message rather than pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .geoparquet files found in {dataset_folder}")

selected_frames = []
for dataset in tqdm(group_files):
    print(dataset.stem)
    gdf = gpd.read_parquet(dataset)
    print('dataset read')
    # Keep only the rows whose `extract` column equals 1.
    selected_frames.append(gdf[gdf.extract == 1])
    # Drop the reference to the full frame before reading the next file.
    del gdf

datasets = pd.concat(selected_frames, axis=0)
datasets.to_parquet('/data/users/Public/couchard/wc_phase2_samplings.parquet')