In [None]:
import pandas as pd

# Load only the 'Overview' sheet of the reference-data workbook.
# Passing sheet_name=None would parse every sheet into a dict just to keep one of them.
dataframes_df = pd.read_excel(
    '../.noteook-tests/Overview-reference-data-phase-1-20231222.xlsx',
    sheet_name='Overview',
)
dataframes_df

In [None]:
# Names (column 'Name_Ewoc_II') of the datasets whose 'Geoparquet Status' is 'ready',
# i.e. the ones to be extracted and uploaded to RDM.
to_process_dfs = dataframes_df.loc[
    dataframes_df['Geoparquet Status'].eq('ready'), 'Name_Ewoc_II'
].tolist()
len(to_process_dfs)

In [1]:
# We need to treat group0 separately, as extractions for it have already been done
# here, we get all datasets that are part of group0
import geopandas as gpd

# Location of the already-extracted group0 bundle.
group_0_path = "/vitodata/worldcereal_data/EXTRACTIONS/all_datasets/grouped_datasets/group_0.geoparquet"

# Read the bundle and collect the distinct ref_id values it contains.
group0_df = gpd.read_parquet(group_0_path)
group0_datasets = group0_df.loc[:, 'ref_id'].unique()

In [2]:
# Locate the dataset files in the RDM folder instead of taking them from the Excel file

import glob
from pathlib import Path

# Collect the harmonized geoparquet file(s) of every dataset selected for processing.
# NOTE(review): `rdm_dir` is not defined in any visible cell; it has to be a
# pathlib.Path set before this cell runs -- TODO confirm where it comes from.
dataset_paths = []
for dataset in to_process_dfs:
    harmonized_dir = rdm_dir / dataset / 'harmonized'
    part_dirs = [xx for xx in glob.glob(str(harmonized_dir / '*')) if Path(xx).is_dir()]

    if part_dirs:
        # Dataset is split into sub-folders: take every geoparquet found one level down.
        # Sorted so the processing order is reproducible (glob order is arbitrary).
        dataset_paths.extend(sorted(glob.glob(str(harmonized_dir / '*' / '*.geoparquet'))))
    else:
        # Single-file dataset: take the first geoparquet, or skip the dataset when
        # there is none (explicit check instead of a bare `except`).
        top_level_files = sorted(glob.glob(str(harmonized_dir / '*.geoparquet')))
        if top_level_files:
            dataset_paths.append(top_level_files[0])

# exclude group0 datasets
# NOTE(review): the comparison uses the full file stem. If part files carry a
# "_part..." suffix while group0 ref_ids do not, parts of group0 datasets are
# NOT excluded here -- confirm against the content of group_0.geoparquet.
group0_names = set(group0_datasets)
dataset_paths = [xx for xx in dataset_paths if Path(xx).stem not in group0_names]

# NOTE(review): this overwrites the list built from the Excel overview, so
# re-running this cell without re-running the earlier one gives different results.
to_process_dfs = [Path(xx).name.split('.')[0] for xx in dataset_paths]

In [3]:
from tqdm import tqdm
import gc

import warnings
# NOTE(review): this silences every warning from here on, including the
# warnings.warn() dtype notices issued inside clear_write_dataset below.
warnings.simplefilter('ignore')

# Open question: is this really needed as a separate run?
# The loop below is disabled, so `geometry_stats` is never defined by this cell;
# clear_write_dataset computes the median latitude itself.

# geometry_stats = {}
# for df_name, path in tqdm(zip(to_process_dfs, dataset_paths), total=len(to_process_dfs)):
#     df = gpd.read_parquet(path)

#     df["geometry"] = df.geometry.centroid
#     median_latitude = abs(df['geometry'].y).median()    
#     geometry_stats[df_name] = median_latitude

#     del df
#     gc.collect()

# geometry_stats


In [4]:
# Destination folder for the per-dataset geoparquet files written by clear_write_dataset
dest_folder = Path('/vitodata/worldcereal_data/EXTRACTIONS/all_datasets/all_datasets_updated_flag/')

def clear_write_dataset(ref_df, current_dataset_name, geometry_stats, dest_folder):
    """Validate a reference dataset and write it to ``dest_folder`` as geoparquet.

    The frame is tagged with a ``ref_id`` column, a stray ``" h3_l3_cell"``
    column is renamed, the required columns are checked (and cast when their
    dtype does not match), and the result is written to
    ``<dest_folder>/<LL>_<dataset name>.geoparquet`` where ``LL`` is the
    zero-padded, rounded median absolute latitude of the geometry centroids.

    Parameters
    ----------
    ref_df : GeoDataFrame-like
        Dataset to write. It is modified in place (``ref_id`` is added or
        overwritten, mismatching columns are cast).
    current_dataset_name : str
        Dataset name, usually the file stem. Everything from ``"_part"``
        onwards is dropped for ``ref_id``; the full name is kept for the
        output file name.
    geometry_stats : any
        Unused. Kept only so existing calls keep working.
    dest_folder : str or pathlib.Path
        Folder the output file is written to.

    Returns
    -------
    pathlib.Path or None
        Path of the written file, or ``None`` when a column could not be cast
        to its required dtype (nothing is written in that case).

    Raises
    ------
    ValueError
        If a required column is missing, or if no median latitude can be
        computed (e.g. the frame is empty).
    """
    # added to handle the case where the dataset is a part of a larger dataset:
    # rows are tagged with the parent name, the file keeps the full part name
    parted_dataset_name = current_dataset_name
    if "_part" in current_dataset_name:
        current_dataset_name = current_dataset_name.split("_part")[0]
    ref_df['ref_id'] = current_dataset_name

    if " h3_l3_cell" in ref_df.columns:
        print("Rename column h3_l3_cell for dataset: ", current_dataset_name)
        ref_df = ref_df.rename(columns={" h3_l3_cell": "h3_l3_cell"})

    required_columns_and_types = [
        ('ref_id', 'str'),
        ('sample_id', 'str'),
        ('h3_l3_cell', 'str'),
        ('valid_time', 'str'),
        ('extract', 'int32'),
        ('ewoc_code', 'int64')
    ]

    for column, dtype in required_columns_and_types:
        if column not in ref_df.columns:
            raise ValueError(f'Column {column} not found in {current_dataset_name}')

        actual_dtype = ref_df[column].dtype
        # Strings are commonly stored with dtype object, so accept that for 'str'.
        if actual_dtype == dtype or (dtype == "str" and actual_dtype == "O"):
            continue

        # Any other mismatch (including the integer columns) is reported and cast.
        warnings.warn(f'Column {column} has dtype {actual_dtype} but should be {dtype}')
        try:
            ref_df[column] = ref_df[column].astype(dtype)
        except Exception as exc:
            # Best effort: report the failing dataset and skip it instead of aborting the run.
            print(f"Exception when trying to convert column {column} from dataset {current_dataset_name} to {dtype}")
            print(f"Reason: {exc!r}")
            return None

    # moved getting median latitude to here instead of in the separate run above
    median_abs_latitude = abs(ref_df.geometry.centroid.y).median()
    if median_abs_latitude != median_abs_latitude:  # NaN, e.g. empty frame
        raise ValueError(f'Cannot compute the median latitude of {parted_dataset_name}: no valid geometries')
    median_latitude = str(int(round(median_abs_latitude))).zfill(2)

    dest_file = Path(dest_folder) / f'{median_latitude}_{parted_dataset_name}.geoparquet'
    ref_df.to_parquet(dest_file, index=False)
    return dest_file

In [None]:
# Read every selected dataset, validate it and write it to dest_folder.
# `geometry_stats` is not defined anywhere (its computation is commented out above)
# and clear_write_dataset does not use that argument, so None is passed instead
# of an undefined name.
for current_dataset_path, current_dataset_name in tqdm(zip(dataset_paths, to_process_dfs), total=len(dataset_paths)):
    if not Path(current_dataset_path).exists():
        raise FileNotFoundError(f'Couldn\'t find {current_dataset_name} in path: {current_dataset_path}')
    try:
        ref_df = gpd.read_parquet(current_dataset_path)
    except Exception as exc:
        # Keep the original error attached as the explicit cause.
        raise IOError(f'Failed to read {current_dataset_path}') from exc

    clear_write_dataset(ref_df, current_dataset_name, None, dest_folder)

    # Free the frame before loading the next dataset.
    del ref_df
    gc.collect()