In [None]:

import numpy as np
import matplotlib.pyplot as plt
import rasterio
from glob import glob
import pathlib
import boto3
import pandas as pd
import calendar
import seaborn as sns
import json
import re

In [None]:
# Year to validate. The summary CSV/JSON files written further down are named
# after this value.
vyear = 2022

# Local data directory; it must already hold the data for `vyear`.
data_dir = "data/"

In [None]:
# Where the transformed COGs live in S3.
dataset_name = "odiac-ffco2-monthgrid-v2023"
cog_data_bucket = "ghgc-data-store-develop"
cog_data_prefix = f"transformed_cogs/{dataset_name}"

# S3 client built from a boto3 session created with no explicit arguments.
session = boto3.session.Session()
s3_client = session.client("s3")

In [None]:
def get_all_s3_keys(bucket, model_name, ext, client=None):
    """Get a list of all matching keys under a prefix in an S3 bucket.

    Objects are listed page by page with ``list_objects_v2`` until the
    response no longer carries a ``NextContinuationToken``.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list.
    model_name : str
        Key prefix to list; a trailing "/" is appended before querying.
    ext : str
        Suffix a key must end with to be returned (e.g. ".tif").
    client : optional
        Object exposing ``list_objects_v2(**kwargs)``. Defaults to the
        module-level ``s3_client``.

    Returns
    -------
    list of str
        Keys that end with ``ext`` and do not contain "historical", in the
        order the listing returned them. Empty when nothing matches.
    """
    if client is None:
        client = s3_client

    keys = []
    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = client.list_objects_v2(**kwargs)
        # "Contents" is missing from the response when no object matches the
        # prefix, so fall back to an empty page instead of raising KeyError.
        for obj in resp.get("Contents", []):
            key = obj["Key"]
            if key.endswith(ext) and "historical" not in key:
                keys.append(key)

        token = resp.get("NextContinuationToken")
        if token is None:
            break
        kwargs["ContinuationToken"] = token

    return keys

# List every ".tif" object stored under the dataset prefix.
keys = get_all_s3_keys(cog_data_bucket, cog_data_prefix, ".tif")

# Keep only the keys that contain the selected year immediately followed by a
# two-digit month (01-12).
pattern = re.compile(rf'{vyear}(0[1-9]|1[0-2])')
keys = list(filter(pattern.search, keys))

In [None]:
# Containers filled by the cells below.
# Per-file statistics, one entry per processed file.
summary_dict_netcdf = dict()
summary_dict_cog = dict()

# Statistics over all files of the selected year.
overall_stats_netcdf = dict()
overall_stats_cog = dict()

# Raster values of every processed file, stacked row-wise.
full_data_df_netcdf = pd.DataFrame()
full_data_df_cog = pd.DataFrame()

In [None]:
# Process the COGs to get the statistics.
#
# Each band is kept as its own DataFrame and everything is concatenated once
# after the loop. This avoids the private `DataFrame._append` (quadratic when
# called per file) and makes the cell idempotent: re-running it rebuilds
# `full_data_df_cog` instead of appending the same rasters a second time.
cog_frames = []
for key in keys:
    url = f"s3://{cog_data_bucket}/{key}"

    # Split the whole URL on "_", " ", "?" and ".". The indices used below
    # assume the key looks like "<prefix with one '_'>/<a>_<b>_<c>_<d>_<YYYYMM>.tif",
    # so that element 5 is the year+month stamp -- TODO confirm if the bucket
    # or prefix naming changes.
    filename_elements = re.split("[_ ? . ]", url)
    file_label = "_".join(filename_elements[1:6])
    year_month = filename_elements[5]
    # Key under which this file's statistics are stored:
    # "<elements 1-4>_<year>_<month name>".
    summary_key = (
        f'{"_".join(filename_elements[1:5])}_{year_month[:4]}'
        f'_{calendar.month_name[int(year_month[4:])]}'
    )

    with rasterio.open(url) as src:
        for band in src.indexes:
            print(file_label)
            # One index entry per raster row: (file label, year+month, row number).
            idx = pd.MultiIndex.from_product(
                [
                    [file_label],
                    [year_month],
                    np.arange(1, src.height + 1),
                ]
            )
            raster_data = src.read(band)
            raster_data[raster_data == -9999] = 0 # because we did that in the transformation script
            temp = pd.DataFrame(index=idx, data=raster_data)
            cog_frames.append(temp)

            # Calculate summary statistics
            band_values = temp.values
            summary_dict_cog[summary_key] = {
                "min_value": np.float64(band_values.min()),
                "max_value": np.float64(band_values.max()),
                "mean_value": np.float64(band_values.mean()),
                "std_value": np.float64(band_values.std()),
            }

# pd.concat raises on an empty list, so fall back to an empty frame when no
# key matched the selected year.
full_data_df_cog = pd.concat(cog_frames) if cog_frames else pd.DataFrame()


In [None]:
# Process the raw files for selected year to get the statistics.
#
# Frames are collected in a list and concatenated once after the loop, which
# avoids the private `DataFrame._append` (quadratic when called per file) and
# keeps the cell idempotent when it is re-run.
raw_frames = []

# Build the pattern with pathlib so `data_dir` works with or without a
# trailing slash. `recursive=True` was dropped: it only affects "**" patterns.
tif_pattern = str(pathlib.Path(data_dir) / str(vyear) / "*.tif")

# glob returns files in arbitrary order; sort for reproducible output.
tif_files = sorted(glob(tif_pattern))
for tif_file in tif_files:
    tif_name = pathlib.Path(tif_file).name
    file_name = tif_name[:-4]  # file name without the ".tif" extension
    print(file_name)

    # The file name is sliced by position: the last two characters before the
    # extension are used as the month number; the two before them are
    # presumably the two-digit year -- TODO confirm against the raw naming.
    dataset_label = tif_name[:-9]
    date_stamp = tif_name[-8:-4]
    summary_key = f'{dataset_label}_{calendar.month_name[int(tif_name[-6:-4])]}'

    with rasterio.open(tif_file) as src:
        for band in src.indexes:
            # One index entry per raster row: (dataset label, date stamp, row number).
            idx = pd.MultiIndex.from_product(
                [
                    [dataset_label],
                    [date_stamp],
                    np.arange(1, src.height + 1),
                ]
            )
            # Read the raster data. Values are used exactly as stored; -9999
            # is not remapped here, unlike in the COG processing cell.
            raster_data = src.read(band)
            temp = pd.DataFrame(index=idx, data=raster_data)
            raw_frames.append(temp)

            # Calculate summary statistics
            band_values = temp.values
            summary_dict_netcdf[summary_key] = {
                "min_value": np.float64(band_values.min()),
                "max_value": np.float64(band_values.max()),
                "mean_value": np.float64(band_values.mean()),
                "std_value": np.float64(band_values.std()),
            }

# pd.concat raises on an empty list, so fall back to an empty frame when the
# data directory holds no matching file.
full_data_df_netcdf = pd.concat(raw_frames) if raw_frames else pd.DataFrame()
            

In [None]:
# Put the monthly COG and raw statistics side by side and save them as CSV.
# Each summary dictionary becomes a frame with one row per file; the file
# label ends up in the "index" column.
cog_df = pd.DataFrame(summary_dict_cog).T.reset_index()
raw_df = pd.DataFrame(summary_dict_netcdf).T.reset_index()


def _cog_date_label(label):
    """Join the last and second-to-last "_"-separated fields of a COG label."""
    parts = label.split("_")
    return parts[-1] + parts[-2]


def _raw_date_label(label):
    """Join the last "_"-separated field of a raw label with the selected year."""
    return label.split("_")[-1] + str(vyear)


# Common join key for both frames.
cog_df["date"] = cog_df["index"].apply(_cog_date_label)
raw_df["date"] = raw_df["index"].apply(_raw_date_label)

# Raw statistics get a "_raw" suffix; COG columns keep their names.
raw_columns = ["min_value", "max_value", "mean_value", "std_value", "date"]
check_df = cog_df.merge(
    raw_df[raw_columns],
    how="inner",
    on="date",
    suffixes=("", "_raw"),
)
check_df.to_csv(f"monthly_stats_{vyear}.csv")

In [None]:
# Yearly statistics over every stacked raster value, raw files first and then
# the COGs. Each stats dictionary is filled in place.
for stats, frame in (
    (overall_stats_netcdf, full_data_df_netcdf),
    (overall_stats_cog, full_data_df_cog),
):
    all_values = frame.values
    stats["min_value"] = np.float64(all_values.min())
    stats["max_value"] = np.float64(all_values.max())
    stats["mean_value"] = np.float64(all_values.mean())
    stats["std_value"] = np.float64(all_values.std())

In [None]:

# Output file paired with the payload to store in it: monthly statistics
# first, then the statistics for the whole year.
reports = (
    (
        f"monthly_stats_{vyear}.json",
        {
            "Stats for raw netCDF files.": summary_dict_netcdf,
            "Stats for transformed COG files.": summary_dict_cog,
        },
    ),
    (
        f"overall_stats_{vyear}.json",
        {
            "Stats for raw netCDF files.": overall_stats_netcdf,
            "Stats for transformed COG files.": overall_stats_cog,
        },
    ),
)

# Writing to JSON files
for report_path, data in reports:
    with open(report_path, "w") as fp:
        json.dump(data, fp, indent=4)