In [1]:
import re
from pathlib import Path

import numpy as np
import rasterio as rio
from psycopg import connect
from tqdm.notebook import tqdm

In [2]:
def load_env(env_path: str) -> dict:
    """Parse a dotenv-style file of ``KEY=VALUE`` lines into a dictionary.

    Blank lines, comment lines (first non-blank character is ``#``) and lines
    without an ``=`` separator are skipped instead of raising ``ValueError``.
    Only the first ``=`` splits a line, so values may themselves contain ``=``.
    Keys and values are otherwise kept verbatim: no quotes or surrounding
    whitespace are removed, only the trailing newline of the value.

    Args:
        env_path: Path of the file to read.

    Returns:
        Mapping of variable name to its string value.

    Raises:
        OSError: If the file cannot be opened.
    """
    env = {}
    with open(env_path) as f:
        for line in f:
            stripped = line.strip()
            # Nothing to parse on empty lines, comments, or lines that are not KEY=VALUE pairs.
            if not stripped or stripped.startswith("#") or "=" not in line:
                continue
            env_key, _val = line.split("=", 1)
            # A line read from a text file carries at most one trailing newline.
            env[env_key] = _val.rstrip("\n")
    return env


# Dotenv-style settings file, resolved relative to the notebook's working directory.
env_file = ".env"
# KEY=VALUE pairs parsed from that file.
env = load_env(env_path=env_file)

In [3]:
# Spot check on a single crop (abaca): load band 1 of the processed raster
# to see whether its values were properly rounded.
earthstat_base_path = Path("../../h3_data_importer/data/earthstat/")
abaca_processed_tif = earthstat_base_path / "earthstat2000_global_prod/earthstat2000_global_abaca_production.tif"
with rio.open(abaca_processed_tif) as r:
    arr = r.read(1)

In [4]:
# Value range of the processed raster: overall maximum, then the smallest strictly positive cell.
print(arr.max())
print(arr[arr > 0].min())

80.596924
1.04898646e-07


In [5]:
# Load band 1 of the raw (unprocessed) abaca production raster for comparison.
abaca_raw_tif = earthstat_base_path / "raw/HarvestedAreaYield175Crops_Geotiff/GeoTiff/abaca/abaca_Production.tif"
with rio.open(abaca_raw_tif) as r:
    arr_no_round = r.read(1)

In [6]:
# Same range summary for the raw raster: overall maximum, then the smallest strictly positive cell.
print(arr_no_round.max())
print(arr_no_round[arr_no_round > 0].min())

80.596924
1.04898646e-07


In [7]:
# The raw raster rounded to 4 decimals should reproduce the processed raster
# (assert_almost_equal compares to 7 decimals by default).
np.testing.assert_almost_equal(arr_no_round.round(4), arr)

AssertionError: 
Arrays are not almost equal to 7 decimals

Mismatched elements: 117107 / 9331200 (1.26%)
Max absolute difference: 5.340576e-05
Max relative difference: 1.
 x: array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],...
 y: array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],...

### Check that all earthstat rasters are rounded

In [74]:
# Folder that holds the raw EarthStat GeoTIFFs (one sub-directory per crop, e.g. "abaca/").
earthstat_unprocessed_base = earthstat_base_path / "raw/HarvestedAreaYield175Crops_Geotiff/GeoTiff"
# Every processed production raster, in ascending path order.
earthstat_rounded_rasters = sorted((earthstat_base_path / "earthstat2000_global_prod").glob("*.tif"))

In [75]:
# Gather every "*_Production.tif" that sits directly inside a sub-directory of
# the raw base folder, walking the sub-directories in ascending path order.
crop_dirs = [x for x in earthstat_unprocessed_base.iterdir() if x.is_dir()]
crop_dirs.sort()
earthstat_unprocessed_rasters = []
for p in crop_dirs:
    earthstat_unprocessed_rasters += p.glob("*_Production.tif")

In [80]:
# Show the first processed raster path to inspect the file naming scheme.
earthstat_rounded_rasters[0]

PosixPath('../../h3_data_importer/data/earthstat/earthstat2000_global_prod/earthstat2000_global_abaca_production.tif')

In [77]:
# Number of raw "*_Production.tif" rasters that were found.
len(earthstat_unprocessed_rasters)

172

In [78]:
# Number of processed rasters that were found.
len(earthstat_rounded_rasters)

182

The lengths don't match :( — let's find out which crops are present in one set but missing from the other.

In [94]:
# Crop names of the raw dataset, taken from the sub-directory names of the raw base folder.
unprocessed_names = [
    crop_dir.name
    for crop_dir in earthstat_unprocessed_base.iterdir()
    if crop_dir.is_dir()
]
len(unprocessed_names)

172

In [114]:
# Pull the crop name out of each processed raster's file name
# (the text between "earthstat2000_global_" and "_production").
patt = r"earthstat2000\_global\_(.*)\_production"
rounded_names = [re.match(patt, raster.name).group(1) for raster in earthstat_rounded_rasters]

In [115]:
# Number of crop names extracted from the processed raster file names.
len(rounded_names)

182

In [148]:
# Crops that have a processed raster but no sub-directory in the raw dataset.
rounded_names_to_remove = np.array(
    [crop for crop in rounded_names if crop not in unprocessed_names]
)

In [119]:
# Turn the crop names into a NumPy array in ascending order.
rounded_names = np.sort(np.array(rounded_names))

In [134]:
# Keep only the processed rasters whose crop also exists in the raw dataset.
earthstat_rounded_rasters = np.array(earthstat_rounded_rasters)
earthstat_rounded_rasters.sort()
# Derive each crop name from the raster it belongs to, so the boolean mask is
# aligned with `earthstat_rounded_rasters` by construction. Masking with the
# separately sorted `rounded_names` array only works while crop-name order and
# path order happen to coincide (and the cells were run in the right sequence).
raster_crop_names = [re.match(patt, raster.name).groups()[0] for raster in earthstat_rounded_rasters]
earthstat_rounded_rasters_clean = earthstat_rounded_rasters[~np.isin(raster_crop_names, rounded_names_to_remove)]
earthstat_rounded_rasters_clean.shape

(172,)

In [146]:
# Compare every processed raster with its raw counterpart rounded to 4 decimals.
#
# The files are paired by crop name instead of by position: zipping two
# independently sorted lists silently compares the wrong files (or silently
# truncates) as soon as their orders or lengths diverge. The crop name of a raw
# raster is the name of the sub-directory it lives in; for a processed raster it
# is the group captured by `patt`. A crop without a raw "*_Production.tif"
# raises a KeyError naming that crop.
unprocessed_by_crop = {raw_path.parent.name: raw_path for raw_path in earthstat_unprocessed_rasters}
raster_pairs = [
    (rounded_path, unprocessed_by_crop[re.match(patt, rounded_path.name).groups()[0]])
    for rounded_path in sorted(earthstat_rounded_rasters_clean.tolist())
]

pbar = tqdm(raster_pairs, total=len(raster_pairs))

for rounded, not_rounded in pbar:
    pbar.set_description(f"Comparing {rounded.name} and {not_rounded.name}")
    with rio.open(rounded) as r_rounded, rio.open(not_rounded) as r_not_rounded:
        arr_rounded = r_rounded.read()
        arr_not_rounded = r_not_rounded.read()
    # convert nans to 0s so that NaN cells compare as equal zeros in both rasters
    arr_rounded = np.where(np.isnan(arr_rounded), 0, arr_rounded)
    arr_not_rounded = np.where(np.isnan(arr_not_rounded), 0, arr_not_rounded)
    # NOTE(review): decimal=4 accepts |difference| < 1.5e-4, which is larger than
    # the <= 5e-5 change introduced by rounding to 4 decimals, so a raster that
    # was never rounded also passes. Tighten the tolerance if the goal is to
    # prove that rounding actually happened.
    np.testing.assert_almost_equal(
        np.round(arr_not_rounded, 4),
        arr_rounded,
        decimal=4,
        # Name the offending pair in the failure message.
        err_msg=f"{rounded.name} does not match {not_rounded.name} rounded to 4 decimals",
    )

  0%|          | 0/172 [00:00<?, ?it/s]

AssertionError: 
Arrays are not almost equal to 4 decimals

Mismatched elements: 78 / 9331200 (0.000836%)
Max absolute difference: 1.8796
Max relative difference: 0.
 x: array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],...
 y: array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],...