In [1]:
import xarray as xr
import pandas as pd
from pathlib import Path

# --- Configuration ---------------------------------------------------------
data_dir = "./GFED5/monthly/"  # directory scanned for *.nc files
start_date = "2018-04"         # first month of interest, "YYYY-MM"
end_date = "2020-12"           # last month of interest, "YYYY-MM"
aggregation = "mean"           # aggregation method to use over time

# Convert dates to datetime so their .year can be compared below.
start_date = pd.to_datetime(start_date, format="%Y-%m")
end_date = pd.to_datetime(end_date, format="%Y-%m")

# List all NetCDF files in the directory (sorted for a deterministic order).
all_files = sorted(Path(data_dir).glob("*.nc"))

# Filter files by date range.
# Only the 4-digit year at the end of each file stem is compared, so this
# selects whole files by year; the month parts of start_date/end_date are
# not applied at this stage.
filtered_files = []
for file in all_files:
    year_suffix = file.stem[-4:]
    # Skip files whose stem does not end in a 4-digit year instead of
    # letting int() abort the whole cell with an unrelated ValueError.
    if len(year_suffix) != 4 or not year_suffix.isdecimal():
        continue
    if start_date.year <= int(year_suffix) <= end_date.year:
        filtered_files.append(str(file))

if not filtered_files:
    raise ValueError("No files match the specified date range.")

print(f"Found {len(filtered_files)} files in the date range.")

Found 3 files in the date range.


In [2]:
import xarray as xr
import pandas as pd
from pathlib import Path

data_dir = "./GFED5/monthly/"
start_date = pd.to_datetime("2018-04", format="%Y-%m")
end_date = pd.to_datetime("2020-12", format="%Y-%m")
aggregation = "mean"

# Inclusive year bounds used to pick files.
year_start = start_date.year
year_end = end_date.year


def year_from_stem(path):
    """Return the 4-digit year that ends the file stem of ``path``.

    Parameters
    ----------
    path : str or pathlib.Path
        File path whose stem (name without the final suffix) is inspected.

    Returns
    -------
    int or None
        The trailing year as an integer, or ``None`` when the last four
        characters of the stem are not all decimal digits (or the stem is
        shorter than four characters).
    """
    suffix = Path(path).stem[-4:]
    if len(suffix) == 4 and suffix.isdecimal():
        return int(suffix)
    return None


all_files = sorted(Path(data_dir).glob("*.nc"))

# Filter files by date range.
# Selection is by year only; files without a trailing year are ignored
# rather than crashing the int() conversion.
filtered_files = []
for file in all_files:
    file_year = year_from_stem(file)
    if file_year is not None and year_start <= file_year <= year_end:
        filtered_files.append(str(file))

filtered_files

['GFED5/monthly/GFED5_Beta_monthly_2018.nc',
 'GFED5/monthly/GFED5_Beta_monthly_2019.nc',
 'GFED5/monthly/GFED5_Beta_monthly_2020.nc']

In [3]:
# Open the dataset and select only one data variable
var = "C"
ds = xr.open_mfdataset(filtered_files)[var]

# The files were selected by year only, so restrict the time axis to the
# configured months before aggregating; otherwise months before start_date
# (or after end_date) in the boundary years leak into the result.
# NOTE(review): assumes the `time` coordinate is decoded to datetimes so
# that partial "YYYY-MM" strings can be used for slicing — confirm.
period_start = pd.Timestamp(start_date).strftime("%Y-%m")
period_end = pd.Timestamp(end_date).strftime("%Y-%m")
ds = ds.sel(time=slice(period_start, period_end))
if ds.sizes["time"] == 0:
    raise ValueError("No time steps fall inside the specified date range.")

# Aggregate the data by the specified method (e.g. "mean" -> ds.mean).
ds2 = getattr(ds, aggregation)(dim="time")
ds2

Unnamed: 0,Array,Chunk
Bytes,3.96 MiB,3.96 MiB
Shape,"(720, 1440)","(720, 1440)"
Dask graph,1 chunks in 11 graph layers,1 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.96 MiB 3.96 MiB Shape (720, 1440) (720, 1440) Dask graph 1 chunks in 11 graph layers Data type float32 numpy.ndarray",1440  720,

Unnamed: 0,Array,Chunk
Bytes,3.96 MiB,3.96 MiB
Shape,"(720, 1440)","(720, 1440)"
Dask graph,1 chunks in 11 graph layers,1 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [4]:
# Same pipeline as for "C", applied to a second variable.
var = "C6H6"
ds = xr.open_mfdataset(filtered_files)[var]

# The files were selected by year only, so restrict the time axis to the
# configured months before aggregating.
# NOTE(review): assumes the `time` coordinate is decoded to datetimes so
# that partial "YYYY-MM" strings can be used for slicing — confirm.
period_start = pd.Timestamp(start_date).strftime("%Y-%m")
period_end = pd.Timestamp(end_date).strftime("%Y-%m")
ds = ds.sel(time=slice(period_start, period_end))
if ds.sizes["time"] == 0:
    raise ValueError("No time steps fall inside the specified date range.")

# Aggregate the data by the specified method (e.g. "mean" -> ds.mean).
ds3 = getattr(ds, aggregation)(dim="time")

# compare ds2 and ds3
ds2.equals(ds3)

False

In [20]:
from dask.dataframe import from_pandas

# measure time
import time

import cudf

# NOTE(review): `carbon_mean` is not assigned in any cell of this notebook,
# so this cell fails with NameError on a fresh kernel. It is bound here to
# `ds2` (the time-aggregated "C" variable), which matches the "C" column the
# plotting cells read from `data` — confirm this is the intended array.
carbon_mean = ds2

# Earlier Dask-based variant, kept for reference:
#   data = from_pandas(carbon_mean.to_dataframe().reset_index(), npartitions=1000).compute(schedule="threads")

# redo with cudf
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed intervals. The timed region covers the xarray -> pandas conversion
# as well as the pandas -> cudf copy (presumably including evaluation of the
# lazy array, if `carbon_mean` is still Dask-backed).
start = time.perf_counter()
data = cudf.DataFrame.from_pandas(carbon_mean.to_dataframe().reset_index())
end = time.perf_counter()

# other ideas?
#  - use dask_cudf

print("Time taken: ", end-start)

Time taken:  0.9660913944244385


In [13]:
import cudf
import dask_cudf

# NOTE(review): `carbon_mean` is not assigned in any cell of this notebook;
# it is bound here to `ds2` (the time-aggregated "C" variable) — confirm
# this is the intended array.
carbon_mean = ds2

# convert carbon_mean to dask_cudf
# dask_cudf.from_cudf expects a cudf object, while to_dataframe() returns a
# pandas DataFrame, so move the table to the GPU first and partition it after.
carbon_mean_gdf = cudf.DataFrame.from_pandas(carbon_mean.to_dataframe().reset_index())
carbon_mean_dask = dask_cudf.from_cudf(carbon_mean_gdf, chunksize=10000)

In [None]:
# Display the collection built by dask_cudf.from_cudf in the previous cell.
carbon_mean_dask

In [None]:
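# Timings noted from earlier runs. NOTE(review): presumably seconds, in the
# same "Time taken: " format printed by the timing cell above; which variant
# was measured is not recorded — confirm or remove.
# Time taken:  21.169448614120483
# Time taken:  18.97800064086914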

In [None]:
import pydeck as pdk
# heatmap

# `data` is created as a cudf DataFrame earlier in the notebook, while pydeck
# only knows how to serialise pandas DataFrames (or plain lists of records).
# Copy it back to host memory when it offers to_pandas(); leave it untouched
# otherwise.
heatmap_df = data.to_pandas() if hasattr(data, "to_pandas") else data

# Rows whose weight is missing or zero contribute nothing to a weighted
# heatmap, so drop them to keep the exported HTML small (NaN > 0 is False).
heatmap_df = heatmap_df[heatmap_df["C"] > 0]

layer = pdk.Layer(
    "HeatmapLayer",
    heatmap_df,
    get_position=["lon", "lat"],  # [longitude, latitude] column names
    get_weight="C",               # column used as the per-point weight
)

# Whole-world view, slightly tilted and rotated.
view_state = pdk.ViewState(
    longitude=0,
    latitude=0,
    zoom=1,
    min_zoom=0,
    max_zoom=15,
    pitch=40.5,
    bearing=-27.396674584323023,
)

r = pdk.Deck(layers=[layer], initial_view_state=view_state)

# Write a standalone HTML file instead of rendering inline in the notebook.
r.to_html("heatmap.html", notebook_display=False)

In [None]:
# `data` is created as a cudf DataFrame earlier in the notebook, while pydeck
# only knows how to serialise pandas DataFrames (or plain lists of records).
# Copy it back to host memory when it offers to_pandas(); leave it untouched
# otherwise.
hexagon_df = data.to_pandas() if hasattr(data, "to_pandas") else data

# Drop rows with missing or zero "C": they would only produce flat,
# empty-valued bins and inflate the exported HTML (NaN > 0 is False).
hexagon_df = hexagon_df[hexagon_df["C"] > 0]

layer = pdk.Layer(
    "HexagonLayer",
    hexagon_df,
    get_position=["lon", "lat"],  # [longitude, latitude] column names
    # Without weight accessors a HexagonLayer aggregates by point count,
    # which never looks at the "C" column at all. Weight both the bar
    # height and the colour by "C" so the map reflects the data values.
    get_elevation_weight="C",
    get_color_weight="C",
    auto_highlight=True,
    elevation_scale=50,
    pickable=True,
    extruded=True,
    coverage=1,
    # NOTE(review): deck.gl interprets radius in meters; 1000 m bins may be
    # much finer than the spacing of the input points — confirm the value.
    radius=1000
)

# Whole-world view, slightly tilted and rotated.
view_state = pdk.ViewState(
    longitude=0,
    latitude=0,
    zoom=1,
    min_zoom=0,
    max_zoom=15,
    pitch=40.5,
    bearing=-27.396674584323023,
)

r = pdk.Deck(layers=[layer], initial_view_state=view_state)

# Write a standalone HTML file instead of rendering inline in the notebook.
r.to_html("hexagon.html", notebook_display=False)