# GRUANpy Proof of Concept

GRUANpy is a Python toolkit that facilitates the analysis of GRUAN data. It includes several functionalities and is easily extensible: its structure is based on data models and helper classes that provide specialized methods for different purposes. The individual functions can also be executed in succession to build data pipelines that produce a wide variety of outputs.

Here is a list of the main features identified so far:
- download, the ability to download data of interest
- merge, the ability to merge data from different products
- aggregation, the ability to aggregate different observations and lower the resolution of the data (respecting the GRUAN principles in uncertainty processing)

In [1]:
# Demo setup
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from gruanpy import GRUANpy
from datetime import datetime
# Local folder used throughout the demo: downloaded GDP files are read back
# from it and the CDS API responses are written into it.
download_folder=r"gdp_demo_090625"
# Toolkit entry point used by every cell below; it receives the download
# folder at construction time.
gp = GRUANpy(download_folder=download_folder)
%matplotlib qt


## DOWNLOAD

In [2]:
# Download GRUAN Data Products (GDP) through the NOAA FTP
ftp_dir_path=r'pub/data/gruan/processing/level2/RS92-GDP/version-002/LIN/2018'
files=gp.search(ftp_dir_path=ftp_dir_path)
print(f"Found {len(files)} files in {ftp_dir_path}")
# Preview only the first few names to keep the cell output short
for file in files[:5]:
    print(file)
print("-----"*10)
print("Downloading first 2 files for demo purpose")
files=files[:2]
for file in files:
    gp.download(ftp_dir_path=ftp_dir_path, file_name=file)
print("Download completed.")
print("-----"*10)
for file in files:
    # Build the local path with os.path.join rather than slicing a string
    # literal to obtain the separator.
    gdp=gp.read(os.path.join(download_folder, file))
    print(gdp.global_attrs.head())
    print("-----"*10)

# Iterative script at code_examples\download_gdp.py

Found 75 files in pub/data/gruan/processing/level2/RS92-GDP/version-002/LIN/2018
LIN-RS-01_2_RS92-GDP_002_20180611T180000_1-002-001.nc
LIN-RS-01_2_RS92-GDP_002_20180103T000000_1-002-002.nc
LIN-RS-01_2_RS92-GDP_002_20180612T002400_1-000-002.nc
LIN-RS-01_2_RS92-GDP_002_20180122T120000_1-002-001.nc
LIN-RS-01_2_RS92-GDP_002_20180613T180000_1-002-002.nc
--------------------------------------------------
Downloading first 2 files for demo purpose
Download completed.
--------------------------------------------------
     Attribute                                              Value
0  Conventions                                             CF-1.4
1        title                RS92 GRUAN Data Product (Version 2)
2  institution  MOL - Lindenberg Meteorological Observatory; D...
3       source                                           RS92-SGP
4      history  2018-06-11 21:30:51.000Z RS92-GDP: RS92 GRUAN ...
--------------------------------------------------
     Attribute                       

In [3]:
# Download GRUAN data through the CDS API in CSV format

api_response_file = "api_response.csv"
# Build the output path with os.path.join so the separator matches the host
# OS; a hard-coded backslash only denotes a sub-folder on Windows.
target_path = os.path.join(download_folder, api_response_file)
# The request is kept as Python source text and handed to gp.exec_request
# (presumably executed there -- confirm against the GRUANpy implementation).
api_request = """
import cdsapi

dataset = "insitu-observations-gruan-reference-network"
request = {
    "variable": [
        "air_temperature",
        "relative_humidity",
        "relative_humidity_effective_vertical_resolution",
        "wind_speed",
        "wind_from_direction",
        "eastward_wind_speed",
        "northward_wind_speed",
        "shortwave_radiation",
        "air_pressure",
        "altitude",
        "geopotential_height",
        "frost_point_temperature",
        "water_vapour_volume_mixing_ratio",
        "vertical_speed_of_radiosonde",
        "time_since_launch"
    ],
    "year": "2014",
    "month": "10",
    "day": ["14"],
    "data_format": "csv",
    "area": [90, 0, 0, 90]
}

client = cdsapi.Client()
"""+f"""
target= {target_path!r} # Change this to your desired output path
client.retrieve(dataset, request, target)

"""
print("Executing API request ...")
gp.exec_request(api_request)
print("-----"*10)


Executing API request ...


2025-06-09 09:40:04,409 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-06-09 09:40:04,826 INFO Request ID is c8fa3021-8d00-4a0c-96ea-f3bb487f7ab7
2025-06-09 09:40:04,933 INFO status has been updated to accepted
2025-06-09 09:40:19,556 INFO status has been updated to running
2025-06-09 09:41:21,705 INFO status has been updated to successful
                                                                                          

--------------------------------------------------




In [4]:
# Download GRUAN data through the CDS API in netCDF format

api_response_file = "api_response.nc"
# Build the output path with os.path.join so the separator matches the host
# OS; a hard-coded backslash only denotes a sub-folder on Windows.
target_path = os.path.join(download_folder, api_response_file)
# The request is kept as Python source text and handed to gp.exec_request
# (presumably executed there -- confirm against the GRUANpy implementation).
api_request = """
import cdsapi

dataset = "insitu-observations-gruan-reference-network"
request = {
    "variable": [
        "air_temperature",
        "relative_humidity",
        "relative_humidity_effective_vertical_resolution",
        "wind_speed",
        "wind_from_direction",
        "eastward_wind_speed",
        "northward_wind_speed",
        "shortwave_radiation",
        "air_pressure",
        "altitude",
        "geopotential_height",
        "frost_point_temperature",
        "water_vapour_volume_mixing_ratio",
        "vertical_speed_of_radiosonde",
        "time_since_launch"
    ],
    "year": "2014",
    "month": "10",
    "day": ["14"],
    "data_format": "netcdf",
    "area": [90, 0, 0, 90]
}

client = cdsapi.Client()
"""+f"""
target= {target_path!r} # Change this to your desired output path
client.retrieve(dataset, request, target)

"""
print("Executing API request ...")
gp.exec_request(api_request)
print("-----"*10)

2025-06-09 09:42:19,946 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.


Executing API request ...


2025-06-09 09:42:20,308 INFO Request ID is d183ea59-8790-4387-a504-72fdcf6f9632
2025-06-09 09:42:20,513 INFO status has been updated to accepted
2025-06-09 09:42:42,525 INFO status has been updated to running
2025-06-09 09:43:11,198 INFO status has been updated to successful
                                                                                         

--------------------------------------------------




In [6]:
import cdsapi

# Direct use of the cdsapi client, without going through GRUANpy: request
# several days of January 2020 in netCDF format.
dataset = "insitu-observations-gruan-reference-network"
request = dict(
    variable=[
        "air_temperature",
        "relative_humidity",
        "relative_humidity_effective_vertical_resolution",
        "wind_speed",
        "wind_from_direction",
        "eastward_wind_speed",
        "northward_wind_speed",
        "shortwave_radiation",
        "air_pressure",
        "altitude",
        "geopotential_height",
        "frost_point_temperature",
        "time_since_launch",
        "vertical_speed_of_radiosonde",
        "water_vapour_volume_mixing_ratio",
    ],
    year="2020",
    month="01",
    day="03 10 15 17 22 23 24 28 29 31".split(),
    data_format="netcdf",
)

client = cdsapi.Client()
# No target path is passed; download() is called on the retrieve result and,
# being the last expression of the cell, its return value is displayed.
client.retrieve(dataset, request).download()


2025-06-09 14:40:19,478 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-06-09 14:40:19,793 INFO Request ID is 2e65ec27-4bc5-41f6-8038-d09b6dc74491
2025-06-09 14:40:19,885 INFO status has been updated to accepted
2025-06-09 14:40:30,056 INFO status has been updated to running
2025-06-09 14:40:42,842 INFO status has been updated to successful
                                                                                         

'eb554bddba46166f9583f056b31a794b.nc'

## MERGE and AGGREGATE

In [10]:
# GDP RS41 spatial gridding
# NOTE: machine-specific absolute path -- point it at a local RS41-GDP file.
file_path = r'C:\Users\tomma\Documents\SDC\Repos\GRUAN_EDA\gdp\data_examples\LIN-RS-01_2_RS41-GDP_001_20141209T120000_1-009-002.nc'
gdp=gp.read(file_path)
bin_column = 'press'
target_columns = ['temp', 'rh']
ggd = gp.spatial_gridding(gdp, bin_column, target_columns)

# Attribute lookups are pulled out of the f-strings: reusing the enclosing
# quote character inside a replacement field only parses on Python >= 3.12.
var_attrs = gdp.variables_attrs
bin_units = var_attrs[var_attrs['variable'] == bin_column]['units'].values[0]

# Plot original and gridded data
for column in target_columns:
    column_attrs = var_attrs[var_attrs['variable'] == column]
    column_units = column_attrs['units'].values[0]
    long_name = column_attrs['long_name'].values[0]
    uc_column = column + '_uc'

    fig, ax1 = plt.subplots(figsize=(7, 6))
    if bin_column == 'press':
        # Log-scaled, inverted pressure axis
        ax1.set_yscale('log')
        ax1.invert_yaxis()
    ax1.scatter(gdp.data[column], gdp.data[bin_column], label='Original Data', alpha=0.5)
    ax1.scatter(ggd.data[column], ggd.data[bin_column], label='Gridded Data', color='red', alpha=0.5)
    # Shaded bands: value -/+ its '<column>_uc' uncertainty
    ax1.fill_betweenx(
        gdp.data[bin_column],
        gdp.data[column] - gdp.data[uc_column],
        gdp.data[column] + gdp.data[uc_column],
        color='blue', alpha=0.2, label='Original Uncertainty')
    ax1.fill_betweenx(
        ggd.data[bin_column],
        ggd.data[column] - ggd.data[uc_column],
        ggd.data[column] + ggd.data[uc_column],
        color='red', alpha=0.2, label='Gridded Uncertainty')
    ax1.set_xlabel(f'{column.capitalize()} {column_units}')
    ax1.set_ylabel(f'{bin_column.capitalize()} {bin_units}')
    ax1.legend()
    ax1.set_title(f'{long_name} Spatial Gridding at Mandatory Levels')
    plt.tight_layout()
    plt.show()

In [2]:
# GDP RS41 temporal gridding

# Read multiple GDP files
# NOTE: machine-specific absolute path -- point it at a local folder of GDP files.
gdp_folder=r'C:\Users\tomma\Documents\SDC\Repos\GRUAN_EDA\gdp\products_RS41-GDP-1_LIN_2017'
max_files = 500  # upper bound on the number of files read for the demo
# os.listdir returns entries in arbitrary order; sorting makes the selected
# subset reproducible (and presumably chronological, if the file names embed
# the launch timestamp as in the examples shown earlier -- confirm).
gdp_files = sorted(os.path.join(gdp_folder, f) for f in os.listdir(gdp_folder) if f.endswith('.nc'))
gdps=[]
for file in tqdm(gdp_files[:max_files] , desc="Reading GDPs"):
    gdps.append(gp.read(file))

# Uniform spatial gridding across multiple GDPs
ggds=[]
target_columns = ['temp', 'rh']
# Plain loop on purpose: `gdp` stays bound at notebook level after the loop
# and is used by later cells to look up variable units.
for gdp in tqdm(gdps, desc="Spatial Gridding"):
    ggd = gp.spatial_gridding(gdp, 'press', target_columns, mandatory_levels_flag=True)
    ggds.append(ggd)

Reading GDPs:   0%|          | 0/500 [00:00<?, ?it/s]

Reading GDPs: 100%|██████████| 500/500 [25:02<00:00,  3.01s/it]
  lambda x: (((x-x.mean())**2).sum()/(len(x)*(len(x)-1)))**0.5
  lambda x: (((x-x.mean())**2).sum()/(len(x)*(len(x)-1)))**0.5
Spatial Gridding: 100%|██████████| 500/500 [00:15<00:00, 32.37it/s]


In [11]:
# Merge all gridded data
# Collect the per-launch frames and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
frames = []
for ggd in tqdm(ggds, desc="Merging Gridded Data"):
    start_time_str = ggd.metadata[ggd.metadata['Attribute'] == 'g.Measurement.StartTime']['Value'].values[0]
    start_time = datetime.strptime(start_time_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    ggd_data = ggd.data.copy()
    # Tag every row of this launch with its measurement start time
    ggd_data['time'] = start_time
    frames.append(ggd_data)
# pd.concat raises on an empty list, so fall back to an empty frame
mggd = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Plotting the merged gridded data for one mandatory level
mand_lvl_val = 1000
for column in target_columns:
    # Looked up outside the f-string: reusing the enclosing quote character
    # inside a replacement field only parses on Python >= 3.12.
    # `gdp` is whatever object an earlier cell left bound to that name.
    column_units = gdp.variables_attrs[gdp.variables_attrs['variable'] == column]['units'].values[0]
    # Sort by time so the connected line and the band are drawn left to right
    # regardless of the order in which the launches were merged.
    mggd_at_lvl = mggd[mggd['mand_lvl'] == mand_lvl_val].sort_values('time')
    lower = mggd_at_lvl[column] - mggd_at_lvl[column + '_uc']
    upper = mggd_at_lvl[column] + mggd_at_lvl[column + '_uc']
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(mggd_at_lvl['time'], mggd_at_lvl[column], marker='o', linestyle='-', label=f'{column}')
    ax.fill_between(mggd_at_lvl['time'], lower, upper, alpha=0.2, label='Uncertainty')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'{column.capitalize()} {column_units}')
    ax.set_title(f'{column.capitalize()} at Mandatory Level {mand_lvl_val} Over Time')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Merging Gridded Data: 100%|██████████| 500/500 [00:01<00:00, 468.72it/s]


In [12]:
# Temporal gridding from scratch DAY
# Trailing "#3.xx" tags are equation numbers kept from the original notebook
# (presumably from the GRUAN uncertainty-processing reference -- confirm).

bin_size = 21 # Size of the time bin in days
first_date = mggd['time'].min()
mggd['day_diff'] = (mggd['time'] - first_date).dt.days # distance in days
# Each row is assigned to the centre of its bin_size-day bin
mggd['time_bin'] = (mggd['day_diff'] // bin_size) * bin_size + bin_size / 2
mggd['hour'] = mggd['time'].dt.hour
dmggd = mggd[(mggd['hour'] >= 6) & (mggd['hour'] < 18)] # day merged gridded data


def _rss_over_n(x):
    """Return the root-sum-of-squares of the group divided by its size."""
    return (((x**2).sum())**0.5)/len(x)


def _std_error_of_mean(x):
    """Return the standard error of the group mean (0 for a single value)."""
    # max(..., 1) avoids a division by zero for one-element groups
    return ((((x-x.mean())**2).sum())/(len(x)*max((len(x)-1),1)))**0.5


# One groupby object, one key order: the results below are attached to tggd
# by position, so every aggregation must enumerate the groups in the same
# order. Mixing ['mand_lvl', 'time_bin'] with ['time_bin', 'mand_lvl'] pairs
# each mean with the uncertainty of a different (level, bin) group.
group_keys = ['mand_lvl', 'time_bin']
grouped = dmggd.groupby(group_keys)

tggd = grouped[target_columns].mean().reset_index() # 3.12
tggd['time'] = pd.to_datetime(tggd['time_bin'], unit='D', origin=first_date)
for col in target_columns:
    tggd[col + '_uc_ucor_avg'] = grouped[col + '_uc_ucor'].apply(_rss_over_n).reset_index(drop=True) #3.13
    tggd[col + '_var'] = grouped[col].apply(_std_error_of_mean).reset_index(drop=True) #3.14
    tggd[col + '_uc_sc'] = grouped[col + '_uc_scor'].apply(_rss_over_n).reset_index(drop=True) #3.15
    tggd[col + '_uc_ucor'] = (
        tggd[col+'_uc_ucor_avg']**2 + tggd[col + '_var']**2 + tggd[col + '_uc_sc']**2)**0.5 #3.16
    tggd[col + '_cor'] = grouped[col + '_uc_tcor'].mean().reset_index(drop=True) #3.17
    tggd[col+'_uc'] = (
        tggd[col+'_uc_ucor']**2 + tggd[col+'_cor']**2)**0.5 #3.18

# Plotting the temporal gridded data
mand_lvl_val = 1000
for column in target_columns:
    # Looked up outside the f-string: reusing the enclosing quote character
    # inside a replacement field only parses on Python >= 3.12.
    # `gdp` is whatever object an earlier cell left bound to that name.
    column_units = gdp.variables_attrs[gdp.variables_attrs['variable'] == column]['units'].values[0]
    tggd_at_lvl = tggd[tggd['mand_lvl'] == mand_lvl_val]
    # Sort by time so the connected line and band are drawn left to right
    dmggd_at_lvl = dmggd[dmggd['mand_lvl'] == mand_lvl_val].sort_values('time')
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(dmggd_at_lvl['time'], dmggd_at_lvl[column], marker='x', linestyle='--', label=f'{column} (Original Day)')
    ax.fill_between(dmggd_at_lvl['time'], dmggd_at_lvl[column] - dmggd_at_lvl[column + '_uc'], dmggd_at_lvl[column] + dmggd_at_lvl[column + '_uc'], alpha=0.1, label='Original Day Uncertainty')
    ax.plot(tggd_at_lvl['time'], tggd_at_lvl[column], marker='o', linestyle='-', label=f'{column} (Temporal Gridded)')
    ax.fill_between(tggd_at_lvl['time'], tggd_at_lvl[column] - tggd_at_lvl[column + '_uc'], tggd_at_lvl[column] + tggd_at_lvl[column + '_uc'], alpha=0.2, label='Uncertainty')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'{column.capitalize()} {column_units}')
    ax.set_title(f'{column.capitalize()} at Mandatory Level {mand_lvl_val} Over Time (Temporal Gridded) at Day')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [13]:
# Temporal gridding from scratch NIGHT
# Trailing "#3.xx" tags are equation numbers kept from the original notebook
# (presumably from the GRUAN uncertainty-processing reference -- confirm).

bin_size = 21 # Size of the time bin in days
first_date = mggd['time'].min()
mggd['day_diff'] = (mggd['time'] - first_date).dt.days # distance in days
# Each row is assigned to the centre of its bin_size-day bin
mggd['time_bin'] = (mggd['day_diff'] // bin_size) * bin_size + bin_size / 2
mggd['hour'] = mggd['time'].dt.hour
# Night merged gridded data (hours before 06 or from 18 on); the variable
# name is kept so that anything reusing `dmggd` keeps working.
dmggd = mggd[(mggd['hour'] < 6) | (mggd['hour'] >= 18)]


def _rss_over_n(x):
    """Return the root-sum-of-squares of the group divided by its size."""
    return (((x**2).sum())**0.5)/len(x)


def _std_error_of_mean(x):
    """Return the standard error of the group mean (0 for a single value)."""
    # max(..., 1) avoids a division by zero for one-element groups
    return ((((x-x.mean())**2).sum())/(len(x)*max((len(x)-1),1)))**0.5


# One groupby object, one key order: the results below are attached to tggd
# by position, so every aggregation must enumerate the groups in the same
# order. Mixing ['mand_lvl', 'time_bin'] with ['time_bin', 'mand_lvl'] pairs
# each mean with the uncertainty of a different (level, bin) group.
group_keys = ['mand_lvl', 'time_bin']
grouped = dmggd.groupby(group_keys)

tggd = grouped[target_columns].mean().reset_index() # 3.12
tggd['time'] = pd.to_datetime(tggd['time_bin'], unit='D', origin=first_date)
for col in target_columns:
    tggd[col + '_uc_ucor_avg'] = grouped[col + '_uc_ucor'].apply(_rss_over_n).reset_index(drop=True) #3.13
    tggd[col + '_var'] = grouped[col].apply(_std_error_of_mean).reset_index(drop=True) #3.14
    tggd[col + '_uc_sc'] = grouped[col + '_uc_scor'].apply(_rss_over_n).reset_index(drop=True) #3.15
    tggd[col + '_uc_ucor'] = (
        tggd[col+'_uc_ucor_avg']**2 + tggd[col + '_var']**2 + tggd[col + '_uc_sc']**2)**0.5 #3.16
    tggd[col + '_cor'] = grouped[col + '_uc_tcor'].mean().reset_index(drop=True) #3.17
    tggd[col+'_uc'] = (
        tggd[col+'_uc_ucor']**2 + tggd[col+'_cor']**2)**0.5 #3.18

# Plotting the temporal gridded data
mand_lvl_val = 1000
for column in target_columns:
    tggd_at_lvl = tggd[tggd['mand_lvl'] == mand_lvl_val]
    # Sort by time so the connected line and band are drawn left to right
    dmggd_at_lvl = dmggd[dmggd['mand_lvl'] == mand_lvl_val].sort_values('time')
    fig, ax = plt.subplots(figsize=(10, 6))
    # Legend labels say "Night": this cell plots the night-time subset
    ax.plot(dmggd_at_lvl['time'], dmggd_at_lvl[column], marker='x', linestyle='--', label=f'{column} (Original Night)')
    ax.fill_between(dmggd_at_lvl['time'], dmggd_at_lvl[column] - dmggd_at_lvl[column + '_uc'], dmggd_at_lvl[column] + dmggd_at_lvl[column + '_uc'], alpha=0.1, label='Original Night Uncertainty')
    ax.plot(tggd_at_lvl['time'], tggd_at_lvl[column], marker='o', linestyle='-', label=f'{column} (Temporal Gridded)')
    ax.fill_between(tggd_at_lvl['time'], tggd_at_lvl[column] - tggd_at_lvl[column + '_uc'], tggd_at_lvl[column] + tggd_at_lvl[column + '_uc'], alpha=0.2, label='Uncertainty')
    ax.set_xlabel('Time')
    ax.set_ylabel(column.capitalize())
    ax.set_title(f'{column.capitalize()} at Mandatory Level {mand_lvl_val} Over Time (Temporal Gridded) at Night')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()