# GRUANpy Proof of Concept

GRUANpy is a Python toolkit that facilitates the analysis of GRUAN data. It is easily extensible: its structure is based on data models and helper classes that provide specialized methods for different purposes. The individual functions can also be chained to build complete data pipelines that produce a wide variety of outputs.

Here is a list of the main features identified so far:
- download, the ability to download data of interest
- merge, the ability to merge data from different products
- aggregation, the ability to aggregate different observations and lower the resolution of the data (respecting the GRUAN principles in uncertainty processing)

In [4]:
# Demo setup
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from gruanpy import GRUANpy
from datetime import datetime
# Local folder shared by the demo: it is handed to GRUANpy here and reused by
# later cells to build the paths of downloaded files and API responses.
download_folder=r"gdp_demo_090625"
# Single toolkit instance used by every cell below (search, download, read,
# exec_request, spatial_gridding, temporal_gridding).
gp = GRUANpy(download_folder=download_folder)
# IPython magic selecting matplotlib's Qt backend (figures open in separate windows).
# NOTE(review): presumably requires a Qt binding and a display; confirm for headless runs.
%matplotlib qt


## DOWNLOAD

In [None]:
# Download GRUAN Data Product (GDP) files through the NOAA FTP server.
# Steps: list the remote directory, download a small sample, then read each
# downloaded file back and show the first global attributes.
ftp_dir_path = r'pub/data/gruan/processing/level2/RS92-GDP/version-002/LIN/2018'
files = gp.search(ftp_dir_path=ftp_dir_path)
print(f"Found {len(files)} files in {ftp_dir_path}")
for file in files[:5]:
    print(file)
print("-----"*10)

print("Downloading first 2 files for demo purpose")
files = files[:2]
for file in files:
    gp.download(ftp_dir_path=ftp_dir_path, file_name=file)
print("Download completed.")
print("-----"*10)

for file in files:
    # os.path.join builds the local path with the platform's separator,
    # replacing the previous string-slicing trick used to append a "/".
    gdp = gp.read(os.path.join(download_folder, file))
    print(gdp.global_attrs.head())
    print("-----"*10)

# Iterative script at code_examples\download_gdp.py

Found 75 files in pub/data/gruan/processing/level2/RS92-GDP/version-002/LIN/2018
LIN-RS-01_2_RS92-GDP_002_20180611T180000_1-002-001.nc
LIN-RS-01_2_RS92-GDP_002_20180103T000000_1-002-002.nc
LIN-RS-01_2_RS92-GDP_002_20180612T002400_1-000-002.nc
LIN-RS-01_2_RS92-GDP_002_20180122T120000_1-002-001.nc
LIN-RS-01_2_RS92-GDP_002_20180613T180000_1-002-002.nc
--------------------------------------------------
Downloading first 2 files for demo purpose
Download completed.
--------------------------------------------------
     Attribute                                              Value
0  Conventions                                             CF-1.4
1        title                RS92 GRUAN Data Product (Version 2)
2  institution  MOL - Lindenberg Meteorological Observatory; D...
3       source                                           RS92-SGP
4      history  2018-06-11 21:30:51.000Z RS92-GDP: RS92 GRUAN ...
--------------------------------------------------
     Attribute                       

In [None]:
# Download GRUAN DATA through CDS API in csv format
# The request is assembled as the source text of a small cdsapi script and
# handed to gp.exec_request.

api_response_file = "api_response.csv"
# Build the output path with the platform separator. It is embedded below with
# !r so the generated script receives a correctly escaped string literal
# (the previous hard-coded "\\" separator only produced a valid path on Windows).
target_path = os.path.join(download_folder, api_response_file)
api_request = """
import cdsapi

dataset = "insitu-observations-gruan-reference-network"
request = {
    "variable": [
        "air_temperature",
        "relative_humidity",
        "relative_humidity_effective_vertical_resolution",
        "wind_speed",
        "wind_from_direction",
        "eastward_wind_speed",
        "northward_wind_speed",
        "shortwave_radiation",
        "air_pressure",
        "altitude",
        "geopotential_height",
        "frost_point_temperature",
        "water_vapour_volume_mixing_ratio",
        "vertical_speed_of_radiosonde",
        "time_since_launch"
    ],
    "year": "2014",
    "month": "10",
    "day": ["14"],
    "data_format": "csv",
    "area": [90, 0, 0, 90]
}

client = cdsapi.Client()
"""+f"""
target = {target_path!r} # Change this to your desired output path
client.retrieve(dataset, request, target)

"""
print("Executing API request ...")
gp.exec_request(api_request)
print("-----"*10)


Executing API request ...


2025-05-28 10:53:45,454 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-05-28 10:53:46,171 INFO Request ID is 1cee6683-13e9-4835-83c8-6c0deddbaf89
2025-05-28 10:53:46,275 INFO status has been updated to accepted
2025-05-28 10:55:02,255 INFO status has been updated to successful
                                                                                          

--------------------------------------------------




In [6]:
# Download GRUAN DATA through CDS API in netCDF format
# The request is assembled as the source text of a small cdsapi script and
# handed to gp.exec_request.

api_response_file = "api_response.nc"
# Build the output path with the platform separator. It is embedded below with
# !r so the generated script receives a correctly escaped string literal
# (the previous hard-coded "\\" separator only produced a valid path on Windows).
target_path = os.path.join(download_folder, api_response_file)
api_request = """
import cdsapi

dataset = "insitu-observations-gruan-reference-network"
request = {
    "variable": [
        "air_temperature",
        "relative_humidity",
        "relative_humidity_effective_vertical_resolution",
        "wind_speed",
        "wind_from_direction",
        "eastward_wind_speed",
        "northward_wind_speed",
        "shortwave_radiation",
        "air_pressure",
        "altitude",
        "geopotential_height",
        "frost_point_temperature",
        "water_vapour_volume_mixing_ratio",
        "vertical_speed_of_radiosonde",
        "time_since_launch"
    ],
    "year": "2014",
    "month": "10",
    "day": ["14"],
    "data_format": "netcdf",
    "area": [90, 0, 0, 90]
}

client = cdsapi.Client()
"""+f"""
target = {target_path!r} # Change this to your desired output path
client.retrieve(dataset, request, target)

"""
print("Executing API request ...")
gp.exec_request(api_request)
print("-----"*10)

Executing API request ...


2025-05-28 11:16:44,989 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2025-05-28 11:16:45,706 INFO Request ID is f5bfd683-46f5-40fb-bca0-50816aa87b21
2025-05-28 11:16:45,809 INFO status has been updated to accepted
2025-05-28 11:17:35,883 INFO status has been updated to successful
                                                                                         

--------------------------------------------------




## MERGE and AGGREGATE

In [21]:
# GDP RS41 spatial gridding
# Reads one RS41 GDP file, grids it along `bin_column`, and plots the original
# and gridded profiles with their uncertainty bands for each target column.
file_path = os.path.join(download_folder, 'LIN-RS-01_2_RS41-GDP_001_20141209T120000_1-009-002.nc')
gdp = gp.read(file_path)

bin_column = 'press' # Choose the binning column (alt or press)
target_columns = ['temp', 'rh']

ggd = gp.spatial_gridding(gdp, bin_column, target_columns)


def _variable_attr(attrs_table, variable, attr):
    """Return the first `attr` value of the row whose 'variable' equals `variable`."""
    return attrs_table[attrs_table['variable'] == variable][attr].values[0]


# Plot original and gridded data
for column in target_columns:
    # Look the labels up first: reusing the same quote type inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    column_units = _variable_attr(gdp.variables_attrs, column, 'units')
    bin_units = _variable_attr(gdp.variables_attrs, bin_column, 'units')
    long_name = _variable_attr(gdp.variables_attrs, column, 'long_name')

    fig, ax1 = plt.subplots(figsize=(7, 6))
    if bin_column == 'press':
        # Logarithmic, inverted pressure axis.
        ax1.set_yscale('log')
        ax1.invert_yaxis()

    ax1.scatter(gdp.data[column], gdp.data[bin_column], label='Original Data', alpha=0.5)
    ax1.scatter(ggd.data[column], ggd.data[bin_column], label='Gridded Data', color='red', alpha=0.5)
    # Shaded bands span value +/- the '<column>_uc' uncertainty column.
    ax1.fill_betweenx(
        gdp.data[bin_column],
        gdp.data[column] - gdp.data[column+'_uc'],
        gdp.data[column] + gdp.data[column+'_uc'],
        color='blue', alpha=0.2, label='Original Uncertainty',
    )
    ax1.fill_betweenx(
        ggd.data[bin_column],
        ggd.data[column] - ggd.data[column+'_uc'],
        ggd.data[column] + ggd.data[column+'_uc'],
        color='red', alpha=0.2, label='Gridded Uncertainty',
    )
    ax1.set_xlabel(f'{column.capitalize()} {column_units}')
    ax1.set_ylabel(f'{bin_column.capitalize()} {bin_units}')
    ax1.legend()
    ax1.set_title(f'{long_name} Mandatory Levels Spatial Gridding')

    plt.tight_layout()
    plt.show()

In [8]:
# GDP RS41 temporal gridding

# Read multiple GDP files
gdp_folder = os.path.join(download_folder, 'products_RS41-GDP-1_LIN_2017') # Path to the folder
# os.listdir returns entries in arbitrary order; sorting makes the subset
# selected below reproducible across runs and machines.
gdp_files = sorted(
    os.path.join(gdp_folder, f) for f in os.listdir(gdp_folder) if f.endswith('.nc')
)
max_gdp_files = 50 # Upper bound on the number of files read for the demo
gdps = [gp.read(path) for path in tqdm(gdp_files[:max_gdp_files], desc="Reading GDPs")]

# Uniform spatial gridding across multiple GDPs
target_columns = ['temp', 'rh']
ggds = [
    gp.spatial_gridding(profile, 'press', target_columns, mandatory_levels_flag=True)
    for profile in tqdm(gdps, desc="Spatial Gridding")
]

# Temporal gridding
tggd = gp.temporal_gridding(ggds, target_columns, bin_size=7, lvl_column='press')

# Plotting the temporal gridded data

Reading GDPs: 100%|██████████| 50/50 [01:10<00:00,  1.42s/it]
Spatial Gridding: 100%|██████████| 50/50 [00:01<00:00, 37.03it/s]


In [None]:
# NOT WORKING YET
# need to fix all the gridding methods and add a just merge method

# Tag every spatially gridded profile with its launch time once, instead of
# re-parsing the timestamp for every level.
timed_profiles = []
for ggd in ggds:
    start_time_str = ggd.metadata[ggd.metadata['Attribute'] == 'g.Measurement.StartTime']['Value'].values[0]
    start_time = datetime.strptime(start_time_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    # assign() returns a new frame, so ggd.data itself is left untouched.
    timed_profiles.append(ggd.data.assign(time=start_time))

# Demo: only the first mandatory level is plotted.
for lvl in tggd.data['mand_lvl'].unique()[:1]:
    # Collect the per-profile rows for this level and concatenate once
    # (growing a DataFrame with concat inside a loop is quadratic).
    level_frames = [frame[frame['mand_lvl'] == lvl] for frame in timed_profiles]
    mggd = pd.concat(level_frames, ignore_index=True) if level_frames else pd.DataFrame()
    if mggd.empty:
        print(f"No gridded profiles available at mandatory level {lvl}")
        continue
    # A line plot and fill_between need chronologically ordered points.
    mggd = mggd.sort_values('time', ignore_index=True)

    tggd_lvl = tggd.data[tggd.data['mand_lvl'] == lvl]

    for column in target_columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(mggd['time'], mggd[column], marker='o', linestyle='-', label=column)
        ax.plot(tggd_lvl['time'], tggd_lvl[column], marker='x', linestyle='--', color='red', label=f'Temporal Gridded {column}')
        # Shaded band spans value +/- the '<column>_uc' uncertainty column.
        ax.fill_between(mggd['time'], mggd[column] - mggd[column+'_uc'], mggd[column] + mggd[column+'_uc'], alpha=0.2)
        ax.set_xlabel('Time')
        # TODO: restore the y-label once units are available on the gridded product:
        #ax.set_ylabel(f'{column.capitalize()} {ggd.variables_attrs[ggd.variables_attrs["variable"] == column]["units"].values[0]}')
        ax.set_title(f'Temporal Gridded {column} at Mandatory Level {lvl}')
        ax.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [7]:
# Sanity check: number of spatially gridded profiles currently held in `ggds`.
# NOTE(review): the value depends on which cells were run last; re-run the notebook top to bottom before relying on it.
len(ggds)

2

In [4]:
# Preview the first rows of the temporally gridded product returned by gp.temporal_gridding.
tggd.data.head()

Unnamed: 0,time_bin,press,temp,rh,mand_lvl,time,temp_uc_ucor_avg,temp_var,temp_uc_sc,temp_uc_ucor,temp_cor,temp_uc,rh_uc_ucor_avg,rh_var,rh_uc_sc,rh_uc_ucor,rh_cor,rh_uc
0,3.5,5.972275,216.646744,1.314128,5,2017-01-06 10:45:18.235,0.137911,0.0,0.0,0.137911,0.079224,0.159047,0.052588,0.0,0.0,0.052588,2.098747,2.099405
1,3.5,7.040342,212.783279,1.692838,7,2017-01-06 10:45:18.235,0.072011,0.0,0.0,0.072011,0.080095,0.107707,0.016289,0.0,0.0,0.016289,2.191129,2.19119
2,3.5,7.616255,219.176804,1.553187,7,2017-01-06 10:45:18.235,0.131919,0.0,0.12191,0.179623,0.189554,0.261143,0.017718,0.0,0.0,0.017718,1.175712,1.175845
3,3.5,11.141134,201.235672,3.311255,10,2017-01-06 10:45:18.235,0.125646,0.0,0.0,0.125646,0.083302,0.150752,0.038273,0.0,0.0,0.038273,2.621775,2.622054
4,3.5,11.370716,211.254333,2.151662,10,2017-01-06 10:45:18.235,0.080735,0.0,0.097381,0.126496,0.131397,0.182391,0.023049,0.0,0.0,0.023049,1.485718,1.485896


## PIPELINE

## DATA VISUALIZATION