In [1]:
from copernicusmarine import get
import yaml
import subprocess
import os
import logging

## Define get_and_check function
This function tries downloading with the `copernicusmarine` toolbox if a dataset name has been specified.

Otherwise it tries looking at files in the `data/` folder.

In [29]:
def get_and_check(entry, output_directory, output_file, copernicusmarine=None, cfchecks=None):
    """Optionally download one file with copernicusmarine, then run cfchecks on it.

    Parameters
    ----------
    entry : dict
        Must contain 'filename'. When it also has a non-empty 'dataset', the
        file is first downloaded with copernicusmarine ``get`` (``dataset_id``
        = entry['dataset'], ``filter`` = entry['filename']). Optional keys
        'copernicusmarine' (dict of extra ``get`` keyword arguments) and
        'cfchecks' (cfchecks option(s)) take precedence over the defaults
        given as function arguments.
        NOTE: entry['filename'] is rewritten in place to
        ``os.path.join(output_directory, filename)``.
    output_directory : str
        Directory where the file is downloaded to / looked up in.
    output_file : file object
        Passed as ``stdout`` to the cfchecks subprocess (stderr is merged in).
    copernicusmarine : dict, optional
        Default extra keyword arguments for ``get``, used when the entry does
        not define its own.
    cfchecks : str or list of str, optional
        Default cfchecks option(s), used when the entry does not define its
        own. A string is passed as a single command-line argument; a
        list/tuple contributes one argument per element.

    Raises
    ------
    RuntimeError
        If the download fails or the cfchecks subprocess cannot be run.
    """
    # Configure the logging system (has no effect once the root logger
    # already has handlers, so repeated calls are harmless).
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)s - %(asctime)s - %(message)s')

    # Download only when a non-empty dataset ID is given.
    if entry.get('dataset'):
        try:
            args = {'dataset_id': entry['dataset'],
                    'output_directory': output_directory,
                    'filter': entry['filename']}
            # Per-entry options take precedence over the global defaults.
            extra_args = entry.get('copernicusmarine') or copernicusmarine
            if extra_args:
                args = args | extra_args
            logging.info(f"Running copernicusmarine with options: {args}")
            get(**args)
        except Exception as e:
            # Re-raise with context, keeping the original exception chained.
            raise RuntimeError(f"Failed to download or process the dataset: {entry['dataset']} - {entry['filename']}\n {e}") from e

    # Update the filename with the full path
    entry['filename'] = os.path.join(output_directory, entry['filename'])

    # Build the cfchecks command; per-entry options win over the global ones.
    command = ['cfchecks']
    cf_options = entry.get('cfchecks') or cfchecks
    if cf_options:
        if isinstance(cf_options, (list, tuple)):
            command += [str(option) for option in cf_options]
        else:
            command.append(cf_options)
    command.append(entry['filename'])

    try:
        # Query the installed cfchecker version (empty string if conda does
        # not list the package).
        result = subprocess.run('conda list cfchecker | awk \'/^cfchecker/{print $2}\'',
                                shell=True, capture_output=True, text=True)
        # Run cfchecker; its return code is intentionally not checked.
        logging.info(f"Running cfchecks (version:{result.stdout.strip()}) with command: {command}")
        subprocess.run(command, stdout=output_file, stderr=subprocess.STDOUT, text=True)

    except Exception as e:
        # Raise an exception with a custom message if cfchecks could not be run
        raise RuntimeError(f"Failed to check file: {entry['filename']}\n{e}") from e

## Setup input parameters:
`output_directory` is the name of the directory where data can be found (or downloaded)

`output_file` is the name of the output file containing logs

`input_yaml` is the name of the input YAML file specifying paths to files (if already downloaded) or dataset ID + filename (if download from copernicusmarine is required)

In [3]:
# Directory where data files are found or downloaded to
output_directory = "data/"
# Name of the output file containing logs
output_file = "input.log"
# Input YAML listing the files / datasets to process
input_yaml = "input.yaml"

## Iterate over each entry in the input YAML and run checks

In [30]:
# Open the input YAML for reading and the configured log file for writing.
# Mode "w" overwrites the log on every run, so Restart & Run All is reproducible.
# The handle gets its own name so the `output_file` setting is not clobbered.
with open(input_yaml, 'r') as file, open(output_file, 'w') as log_handle:
    data = yaml.safe_load(file)
    # 'options' may be missing or empty in the YAML; fall back to no extra
    # keyword arguments instead of failing on `**None`.
    options = data.get('options') or {}
    # Iterate over each entry in the data
    for entry in data['data']:
        try:
            get_and_check(entry, output_directory, log_handle, **options)
        except RuntimeError as e:
            # Report the failure and continue with the remaining entries.
            print(e)

INFO - 2025-03-22 10:05:54,066 - Version of cfchecks : 4.1.0
INFO - 2025-03-22 10:05:54,067 - Running cfchecks with command: ['cfchecks', '-v 1.6', 'data/WAVERYSv1_climatology_19930101_20211231.nc']
