### Download Sprof and meta files from the Argo GDAC

Code modified from the GO-BGC Python tutorial: https://github.com/go-bgc/workshop-python/blob/main/GO_BGC_Workshop_Python_tutorial.ipynb


In [1]:
# Import packages
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import requests
import time
import os
import urllib3
import shutil

## Set directories

In [2]:
# Read a user-created text file ('path_file.txt') that points to local directories, so the
# paths do not have to be edited every time the code is updated.
# Each useful line has the form "<key>=<path>"; the key is matched by substring
# ("argo", "liar" or "matlab") and everything after the first "=" is taken as the path.
with open('path_file.txt') as f:
    lines = f.readlines()

count = len(lines)   # number of lines read from the path file
root = None          # stays None if no "argo" entry is present, so a clear error can be raised
for line in lines:
    key, separator, value = line.rstrip().partition("=")
    if not separator:
        # Blank or malformed line without "=": skip it rather than treating the whole
        # line as a path.
        continue
    if "argo" in key:
        root = value
    elif "liar" in key:
        liar_dir = value
    elif "matlab" in key:
        matlab_dir = value

if root is None:
    raise ValueError("path_file.txt has no '<key>=<path>' entry whose key contains 'argo'")

# Profiles are stored in the "argo" directory itself; 'root' is re-pointed to its parent.
# os.path.join(..., '') appends a trailing separator only if one is missing, so the
# string concatenations that build file paths from these directories stay valid.
profile_dir = os.path.join(root, '')
root = profile_dir + '../'

# Make profile_dir (and any missing parent directories) if it doesn't exist
os.makedirs(profile_dir, exist_ok=True)

## Define functions

### Function to download a single file


In [3]:
def download_file(url_path,filename,save_to=None,overwrite=False,verbose=True):
    """ Downloads and saves a file from a given URL using HTTP protocol.

    Note: If the server answers with '404 file not found' or any other non-200 status,
          the function returns without saving anything.
    Note: Errors are reported by printing (when verbose=True) rather than by raising,
          so a batch of downloads keeps going after a single failure.

    Arguments:
        url_path: root URL to download from including trailing slash ('/')
        filename: filename to download including suffix
        save_to: None (to download to the module-level 'root' directory)
                 or directory path
        overwrite: False to leave existing files in place
                   or True to overwrite existing files
        verbose: True to announce progress
                 or False to stay silent

    Returns:
        None

    """
    # NOTE(review): certificate verification is disabled for the request below (verify=False)
    # and the resulting warning is silenced here. Confirm this is still required for the
    # server in use; enabling verification is safer.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    if save_to is None:
        save_to = root

    max_attempts = 10   # bounded retries instead of unbounded recursion on connection errors
    target_path = os.path.join(save_to,filename)
    # Data is streamed to a temporary '.part' file and renamed on success, so an
    # interrupted download never leaves a truncated file that looks complete.
    partial_path = target_path + '.part'

    try:
        if os.path.exists(target_path):
            if not overwrite:
                if verbose: print('>>> File ' + filename + ' already exists. Leaving current version.')
                return
            else:
                if verbose: print('>>> File ' + filename + ' already exists. Overwriting with new version.')

        response = None
        for attempt in range(max_attempts):
            try:
                response = requests.get(url_path + filename,stream=True,auth=None,verify=False)
                break
            except requests.exceptions.ConnectionError as error_tag:
                print('Error connecting:',error_tag)
                time.sleep(1)

        if response is None:
            if verbose:
                print('>>> Could not connect to download ' + filename + ' after '
                      + str(max_attempts) + ' attempts.')
            return

        try:
            if response.status_code == 404:
                if verbose: print('>>> File ' + filename + ' returned 404 error during download.')
                return
            if response.status_code != 200:
                # Do not save an HTTP error page under the data file's name
                if verbose:
                    print('>>> File ' + filename + ' returned HTTP status '
                          + str(response.status_code) + ' during download.')
                return
            with open(partial_path,'wb') as out_file:
                # iter_content() applies any gzip/deflate transfer decoding, unlike response.raw
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
            os.replace(partial_path,target_path)
        finally:
            response.close()
            # Only present here if the download did not complete
            if os.path.exists(partial_path):
                os.remove(partial_path)

        if verbose:
            print('>>> Successfully downloaded ' + filename + '.')

    except Exception as error:
        # 'Exception' (not a bare except) so that Ctrl-C / KeyboardInterrupt still stops a batch
        if verbose:
            print('>>> An error occurred while trying to download ' + filename + ': ' + repr(error))

### Function to download and parse GDAC synthetic profile index file


In [4]:
def _normalize_lon_range(lon_range):
    """ Returns a new [west, east] pair on the -180 to 180 convention; the input is not modified. """
    if lon_range is None:
        return [-180.0,180.0]
    west, east = lon_range[0], lon_range[1]
    if east - west >= 360:
        # A full circle (e.g. [0, 360]) must select everything, not collapse to a single meridian
        return [-180.0,180.0]
    if west > 180: west -= 360
    if east > 180: east -= 360
    return [west,east]


def argo_gdac(lat_range=None,lon_range=None,start_date=None,end_date=None,sensors=None,floats=None,
              overwrite_index=False,overwrite_profiles=False,skip_download=False,
              download_individual_profs=False,download_meta=False,save_to=None,verbose=True):
    """ Downloads GDAC Sprof index file, then selects float profiles based on criteria.
      Either returns information on profiles and floats (if skip_download=True) or downloads them (if False).

      Arguments:
          lat_range: None, to select all latitudes
                     or [lower, upper] within -90 to 90 (selection is inclusive)
          lon_range: None, to select all longitudes
                     or [lower, upper] within either -180 to 180 or 0 to 360 (selection is inclusive)
                     NOTE: longitude range is allowed to cross -180/180 or 0/360
                     NOTE: the list passed in is not modified
          start_date: None or datetime object
          end_date:   None or datetime object
          sensors: None, to select profiles with any combination of sensors
                   or string or list of strings to specify required sensors
                   > note that common options include PRES, TEMP, PSAL, DOXY, CHLA, BBP700,
                                                      PH_IN_SITU_TOTAL, and NITRATE
                   > a profile is kept when each string occurs in its 'parameters' entry
          floats: None, to select any floats matching other criteria
                  or int or list/tuple/array of ints specifying floats' WMOID numbers
          overwrite_index: False to keep existing downloaded GDAC index file, or True to download new index
          overwrite_profiles: False to keep existing downloaded profile files, or True to download new files
          skip_download: True to skip download and only return the selection
                         or False to download those profiles
          download_individual_profs: False to download single Sprof file containing all profiles for each float
                                     or True to download individual profile files for each float
          download_meta: True to also download each float's '<WMOID>_meta.nc' file
                         (only applies when download_individual_profs=False)
          save_to: None to download to the module-level 'root' directory
                   or string to specify directory path for profile downloads
                   (include the trailing path separator)
          verbose: True to announce progress, or False to stay silent

      Returns:
          if skip_download=True:  (<array of WMOIDs>, <DataFrame of index file subset>)
          if skip_download=False: (<array of WMOIDs>, <DataFrame of index file subset>,
                                   <list of filenames requested for download>)

      NOTE: the index file itself is always stored in and read from 'root', regardless of save_to.

    """
    # Paths
    url_root = 'https://www.usgodae.org/ftp/outgoing/argo/'
    dac_url_root = url_root + 'dac/'
    index_filename = 'argo_synthetic-profile_index.txt'
    if save_to is None: save_to = root

    # Download GDAC synthetic profile index file (honoring the caller's verbosity setting)
    download_file(url_root,index_filename,overwrite=overwrite_index,verbose=verbose)

    # Load index file into Pandas DataFrame
    # Dates are read as text and converted explicitly; read_csv's 'date_parser' argument is
    # deprecated in recent pandas. Unparseable or missing dates become NaT and therefore
    # fail the date criteria below.
    gdac_index = pd.read_csv(root + index_filename,delimiter=',',header=8,
                             dtype={'date': str,'date_update': str})
    for date_column in ['date','date_update']:
        gdac_index[date_column] = pd.to_datetime(gdac_index[date_column],
                                                 format='%Y%m%d%H%M%S',errors='coerce')

    # Establish time and space criteria
    if lat_range is None:  lat_range = [-90.0,90.0]
    lon_range = _normalize_lon_range(lon_range)
    if start_date is None: start_date = datetime(1900,1,1)
    if end_date is None:   end_date = datetime(2200,1,1)

    # Split each index entry ('<dac>/<wmoid>/profiles/<name>.nc') into its useful parts
    float_wmoid_regexp = r'[a-z]*/[0-9]*/profiles/[A-Z]*([0-9]*)_[0-9]*[A-Z]*\.nc'
    gdac_index['wmoid'] = gdac_index['file'].str.extract(float_wmoid_regexp,expand=False).astype(int)
    filepath_main_regexp = r'([a-z]*/[0-9]*/)profiles/[A-Z]*[0-9]*_[0-9]*[A-Z]*\.nc'
    gdac_index['filepath_main'] = gdac_index['file'].str.extract(filepath_main_regexp,expand=False)
    filepath_regexp = r'([a-z]*/[0-9]*/profiles/)[A-Z]*[0-9]*_[0-9]*[A-Z]*\.nc'
    gdac_index['filepath'] = gdac_index['file'].str.extract(filepath_regexp,expand=False)
    filename_regexp = r'[a-z]*/[0-9]*/profiles/([A-Z]*[0-9]*_[0-9]*[A-Z]*\.nc)'
    gdac_index['filename'] = gdac_index['file'].str.extract(filename_regexp,expand=False)

    # Subset profiles based on time and space criteria
    gdac_index_subset = gdac_index.loc[np.logical_and.reduce([gdac_index['latitude'] >= lat_range[0],
                                                              gdac_index['latitude'] <= lat_range[1],
                                                              gdac_index['date'] >= start_date,
                                                              gdac_index['date'] <= end_date]),:]
    if lon_range[1] >= lon_range[0]:    # range does not cross -180/180 or 0/360
        gdac_index_subset = gdac_index_subset.loc[np.logical_and(gdac_index_subset['longitude'] >= lon_range[0],
                                                                 gdac_index_subset['longitude'] <= lon_range[1])]
    else:                               # range crosses -180/180 or 0/360
        gdac_index_subset = gdac_index_subset.loc[np.logical_or(gdac_index_subset['longitude'] >= lon_range[0],
                                                                gdac_index_subset['longitude'] <= lon_range[1])]

    # If requested, subset profiles using float WMOID criteria
    if floats is not None:
        # Accept a single WMOID as well as any list-like (list, tuple, NumPy array)
        floats = [floats] if np.isscalar(floats) else list(floats)
        gdac_index_subset = gdac_index_subset.loc[gdac_index_subset['wmoid'].isin(floats),:]

    # If requested, subset profiles using sensor criteria
    if sensors is not None:
        if isinstance(sensors,str): sensors = [sensors]
        for sensor in sensors:
            # na=False: entries with no parameter listing simply do not match
            has_sensor = gdac_index_subset['parameters'].str.contains(sensor,na=False)
            gdac_index_subset = gdac_index_subset.loc[has_sensor,:]

    # Examine subsetted profiles
    wmoids = gdac_index_subset['wmoid'].unique()

    # Just return list of floats and DataFrame with subset of index file, or download each profile
    if skip_download:
        return wmoids, gdac_index_subset

    downloaded_filenames = []
    if download_individual_profs:
        for filepath, filename in zip(gdac_index_subset['filepath'],gdac_index_subset['filename']):
            download_file(dac_url_root + filepath,filename,
                          save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)
            downloaded_filenames.append(filename)
    else:
        # Pair each float directory with its own WMOID row-by-row, instead of relying on two
        # independently de-duplicated arrays lining up by position
        float_dirs = gdac_index_subset[['filepath_main','wmoid']].drop_duplicates()
        for wmoid_filepath, wmoid in zip(float_dirs['filepath_main'],float_dirs['wmoid']):
            sprof_filename = str(wmoid) + '_Sprof.nc'
            download_file(dac_url_root + wmoid_filepath,sprof_filename,
                          save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)
            downloaded_filenames.append(sprof_filename)
            if download_meta:
                download_file(dac_url_root + wmoid_filepath,str(wmoid) + '_meta.nc',
                              save_to=save_to,overwrite=overwrite_profiles,verbose=verbose)
    return wmoids, gdac_index_subset, downloaded_filenames

## Run these three cells to download all floats with DOXY, PH, or NITRATE data

In [None]:
# Query the synthetic-profile index (argo_synthetic-profile_index.txt) once per BGC sensor
# and collect the WMOIDs of floats reporting DOXY, PH_IN_SITU_TOTAL and NITRATE.
# Only the first query forces a fresh copy of the index file; nothing else is downloaded here.
bgc_sensor_wmoids = {}
for sensor_position, sensor_name in enumerate(['DOXY','PH_IN_SITU_TOTAL','NITRATE']):
    bgc_sensor_wmoids[sensor_name], gdac_index = argo_gdac(sensors=sensor_name,
                                                           floats=None,
                                                           overwrite_index=(sensor_position == 0),
                                                           skip_download=True)

wmoids_doxy = bgc_sensor_wmoids['DOXY']
wmoids_ph = bgc_sensor_wmoids['PH_IN_SITU_TOTAL']
wmoids_nitrate = bgc_sensor_wmoids['NITRATE']

In [None]:
# Pool the per-sensor WMOID arrays, de-duplicate them (np.unique also sorts), discard any
# NaN entries and report how many distinct floats remain.
wmoids = np.concatenate([wmoids_doxy,wmoids_ph,wmoids_nitrate])
unique_wmoids = np.unique(wmoids)
is_valid_wmoid = np.logical_not(np.isnan(unique_wmoids))
wmoids_all = unique_wmoids[is_valid_wmoid].tolist()
print(len(wmoids_all))

In [None]:
# Second pass over the index, restricted to the combined WMOID list: this time the Sprof
# file and the meta file of every selected float are downloaded into profile_dir.
wmoids_bgc, gdac_index, downloaded_filenames = argo_gdac(
    floats=wmoids_all,
    save_to=profile_dir,
    download_meta=True,
    skip_download=False,
)