(C) Crown Copyright, Met Office. All rights reserved.

## esgf_vars_downloaded.ipynb

In [1]:
import datetime
# Record when the notebook was last executed. A timezone-aware "now" is used
# because datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
# value; the printed timestamp now carries an explicit +00:00 offset.
print(f'Last run {datetime.datetime.now(datetime.timezone.utc)}')

Last run 2020-06-30 11:51:38.314441


In [2]:
from collections import OrderedDict

In [3]:
# The path to the Tomcat log file. It is opened as given, so a relative path
# is resolved against the current working directory.
LOGFILE = 'download-logs'

In [4]:
class VariableRequests:
    """
    A tally of variable requests that records how many times each one has
    been requested.
    """
    def __init__(self):
        """Start with no variable requests recorded."""
        self._vreqs = {}

    def increment_vreq(self, vreq):
        """
        Add one to the retrieval count for `vreq`. A request that hasn't
        been seen before is added with a count of one.

        :param str vreq: the variable request code
        """
        self._vreqs[vreq] = self._vreqs.get(vreq, 0) + 1

    def get_vreqs(self, order_by_count=False):
        """
        Format the variable requests and their counts, one request per line.
        By default the lines are ordered by frequency, then by table name
        and finally by variable name. Alternatively they can be ordered by
        decreasing count.

        :param bool order_by_count: if True then order the lines by
            decreasing count.
        :returns: the variable requests and their counts, one per line.
        :rtype: str
        """
        if order_by_count:
            def sort_key(item):
                # item is a (variable request code, count) pair
                return item[1]
        else:
            def sort_key(item):
                # _guess_frequency is a module level function that is
                # defined after this class; it must exist by the time
                # this method is called.
                code = item[0]
                parts = code.split('_')
                # sort order is frequency then table name then variable
                return (_guess_frequency(code), parts[1], parts[0])

        ranked = sorted(self._vreqs.items(), key=sort_key,
                        reverse=order_by_count)
        return '\n'.join(f'{code:<25} {count:3}' for code, count in ranked)


In [5]:
def _guess_frequency(table_name):
    """
    Return an integer corresponding to the frequency of variables in the table.
    Higher frequency data (starting at 1hr) has a lower priority.

    :param str table_name: a string containing the table name.
    :returns: an integer corresponding to the frequency of variables.
    :rtype: int
    :raises ValueError: if a valid frequency isn't found in the table name.
    """
    frequencies = {
        '1hr': 1,
        '3hr': 2,
        '6hr': 3,
        'day': 4,
        'mon': 5,
        'fx': 6
    }
    for freq in frequencies:
        if freq in table_name:
            return frequencies[freq]

    raise ValueError(f'No frequency found for table name {table_name}')


In [6]:
# Maps each data request code to the set of IP hashes that have retrieved it.
# A set is used so that the "has this IP been seen before" check stays cheap
# however many different IPs fetch the same data request.
dreqs = {}
vreqs = VariableRequests()
num_bad_request = 0
num_other_requests = 0
num_malformed_lines = 0
num_lines_done = 0

with open(LOGFILE) as fh:
    for line in fh:
        num_lines_done += 1
        cmpts = line.split()
        if len(cmpts) < 7:
            # blank or truncated line with no URL field, so count it and
            # move to next rather than failing with an IndexError
            num_malformed_lines += 1
            continue

        ip_hash = cmpts[0]
        url = cmpts[6]
        status = cmpts[-2]

        if status != '200':
            # bad request so ignore and move to next
            num_bad_request += 1
            continue

        if not url.startswith('/thredds/fileServer/esg_cmip6'):
            # not a file retrieval so ignore and move to next
            num_other_requests += 1
            continue

        url_parts = url.split('/')
        if len(url_parts) < 12:
            # the path is too short to contain the table and variable name
            # components that are read below
            num_malformed_lines += 1
            continue

        # The data request code is the six path components 6 to 11, the
        # last two of which are table_name/cmor_name.
        # NOTE(review): presumably the full form is
        # institute_id/source_id/experiment_id/variant_label/table_name/cmor_name
        # - confirm against the directory structure being served.
        dreq = '/'.join(url_parts[6:12])
        # The variable request code is in the form:
        # cmor_name_table_name
        vreq = f'{url_parts[11]}_{url_parts[10]}'

        # Count a variable request once for each distinct IP address that
        # retrieves a given data request.
        requesters = dreqs.setdefault(dreq, set())
        if ip_hash not in requesters:
            requesters.add(ip_hash)
            vreqs.increment_vreq(vreq)

print(f'{num_lines_done} lines processed')

2216976 lines processed


Look at the variables downloaded in frequency and table order

In [7]:
# Default ordering: frequency rank, then table name, then variable name.
# print() is used so that the newline separated string is shown one request
# per line.
print(vreqs.get_vreqs())

pr_E1hr                    48
prc_E1hr                   38
clt_3hr                     7
hfls_3hr                    4
hfss_3hr                    6
huss_3hr                   38
mrsos_3hr                   1
pr_3hr                    107
prc_3hr                     7
prsn_3hr                    5
ps_3hr                      3
rlds_3hr                   13
rldscs_3hr                  1
rlus_3hr                   14
rsds_3hr                   13
rsdscs_3hr                 12
rsus_3hr                   12
rsuscs_3hr                 14
tas_3hr                    77
tos_3hr                     5
tslsi_3hr                   4
uas_3hr                    63
vas_3hr                    60
psl_CF3hr                   3
prcsh_E3hr                  2
prw_E3hr                   14
psl_E3hr                   36
rlut_E3hr                   1
rlutcs_E3hr                 1
rsut_E3hr                   1
rsutcs_E3hr                 9
hus_E3hrPt                  7
ta_E3hrPt                   7
ua_E3hrPt 

Look at the variables downloaded in popularity order

In [8]:
# order_by_count=True lists the most requested variables first.
print(vreqs.get_vreqs(order_by_count=True))

pr_Amon                   716
tas_Amon                  627
pr_day                    545
psl_Amon                  342
ua_Amon                   317
ta_Amon                   303
ts_Amon                   294
uas_Amon                  291
va_Amon                   289
vas_Amon                  281
zg_Amon                   259
tas_day                   235
hus_Amon                  223
tasmax_day                218
ps_Amon                   214
tasmin_day                203
ta_day                    168
ua_day                    163
hus_day                   158
psl_day                   155
uas_day                   152
va_day                    145
huss_Amon                 144
vas_day                   139
wap_Amon                  136
evspsbl_Amon              133
mrro_Lmon                 130
prw_Amon                  121
rsds_Amon                 119
rlds_Amon                 119
tauu_Amon                 111
hfls_Amon                 109
hfss_Amon                 109
tauv_Amon 