(C) Crown Copyright, Met Office. All rights reserved.

Identify the Stream 1 and 2 data in the DMT and calculate the proportion of that data that has been retrieved, in terms of both the number of datasets and the storage space.

In [1]:
import django
django.setup()

In [2]:
from django.db.models import Sum
from django.template.defaultfilters import filesizeformat
from pdata_app.models import DataFile, DataRequest, ReplacedFile, RetrievalRequest
from pdata_app.utils.common import filter_hadgem_stream2

In [3]:
def volume_from_dreqs(dreqs):
    """
    From a queryset of pdata_app.DataRequest objects return the volume in bytes
    that their files require to store.

    The size of each data request's files is summed in the database, one
    aggregate query per data request, and the per-request totals are added
    together. A data request that has no files contributes zero.

    :param django.db.models.query.QuerySet dreqs: the data requests
    :returns: the volume in bytes of these data requests
    :rtype:  int
    """
    data_vol = 0
    for dreq in dreqs:
        # Sum() aggregates to None rather than 0 when there are no rows to
        # sum, so fall back to 0 to avoid adding None to the running total
        data_vol += dreq.datafile_set.aggregate(Sum('size'))['size__sum'] or 0

    return data_vol

### Find correct Stream 1 and 2 (ignore additional HadGEM)

In [4]:
# Experiments of interest, grouped into the AMIP and the coupled experiments
amip_expts = ['highresSST-present', 'highresSST-future']
coupled_expts = ['spinup-1950', 'hist-1950', 'control-1950',
                 'highres-future']

expts = amip_expts + coupled_expts

# MOHC stream 2 is members r1i2p1f1 to r1i15p1f1
mohc_stream2_members = [f'r1i{init_index}p1f1'
                        for init_index in range(2, 16)]

# MOHC and NERC data requests for the stream 2 members in the experiments
# above that have at least one file, passed through filter_hadgem_stream2()
# (imported from pdata_app.utils.common; presumably it keeps only the genuine
# Stream 2 data requests - TODO confirm)
hadgem_s2 = filter_hadgem_stream2(DataRequest.objects.filter(
    institute__short_name__in=['MOHC', 'NERC'],
    experiment__short_name__in=expts,
    rip_code__in=mohc_stream2_members,
    datafile__isnull=False
))

# Every other data request in these experiments that has at least one file.
# Each exclude() call only removes the rows that match all of the conditions
# given to that call. The datafile join can return a data request once per
# file and so distinct() is used to remove the duplicates.
others = DataRequest.objects.filter(
    experiment__short_name__in=expts,
    datafile__isnull=False
).exclude(
    # Exclude MOHC Stream 2
    institute__short_name__in=['MOHC', 'NERC'],
    rip_code__in=mohc_stream2_members
).exclude(
    # Exclude EC-Earth coupled r1i1p1f1
    institute__short_name='EC-Earth-Consortium',
    experiment__short_name__in=coupled_expts,
    rip_code='r1i1p1f1'
).distinct()

# Combine the two querysets (OR) to give the full Stream 1 and 2 selection
stream_1_2 = hadgem_s2 | others

### Calculate Volumes

In [5]:
# Number of Stream 1 and 2 data requests and the total size in bytes of
# their files
num_stream_1_2 = stream_1_2.count()
volume_stream_1_2 = volume_from_dreqs(stream_1_2)

In [6]:
# filesizeformat() displays the number of bytes in human readable units
print(f'{num_stream_1_2} data requests were uploaded consisting of '
      f'{filesizeformat(volume_stream_1_2)}')

30955 data requests were uploaded consisting of 1.5 PB


In [7]:
# Ignore data restored by Jon as that was for publication rather than science
# This gives the distinct data request ids from all other retrieval requests
retrieval_data_reqs = (RetrievalRequest.objects.exclude(requester__username='jseddon').
                       values_list('data_request', flat=True).distinct())

# The retrieved data requests in the experiments of interest that have at
# least one file
# NOTE(review): this filters on experiment only and does not apply the member
# and institute rules that were used to build stream_1_2, so it may include
# data requests that are not in stream_1_2 - confirm that this is intended
stream_1_2_restored = DataRequest.objects.filter(id__in=retrieval_data_reqs).filter(
    experiment__short_name__in=expts,
    datafile__isnull=False
).distinct()

In [8]:
# Number of restored data requests and the total size in bytes of their files
num_stream_1_2_restored = stream_1_2_restored.count()
volume_stream_1_2_restored = volume_from_dreqs(stream_1_2_restored)

In [9]:
# Report the restored data as a percentage of the uploaded Stream 1 and 2
# data, first by the number of data requests and then by volume
print(f'{num_stream_1_2_restored} unique data requests were restored, '
      f'{num_stream_1_2_restored / num_stream_1_2:.1%} of the total uploaded')
print(f'{filesizeformat(volume_stream_1_2_restored)} of unique data requests '
      f'were restored, '
      f'{volume_stream_1_2_restored / volume_stream_1_2:.1%} of the volume uploaded')

5941 unique data requests were restored, 19.2% of the total uploaded
417.6 TB of unique data requests were restored, 26.5% of the volume uploaded
