# SOMMAR Data Volume Estimates from PRIMAVERA

In [1]:
import django
django.setup()
from django.db.models import Sum
from pdata_app.models import DataRequest, DataFile
from pdata_app.utils.common import filter_hadgem_stream2, get_request_size

In [2]:
def filesizeformat(num_bytes, human_units=True):
    """
    Format a data volume for display.

    :param num_bytes: the data volume in bytes.
    :param bool human_units: if True, return the volume as a string with
        units (e.g. ``'18.7 TB'``) as produced by Django's ``filesizeformat``
        template filter, with the non-breaking space in that string replaced
        by an ordinary space. If False, return ``num_bytes`` unchanged.
    :returns: the formatted string, or ``num_bytes`` itself.
    """
    if not human_units:
        return num_bytes

    # Import the filter explicitly: ``import django`` on its own does not
    # guarantee that the ``django.template.defaultfilters`` submodule has
    # been loaded. Aliased because this function shadows the filter's name.
    from django.template.defaultfilters import (
        filesizeformat as django_filesizeformat
    )
    return django_filesizeformat(num_bytes).replace('\xa0', ' ')

### Total data volume for a Stream 2 MOHC experiment

The PRIMAVERA Stream 2 data request was cut down from the Stream 1 request so that only the high-frequency variables required for the user-facing work packages were output. The output of daily variables on atmosphere levels was also reduced. The Stream 2 data request is summarised at https://doi.org/10.5281/zenodo.3607328.

In [3]:
# MIP tables that are counted as ocean output when the requests are split
# into atmosphere and ocean further down.
ocean_table_names = [
    'Oday', 'PrimOday', 'PrimSIday', 'SIday',
    'Omon', 'PrimOmon', 'SImon',
]

# Data requests for the HadGEM3-GC31-HH hist-1950 r1i1p1f1 simulation that
# have at least one data file attached.
hh_hist_reqs_with_files = DataRequest.objects.filter(
    climate_model__short_name='HadGEM3-GC31-HH',
    experiment__short_name='hist-1950',
    rip_code='r1i1p1f1',
    datafile__isnull=False,
)
# Presumably narrows the queryset to the Stream 2 data request; see
# pdata_app.utils.common.filter_hadgem_stream2 to confirm.
hist_reqs = filter_hadgem_stream2(hh_hist_reqs_with_files)

The following variables were output (along with their dimensions):

In [4]:
# List every requested variable as "<table>_<cmor name>" followed by its
# dimensions, sorted by frequency, then table, then variable name.
ordered_reqs = hist_reqs.order_by(
    'variable_request__frequency',
    'variable_request__table_name',
    'variable_request__cmor_name',
)
for data_req in ordered_reqs:
    var_req = data_req.variable_request
    label = f'{var_req.table_name}_{var_req.cmor_name}'
    # Left-justify the name in a 30 character column so the dimensions align.
    print(f'{label:<30} ({var_req.dimensions})')

3hr_pr                         (longitude latitude time)
3hr_rsds                       (longitude latitude time)
3hr_rsdsdiff                   (longitude latitude time)
3hr_tas                        (longitude latitude time1)
3hr_uas                        (longitude latitude time1)
3hr_vas                        (longitude latitude time1)
E3hr_psl                       (longitude latitude time)
E3hrPt_ua7h                    (longitude latitude plev7h time1)
E3hrPt_va7h                    (longitude latitude plev7h time1)
Prim3hr_sfcWind                (longitude latitude height10m time)
Prim3hr_sfcWindmax             (longitude latitude height10m time)
Prim3hrPt_ua100m               (longitude latitude height100m time1)
Prim3hrPt_ua50m                (longitude latitude height50m time1)
Prim3hrPt_va100m               (longitude latitude height100m time1)
Prim3hrPt_va50m                (longitude latitude height50m time1)
Prim3hrPt_zg7h                 (longitude latitude plev7h ti

For SOMMAR, this data request would require the addition of a variable for each ocean biogeochemistry tracer on olevels.

The data volume for 65 years for the 25 km atmosphere and 1/12° ocean is:

In [5]:
# Year range passed to get_request_size (the 65 years quoted above, assuming
# both ends are inclusive).
first_year, last_year = 1950, 2014
# ORM lookup that selects the requests in the ocean tables.
ocean_lookup = {'variable_request__table_name__in': ocean_table_names}

# Atmosphere: everything that is not in an ocean table.
atmos_reqs = hist_reqs.exclude(**ocean_lookup)
n512_atmos_size = get_request_size(atmos_reqs, first_year, last_year)
print(f'25 km atmosphere {filesizeformat(n512_atmos_size)}')

# Ocean: only the requests in the ocean tables.
ocean_reqs = hist_reqs.filter(**ocean_lookup)
ocean_size = get_request_size(ocean_reqs, first_year, last_year)
print(f'1/12° ocean {filesizeformat(ocean_size)}')

# Total, sized from the unsplit set of requests.
total_size = get_request_size(hist_reqs, first_year, last_year)
print(f'total {filesizeformat(total_size)}')

25 km atmosphere 18.7 TB
1/12° ocean 29.1 TB
total 47.8 TB


However, in PRIMAVERA the Met Office ran an N512 model and in SOMMAR it would like to run an N1280 model. Let's see what effect this has on the data volume for this 65 year simulation:

In [6]:
# Scale the N512 atmosphere volume to N1280 by the square of the resolution
# ratio; the ocean volume is used unchanged.
resolution_scale = (1280 / 512) ** 2
n1280_atmos_size = n512_atmos_size * resolution_scale

for label, size in (
    ('10 km atmosphere', n1280_atmos_size),
    ('1/12° ocean', ocean_size),
    ('total', n1280_atmos_size + ocean_size),
):
    print(f'{label} {filesizeformat(size)}')

10 km atmosphere 116.8 TB
1/12° ocean 29.1 TB
total 146.0 TB


## Data volume per year

SOMMAR will have longer runs and so the data volume per year will be:

In [7]:
# The volumes calculated so far are for a 65 year simulation; divide them
# down to a volume per simulated year.
simulation_years = 65
n1280_per_year = n1280_atmos_size / simulation_years
ocean_per_year = ocean_size / simulation_years
total_per_year = n1280_per_year + ocean_per_year

print(f'atmosphere per year {filesizeformat(n1280_per_year)}')
print(f'ocean per year {filesizeformat(ocean_per_year)}')
print(f'total per year {filesizeformat(total_per_year)}')

atmosphere per year 1.8 TB
ocean per year 458.9 GB
total per year 2.2 TB


For a possible run length of 650 years (200 spinup, 200 piControl and 250 historical + future):

In [8]:
# Multiply each per-year volume up to a 650 year run.
for label, per_year in (
    ('atmosphere', n1280_per_year),
    ('ocean', ocean_per_year),
    ('total', total_per_year),
):
    print(f'{label} {filesizeformat(650 * per_year)}')

atmosphere 1.1 PB
ocean 291.3 TB
total 1.4 PB


## Data Volume for a surface variable

In [9]:
# Amon tas from the HadGEM3-GC31-HH hist-1950 r1i1p1f1 simulation is used as
# the example surface variable.
amon_tas = DataRequest.objects.filter(
    climate_model__short_name='HadGEM3-GC31-HH',
    experiment__short_name='hist-1950',
    rip_code='r1i1p1f1',
    variable_request__table_name='Amon',
    variable_request__cmor_name='tas'
)

# Scale the N512 volume to N1280 by the square of the resolution ratio.
n1280_size = get_request_size(amon_tas, 1950, 2014) * (1280 / 512)**2
print(f'65 years of a surface variable at N1280 {filesizeformat(n1280_size)}')
# Volume per year of the 65 year run, multiplied up to a 650 year run.
n1280_650years_size = n1280_size / 65 * 650
print(f'650 years of a surface variable at N1280 {filesizeformat(n1280_650years_size)}')


65 years of a suface variable at N1280 6.2 GB
650 years of a suface variable at N1280 62.5 GB


Therefore, how the data will be analysed needs to be considered, as the 650-year time series for a variable on a single level will be over 60 GB in size. Many post-processing systems won't have this much RAM available to them.

## Data volume per time slice

To allow the estimation of SOMMAR volumes we can calculate the storage required by a single atmosphere and ocean time slice.

In [10]:
# Example files: an atmosphere surface variable, an ocean surface variable
# and an ocean variable on levels.
atmos_filename = 'tas_Amon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195012.nc'
ocean_surface_filename = 'tos_Omon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195001.nc'
ocean_filename = 'vo_Omon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195001.nc'

# Look each file up by name and report its stored size.
atmos_file = DataFile.objects.get(name=atmos_filename)
print(f'{atmos_filename} {filesizeformat(atmos_file.size)}')

ocean_surface_file = DataFile.objects.get(name=ocean_surface_filename)
print(f'{ocean_surface_filename} {filesizeformat(ocean_surface_file.size)}')

ocean_file = DataFile.objects.get(name=ocean_filename)
print(f'{ocean_filename} {filesizeformat(ocean_file.size)}')
print()

# Atmosphere: scale the N512 file to N1280 by the square of the resolution
# ratio, then divide by the 12 months held in the file.
resolution_scale = (1280 / 512) ** 2
sommar_atmos_filesize = resolution_scale * atmos_file.size
sommar_atmos_step_size = sommar_atmos_filesize / 12
print(f'10 km atmos single level single time slice {filesizeformat(sommar_atmos_step_size)}')

# Ocean surface: the file holds a single time point, so its size is used
# as it is.
ocean_surface_step_size = ocean_surface_file.size
print(f'1/12° ocean surface single time slice {filesizeformat(ocean_surface_step_size)}')

# Ocean on levels: divide the file size by its 75 olevels.
ocean_step_size = ocean_file.size / 75
print(f'1/12° ocean single level single time slice {filesizeformat(ocean_step_size)}')

tas_Amon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195012.nc 15.8 MB
tos_Omon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195001.nc 163.7 MB
vo_Omon_HadGEM3-GC31-HH_hist-1950_r1i1p1f1_gn_195001-195001.nc 1.6 GB

10 km atmos single level single time slice 8.2 MB
1/12° ocean surface single time slice 163.7 MB
1/12° ocean single level single time slice 22.4 MB


These values seem quite surprising as there are 3606x4322 points in the ocean file and around 1920x2560 points in the 10 km atmosphere file. However, there is data on all points in the atmosphere, whereas the ocean file only has data on the ocean points (47% of points are masked at the surface and 100% are masked in the bottom layer). The compression used will save space for these masked points. With only a single time point in the ocean surface file, two-thirds of the data will be latitudes and longitudes. Ocean surface storage would have been more efficient if more time points were included in each file.

Lots of effort needs to go into working out the most efficient way to store the data in SOMMAR!