In [1]:
# This is an exploratory notebook to examine algorithm performance. At this point,
# I am testing memory consumption and operational runtime for a couple of Python processes.
# The basic idea is build out a suite of testing functions to evaluate submitted code snippets, 
# as either stand-alone scripts or official Python packages/modules. 
# Ultimately, tests will include examination of content.
#
# The justification for this work includes:
#   1) Accepted code must be memory conservative and 
#      function correctly (as intended).
#   2) Different technologies will be become of interest periodically. We'll need to determine
#      which algorithms are faster, which consume less memory, which produce fewer (lower) errors, etc.
#      


from netCDF4 import Dataset
from memory_profiler import profile
import xarray as xr
import numpy
import os
import io
import yaml
import sys
from pyquickhelper.loghelper import run_cmd
from mlstatpy.nlp.completion import CompletionTrieNode
from line_profiler import LineProfiler

print ('loaded defs')

loaded defs


In [13]:
%%file blank_memoryprofiler.py

# memory usage background + import packages

from memory_profiler import profile

@profile(precision=1)
def test_df():
    return 'test complete'

if __name__ == '__main__':
    test_df()

Overwriting blank_memoryprofiler.py


In [14]:
%%file forloop_memoryprofiler.py

# memory usage merely open netcdf file

from memory_profiler import profile
import xarray as xr
import yaml

# read yaml input file
with open('file_inputs.yml', 'r') as f:
    inputdict = yaml.load(f)
    q_fmt = inputdict['q_fmt']
    chunksize = inputdict['chunksize']
    ncfile = inputdict['ncfile']
    rconpath = inputdict['rconpath']
    fileread = inputdict['fileread']
    
    # read netcdf file using dask arrays
    if chunksize > 0:
        xr_chk = chunksize
    else:
        xr_chk = {}
        
    if fileread:
        ds = xr.open_dataset(ncfile, chunks = xr_chk)
    else:
        ds = 0
    

@profile(precision=1)
def test_df(input_file):
    return 'test complete'

if __name__ == '__main__':
    test_df('file_inputs.yml')

Overwriting forloop_memoryprofiler.py


In [18]:
# Test memory consumption for different chunking sizes (table rows)


# ** test run 0: background + memory monitor package

cmd = sys.executable
cmd += " -m memory_profiler blank_memoryprofiler.py "
out, err = run_cmd(cmd, wait=True)
print("\n{}".format('Memory use background + import memory monitor'))
print("\n{}".format(out))
print("\n{}".format(err))


# ** test run 1: For loop ONLY

# yaml input file
input_data = {'chunksize': 10000,
        'rconpath' : '/Users/twellman/erddap_data/nc/test/appended.csv',
        'ncfile' : '/Users/twellman/erddap_data/nc/NOAA_CoralReefMonitoring_LPIPercentCover_occurrence_redo_tpw_redo_tpw.nc',
        'q_fmt' : ['"', '"{}"', "'"],
        'fileread': False}

with open('file_inputs.yml', 'w') as outfile:
    yaml.dump(input_data, outfile, default_flow_style=False)

cmd = sys.executable
cmd += " -m memory_profiler forloop_memoryprofiler.py "
out, err = run_cmd(cmd, wait=True)
print("\n{}".format('Memory use import packages + blank for loop'))
print("\n{}".format(out))
print("\n{}".format(err))


# ** test run 2: For loop + ~40mb netcdf file read

# yaml input file
input_data = {'chunksize': False,
        'rconpath' : '/Users/twellman/erddap_data/nc/test/appended.csv',
        'ncfile' : '/Users/twellman/erddap_data/nc/NOAA_CoralReefMonitoring_LPIPercentCover_occurrence_redo_tpw_redo_tpw.nc',
         'q_fmt' : ['"', '"{}"', "'"],
        'fileread': True}

with open('file_inputs.yml', 'w') as outfile:
    yaml.dump(input_data, outfile, default_flow_style=False)

cmd = sys.executable
cmd += " -m memory_profiler forloop_memoryprofiler.py "
out, err = run_cmd(cmd, wait=True)
print("\n{}".format('Memory use for loop + netcdf lazy file read (~40mb)'))
print("\n{}".format(out))
print("\n{}".format(err))


Memory use background + import memory monitor

Filename: blank_memoryprofiler.py

Line #    Mem usage    Increment   Line Contents
     8     39.1 MiB      0.0 MiB   @profile(precision=1)
     9                             def test_df():
    10     39.1 MiB      0.0 MiB       return 'test complete'






Memory use import packages + blank for loop

Filename: forloop_memoryprofiler.py

Line #    Mem usage    Increment   Line Contents
    29     74.0 MiB      0.0 MiB   @profile(precision=1)
    30                             def test_df(input_file):
    31     74.0 MiB      0.0 MiB       return 'test complete'






Memory use for loop + netcdf lazy file read (~40mb)

Filename: forloop_memoryprofiler.py

Line #    Mem usage    Increment   Line Contents
    29     80.1 MiB      0.0 MiB   @profile(precision=1)
    30                             def test_df(input_file):
    31     80.1 MiB      0.0 MiB       return 'test complete'







In [42]:
%%file memoryprofiler.py

# Create python script of netcdf to csv processsing function.

from netCDF4 import Dataset
from memory_profiler import profile
import xarray as xr
import numpy
import os
import sys
import yaml

    
# Add netcdf blocks to csv
@profile(precision=1)
def build_df(input_file):
    
    # read yaml inputs
    with open(input_file, 'r') as f:
        inputdict = yaml.load(f)
        q_fmt = inputdict['q_fmt']
        chunksize = inputdict['chunksize']
        ncfile = inputdict['ncfile']
        rconpath = inputdict['rconpath']

    # read netcdf file using dask arrays
    if chunksize > 0:
        xr_chk = chunksize
    else:
        xr_chk = {}
    ds = xr.open_dataset(ncfile, chunks = xr_chk)

    # determine data processing step - multiple chunks or complete read
    nrows = len(ds.variables['index'])
    if chunksize > 0:
        ndiv = int(numpy.ceil(nrows/chunksize))
        csize = chunksize
    else:
        ndiv = 1
        csize = nrows
    
    # convert netcdf to csv through dataframe manipulation
    os.remove(rconpath) 
    if ndiv == 1:
        print('Dataset processed in one read: ')
        with open(rconpath, 'a') as csvfile:
            df = ds.to_dataframe()
            df.to_csv(csvfile, header=True)
    else:
        print('Dataset processed in chunks: ', ndiv)
        f = True
        with open(rconpath, 'a') as csvfile:
            for n in numpy.arange(0, ndiv): 
                if f:
                    sub = ds.sel(index=slice(n*csize, (n+1)*csize-1))
                    df = sub.to_dataframe()
                    df.to_csv(csvfile, header=True)
                    f = False
                else:
                    sub = ds.sel(index=slice(n*csize, (n+1)*csize-1))
                    df = sub.to_dataframe()
                    df.to_csv(csvfile, header=False)

if __name__ == '__main__':
    build_df('file_inputs.yml')  


Overwriting memoryprofiler.py


In [43]:

# create initial yaml input file, save to local folder
input_data = {'chunksize': 0,
        'rconpath' : '/Users/twellman/erddap_data/nc/test/appended.csv',
        'ncfile' : '/Users/twellman/erddap_data/nc/NOAA_CoralReefMonitoring_LPIPercentCover_occurrence_redo_tpw_redo_tpw.nc',
         'q_fmt' : ['"', '"{}"', "'"] }

# Test memory consumption for different chunking sizes (table rows)
for c in [False, 10000, 1000]:
    input_data['chunksize'] = c
    with open('file_inputs.yml', 'w') as outfile:
        yaml.dump(input_data, outfile, default_flow_style=False)
    input_data.keys()
    cmd = sys.executable
    cmd += " -m memory_profiler memoryprofiler.py "
    out, err = run_cmd(cmd, wait=True)
    print("\n{}".format(out))


Dataset processed in one read: 
Filename: memoryprofiler.py

Line #    Mem usage    Increment   Line Contents
    14     78.3 MiB      0.0 MiB   @profile(precision=1)
    15                             def build_df(input_file):
    16                                 
    17                                 # read yaml inputs
    18     78.3 MiB      0.0 MiB       with open(input_file, 'r') as f:
    19     78.4 MiB      0.1 MiB           inputdict = yaml.load(f)
    20     78.4 MiB      0.0 MiB           q_fmt = inputdict['q_fmt']
    21     78.4 MiB      0.0 MiB           chunksize = inputdict['chunksize']
    22     78.4 MiB      0.0 MiB           ncfile = inputdict['ncfile']
    23     78.4 MiB      0.0 MiB           rconpath = inputdict['rconpath']
    24                             
    25                                 # read netcdf file using dask arrays
    26     78.4 MiB      0.0 MiB       if chunksize > 0:
    27                                     xr_chk = chunksize
    28

In [37]:
# repeat function in notebook cell to run line profiler


# Add netcdf blocks to csv
def build_df2(input_file):
    
    # read yaml inputs
    with open(input_file, 'r') as f:
        inputdict = yaml.load(f)
        q_fmt = inputdict['q_fmt']
        chunksize = inputdict['chunksize']
        ncfile = inputdict['ncfile']
        rconpath = inputdict['rconpath']

    # read netcdf file using dask arrays
    if chunksize > 0:
        xr_chk = chunksize
    else:
        xr_chk = {}
    ds = xr.open_dataset(ncfile, chunks = xr_chk)

    # determine data processing step - multiple chunks or complete read
    nrows = len(ds.variables['index'])
    if chunksize > 0:
        ndiv = int(numpy.ceil(nrows/chunksize))
        csize = chunksize
    else:
        ndiv = 1
        csize = nrows
    
    # convert netcdf to csv through dataframe manipulation
    os.remove(rconpath) 
    if ndiv == 1:
        print('Dataset processed in one read: ')
        with open(rconpath, 'a') as csvfile:
            df = ds.to_dataframe()
            df.to_csv(csvfile, header=True)
    else:
        print('Dataset processed in chunks: ', ndiv)
        f = True
        with open(rconpath, 'a') as csvfile:
            for n in numpy.arange(0, ndiv): 
                if f:
                    sub = ds.sel(index=slice(n*csize, (n+1)*csize-1))
                    df = sub.to_dataframe()
                    df.to_csv(csvfile, header=True)
                    f = False
                else:
                    sub = ds.sel(index=slice(n*csize, (n+1)*csize-1))
                    df = sub.to_dataframe()
                    df.to_csv(csvfile, header=False)
                    
print('def loaded')

def loaded


In [41]:
# create initial yaml input file, save to local folder
input_data = {'chunksize': 100000,
        'rconpath' : '/Users/twellman/erddap_data/nc/test/appended.csv',
        'ncfile' : '/Users/twellman/erddap_data/nc/NOAA_CoralReefMonitoring_LPIPercentCover_occurrence_redo_tpw_redo_tpw.nc',
         'q_fmt' : ['"', '"{}"', "'"] }

with open('file_inputs.yml', 'w') as outfile:
    yaml.dump(input_data, outfile, default_flow_style=False)
    

prof = LineProfiler()
prof.add_function(build_df2)
prof.run("build_df2('file_inputs.yml')")
st = io.StringIO()
prof.print_stats(stream=st)
rem = os.path.normpath(os.path.join(os.getcwd(), "..", "..", ".."))
res = st.getvalue().replace(rem, "")
print(res)

Dataset processed in one read: 
Timer unit: 1e-06 s

Total time: 2.25438 s
File: <ipython-input-37-ea7267e9f5f6>
Function: build_df2 at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           def build_df2(input_file):
     6                                               
     7                                               # read yaml inputs
     8         1          169    169.0      0.0      with open(input_file, 'r') as f:
     9         1         6407   6407.0      0.3          inputdict = yaml.load(f)
    10         1            2      2.0      0.0          q_fmt = inputdict['q_fmt']
    11         1            1      1.0      0.0          chunksize = inputdict['chunksize']
    12         1            1      1.0      0.0          ncfile = inputdict['ncfile']
    13         1          169    169.0      0.0          rconpath = inputdict['rconpath']
    14                                           
    15               

In [40]:
# create initial yaml input file, save to local folder
input_data = {'chunksize': False,
        'rconpath' : '/Users/twellman/erddap_data/nc/test/appended.csv',
        'ncfile' : '/Users/twellman/erddap_data/nc/NOAA_CoralReefMonitoring_LPIPercentCover_occurrence_redo_tpw_redo_tpw.nc',
         'q_fmt' : ['"', '"{}"', "'"] }

with open('file_inputs.yml', 'w') as outfile:
    yaml.dump(input_data, outfile, default_flow_style=False)
    

prof = LineProfiler()
prof.add_function(build_df2)
prof.run("build_df2('file_inputs.yml')")
st = io.StringIO()
prof.print_stats(stream=st)
rem = os.path.normpath(os.path.join(os.getcwd(), "..", "..", ".."))
res = st.getvalue().replace(rem, "")
print(res)

Dataset processed in one read: 
Timer unit: 1e-06 s

Total time: 2.17493 s
File: <ipython-input-37-ea7267e9f5f6>
Function: build_df2 at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           def build_df2(input_file):
     6                                               
     7                                               # read yaml inputs
     8         1          178    178.0      0.0      with open(input_file, 'r') as f:
     9         1         5607   5607.0      0.3          inputdict = yaml.load(f)
    10         1            7      7.0      0.0          q_fmt = inputdict['q_fmt']
    11         1            1      1.0      0.0          chunksize = inputdict['chunksize']
    12         1            1      1.0      0.0          ncfile = inputdict['ncfile']
    13         1          709    709.0      0.0          rconpath = inputdict['rconpath']
    14                                           
    15               