# Data Handling Workflows

Note that Parsl is not effective if multiple CPU cores aren't available, because Parsl's ability to execute tasks in parallel depends on the availability of multiple cores.

In [1]:
import multiprocessing

# Report how many CPU cores multiprocessing sees on this machine.
core_count = multiprocessing.cpu_count()
print('Cores available:', core_count)

Cores available: 4


### Importing Libraries and Configuration

In [2]:
import numpy as np
import random
import time
import pandas as pd
import parsl
import os
import montage_wrapper as montage
from parsl.data_provider.files import File
cwd = os.getcwd()

from parsl.app.app import python_app, bash_app
from parsl.providers import LocalProvider
from parsl.channels import LocalChannel

from parsl.config import Config
from parsl.executors import HighThroughputExecutor

# Provider for the local machine: one block requested at start-up
# (init_blocks=1) and a ceiling of one block (max_blocks=1).
local_provider = LocalProvider(
    channel=LocalChannel(),
    init_blocks=1,
    max_blocks=1,
)

# High-throughput executor with one core assigned to each worker.
htex_local = HighThroughputExecutor(
    label="htex_local",
    cores_per_worker=1,
    provider=local_provider,
)

config = Config(executors=[htex_local])

# Kept as the last expression so the notebook still displays the returned
# DataFlowKernel.
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x118da8d90>

### Map Reduce

Map-reduce is a technique that runs multiple parallel jobs over a dataset to reduce its size before a final function computes the result. It is a more complicated form of synchronisation.

Let's consider a simple example where we are given multiple lists and we want to select the list with the highest standard deviation.

![](./images/map_reduce.png)

In [3]:
@python_app
def standard_deviation(inputs=[]):
    '''
    Parsl python app that returns the standard deviation of ``inputs``,
    as computed by ``numpy.std`` with its default arguments.
    '''
    # The list default is part of the app's signature; it is never mutated here.
    import numpy
    spread = numpy.std(inputs)
    return spread

In [4]:
def make_data():
    '''
    Build the sample dataset: a list of 100 lists, each holding 100 random numbers.

    Every number is ``random.random()*100``, drawn row by row from the
    module-level ``random`` generator.
    '''
    row_count = 100
    row_length = 100
    return [
        [random.random()*100 for __ in range(row_length)]
        for _ in range(row_count)
    ]

our_data = make_data()

### Parallel Execution

In [5]:
# Map step: submit one standard_deviation app call per list. Each call
# returns a future immediately, so the submissions do not wait on each other.
start1 = time.time()

std_futures = [standard_deviation(inputs=row) for row in our_data]

# Reduce step: wait for every future, then pick the largest value.
standard_deviations = [future.result() for future in std_futures]
maximum = max(standard_deviations)
print('Maximum Standard Deviation: ', maximum)

# Position of the list that produced the maximum.
print('Target List Number:', standard_deviations.index(maximum))

end1 = time.time()

Maximum Standard Deviation:  31.65997545363917
Target List Number: 44


# Hashing

We'll be using a simple hash function to store elements in our database. We'll evaluate the hash values in parallel and then store the items in those locations.

In [6]:
# An empty database: 1000 slots, each initialised to 0.
database = [0] * 1000

In [7]:
@python_app
def hash_function(element, table_size=1000):
    '''
    Map ``element`` to a slot index in a hash table with ``table_size`` slots.

    The index is the first 8 hex digits of the MD5 digest of ``element``,
    read as a base-16 integer and reduced modulo ``table_size``.

    element: bytes-like value to hash (``hashlib.md5`` does not accept ``str``).
    table_size: number of slots in the target table. Defaults to 1000, the
        value that was previously hard-coded here.

    Returns the slot index as an int.
    '''
    import hashlib

    digest_prefix = hashlib.md5(element).hexdigest()[:8]
    return int(digest_prefix, 16) % table_size

### Parallel Execution

In [8]:
import random
import string

# Build 100 random five-letter elements. string.ascii_lowercase replaces the
# hand-typed alphabet, which was missing the letter 'n'.
elements = []

for _ in range(100):
    letters = ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
    # Encoded to bytes because hash_function passes the element to hashlib.md5.
    elements.append(letters.encode())

start1 = time.time()

# Submit every hash computation first so the app calls can run in parallel,
# then wait for all of the results.
hash_futures = [hash_function(element) for element in elements]
hashes = [future.result() for future in hash_futures]

# Store each element in the slot its hash selected. Two elements that share a
# slot still overwrite one another; collisions are not handled here.
for slot, element in zip(hashes, elements):
    database[slot] = element

end1 = time.time()

Note that this still doesn't solve the problem of collisions: when two elements hash to the same slot, the later one overwrites the earlier one. Chaining is the usual alternative, but in order to implement chaining we have to evaluate the results, which breaks the parallel execution.

## Montage Mosaic

This Python script is inspired by the [Montage Wrapper Documentation](https://montage-wrapper.readthedocs.io/en/v0.9.5) and the [tutorial](http://montage.ipac.caltech.edu/docs/first_mosaic_tutorial.html) for the Montage Mosaic.

### First Part

In [12]:
from IPython.utils import io

with io.capture_output() as captured: 
    '''
    Packaging all the non-parallel commands inside a captured output to prevent printing any outputs here.
    '''
    !tar xvf Kimages.tar
    montage.mImgtbl(os.path.join(cwd,'Kimages/'),  File(os.path.join(cwd,'Kimages.tbl')))
    montage.mMakeHdr(File(os.path.join(cwd,'Kimages.tbl')), File(os.path.join(cwd,'Ktemplate.hdr')))
    os.mkdir(os.path.join(cwd,'Kprojdir/'))

Implementation of mProjExec in Parsl

In [None]:
@python_app
def mProject_parsl(inputs=[], outputs=[]):
    '''
    Parsl app that runs montage.mProject on a single image.

    inputs[0] is the image to project and inputs[1] is the template header;
    outputs[0] is the path passed to mProject as the output image.
    Returns whatever montage.mProject returns.
    '''
    import montage_wrapper as montage

    source_image = inputs[0]
    projected_image = outputs[0]
    template_header = inputs[1]
    return montage.mProject(source_image, projected_image, template_header)

In [14]:
list_of_images = os.listdir(os.path.join(cwd,'Kimages/'))

# Submit one mProject_parsl call per entry in Kimages/. Every call gets the
# input image and the shared template header as inputs, and the matching
# 'hdu0_'-prefixed path under Kprojdir/ as its output.
projection_futures = []

for image in list_of_images:
    input_image = File(os.path.join(cwd, 'Kimages/' + image))
    output_image = File(os.path.join(cwd, 'Kprojdir/hdu0_' + image))
    template = File(os.path.join(cwd,'Ktemplate.hdr'))

    projection_futures.append(
        mProject_parsl(inputs=[input_image, template], outputs=[output_image])
    )

# Wait for every projection to finish and collect the results.
output = [future.result() for future in projection_futures]
    
'''
If the function wasn't run in parallel, it would have looked like this:

montage.mProjExec(File(os.path.join(cwd,'Kimages.tbl')),
                  File(os.path.join(cwd,'Ktemplate.hdr')),
                  os.path.join(cwd,'Kprojdir/'),
                  File(os.path.join(cwd,'stats.tbl')))
'''

Final non-parallel section of the First part of Montage Mosaic

In [15]:
with io.capture_output() as captured2:
    '''
    Packaging all the non-parallel commands inside a captured output to prevent printing any outputs here.
    '''
    montage.mImgtbl(os.path.join(cwd,'Kprojdir/'), File(os.path.join(cwd,'images.tbl')))
    montage.mAdd( File(os.path.join(cwd,'images.tbl')), 
                  File(os.path.join(cwd,'Ktemplate.hdr')), 
                  File(os.path.join(cwd,'m17_uncorrected.fits')))
    !mViewer -ct 1 -gray m17_uncorrected.fits -1s max gaussian-log -out m17_uncorrected.png

'''
The markdown image below pulls the uncorrected image file:  m17_uncorrected.png
'''

![](./images/m17_uncorrected.png)

### Second Part

Initial non-parallel section

In [16]:
with io.capture_output() as captured:
    # All the non-parallel commands are wrapped in a captured output so that
    # nothing they print shows up in the notebook.

    # Write diffs.tbl from images.tbl; it is the table the mDiff step reads.
    montage.mOverlaps(File(os.path.join(cwd,'images.tbl')), File(os.path.join(cwd,'diffs.tbl')))
    # exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
    # as soon as diffdir/ existed from an earlier run.
    os.makedirs(os.path.join(cwd,'diffdir/'), exist_ok=True)

Implementation of mDiffExec in Parsl

In [17]:
@python_app
def mDiff_parsl(inputs=[], outputs=[]):
    '''
    Parsl app that runs montage.mDiff on one pair of images.
    Calling it once per pair replaces the mDiffExec function.

    inputs[0] and inputs[1] are the two images, inputs[2] is the template
    header, and outputs[0] is the path passed to mDiff as the output image.
    Returns whatever montage.mDiff returns.
    '''
    import montage_wrapper as montage

    first_image = inputs[0]
    second_image = inputs[1]
    difference_image = outputs[0]
    template_header = inputs[2]
    return montage.mDiff(first_image, second_image, difference_image, template_header)

In [18]:
# Data processing needed to feed individual images into the Parsl function
# for mDiff: for each overlap we extract the two input image names and the
# name of the output (difference) image.
#
# The table is split on whitespace, header row included, so the '|'
# separators in the header become column names of their own (pandas renames
# the repeats to '|.1', '|.2', ...) and the labels no longer line up with the
# data. NOTE(review): presumably '|.1' and 'cntr2' hold the two input image
# names and '|.2' the difference image name -- verify against diffs.tbl.
#
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the file identically.
# .drop(0) removes the row labelled 0 (presumably the column-type row).
df = pd.read_csv('diffs.tbl', comment='#', sep=r'\s+').drop(0)
images1 = list(df['|.1'])
images2 = list(df['cntr2'])
outputs = list(df['|.2'])

In [19]:
# Submit one mDiff_parsl call per overlapping pair. Each call gets the two
# projected images and the template header as inputs, and the matching file
# under diffdir/ as its output.
diff_futures = []

for first_name, second_name, diff_name in zip(images1, images2, outputs):
    image1 = File(os.path.join(cwd,'Kprojdir/' + first_name))
    image2 = File(os.path.join(cwd,'Kprojdir/' + second_name))
    output_file = File(os.path.join(cwd,'diffdir/' + diff_name))
    template = File(os.path.join(cwd,'Ktemplate.hdr'))

    diff_futures.append(
        mDiff_parsl(inputs=[image1, image2, template], outputs=[output_file])
    )

# Wait for every difference image to finish and collect the results.
outputs_2 = [future.result() for future in diff_futures]

'''
If the function wasn't run in parallel, it would have looked like this:

montage.mDiffExec(File(os.path.join(cwd,'diffs.tbl')), 
                  File(os.path.join(cwd,'Ktemplate.hdr')), 
                  os.path.join(cwd,'diffdir/'),
                  proj_dir=os.path.join(cwd,'Kprojdir/'))
'''

Non-parallel components after mDiffExec

In [20]:
with io.capture_output() as captured:
    # All the non-parallel commands are wrapped in a captured output so that
    # nothing they print shows up in the notebook.

    # Fit the difference images listed in diffs.tbl and write fits.tbl.
    montage.mFitExec(File(os.path.join(cwd,'diffs.tbl')), File(os.path.join(cwd,'fits.tbl')), 
                     os.path.join(cwd,'diffdir/'))
    # Turn images.tbl and fits.tbl into corrections.tbl.
    montage.mBgModel(File(os.path.join(cwd,'images.tbl')), File(os.path.join(cwd,'fits.tbl')), 
                 File(os.path.join(cwd,'corrections.tbl')))

    # exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
    # as soon as corrdir existed from an earlier run.
    os.makedirs(os.path.join(cwd,'corrdir'), exist_ok=True)

Implementation of mBgExec in Parsl

In [21]:
# Data processing needed to feed individual images into the Parsl function
# for mBackground: we read the correction values for every image together
# with the image id used to match them, and the image table that gives the
# location of each image.

# comment='|' skips the '|'-delimited header lines, so the first line pandas
# sees is already a data row. header=None with explicit names keeps that row
# as data. The earlier code let it become the header and re-inserted it at the
# hard-coded label 90, which overwrites a real row once the table has more
# than 90 rows.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True.
corrections = pd.read_csv(
    'corrections.tbl',
    comment='|',
    sep=r'\s+',
    header=None,
    names=['id', 'a', 'b', 'c'],
)

# Ids are compared against integer positions later, so keep them as ints.
# One vectorised cast replaces the per-row chained assignment.
corrections['id'] = corrections['id'].astype(int)

images_table = pd.read_csv('images.tbl', comment='|', sep=r'\s+')

In [22]:
@python_app
def mBackground_parsl(inputs=[], outputs=[]):
    '''
    Parsl app that runs montage.mBackground on a single image to correct it.
    Calling it once per image replaces the mBgExec function.

    inputs[0] is the image to correct and inputs[1], inputs[2], inputs[3] are
    its three correction values; outputs[0] is the path passed to mBackground
    as the output image. Returns whatever montage.mBackground returns.
    '''
    import montage_wrapper as montage

    source_image = inputs[0]
    corrected_image = outputs[0]
    coeff_a = inputs[1]
    coeff_b = inputs[2]
    coeff_c = inputs[3]
    return montage.mBackground(source_image, corrected_image, coeff_a, coeff_b, coeff_c)

In [24]:
# Submit one mBackground_parsl call per row of the image table. Each call
# gets the input image and its three correction values as inputs, and the
# same file name under corrdir as its output.
background_futures = []

# Materialise the path column once instead of rebuilding the list per row.
# NOTE(review): the 'fitshdr' column presumably holds each projected image's
# path -- confirm against images.tbl.
projected_paths = list(images_table['fitshdr'])

for i, input_image in enumerate(projected_paths):
    file_name = input_image.replace(cwd + '/Kprojdir/', '')
    output_image = os.path.join(cwd + '/corrdir',file_name)
    # Row of corrections whose id equals this image's position: [id, a, b, c].
    correction_values = list(corrections.loc[ corrections['id'] == i ].values[0])
    background_futures.append(
        mBackground_parsl(
            inputs=[File(input_image), correction_values[1], correction_values[2], correction_values[3]],
            outputs=[File(output_image)],
        )
    )

# Wait for every corrected image to finish and collect the results.
outputs_mb = [future.result() for future in background_futures]

'''
If the function wasn't run in parallel, it would have looked like this:

montage.mBgExec( File(os.path.join(cwd,'images.tbl')), 
                 File(os.path.join(cwd,'corrections.tbl')), 
                 os.path.join(cwd,'corrdir'), 
                 proj_dir=os.path.join(cwd,'Kprojdir'))
'''

Final non-parallel component of the Montage Mosaic

In [25]:
with io.capture_output() as captured:
    '''
    Packaging all the non-parallel commands inside a captured output to prevent printing any outputs here.
    '''
    montage.mAdd(File(os.path.join(cwd,'images.tbl')), 
             File(os.path.join(cwd,'Ktemplate.hdr')), 
             File(os.path.join(cwd,'m17.fits')))
    !mViewer -ct 1 -gray m17.fits -1s max gaussian-log -out m17.png

![](./images/m17.png)