Skip to content

Commit

Permalink
Merge pull request #102 (Stage 093_datasetsFormat)
Browse files Browse the repository at this point in the history
Added 093_datasetsFormat stage

The main purpose of this stage is to add dataset format parameter
to the JSON string of each dataset.
  • Loading branch information
mgolosova committed Feb 2, 2018
2 parents 9fcb524 + f0b7620 commit 8c5ca49
Show file tree
Hide file tree
Showing 6 changed files with 280 additions and 0 deletions.
18 changes: 18 additions & 0 deletions Utils/Dataflow/093_datasetsFormat/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
=============
* Stage 093 *
=============

1. Description
--------------
Update dataset metadata:
* add "data_format" field (field value type: list)


2. Input
--------
Comes from stage 091.


3. Output
---------
Same as input, with additional field(s).
79 changes: 79 additions & 0 deletions Utils/Dataflow/093_datasetsFormat/datasets_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/env python
"""
Add 'data_format' field, extracted from datasetname
"""

import sys
import os
import re

# Directory containing this script; used to locate the pyDKB library.
base_dir = os.path.abspath(os.path.dirname(__file__))

try:
    # The parent directory is added to the module search path before
    # importing pyDKB (presumably that is where the library lives).
    dkb_dir = os.path.join(base_dir, os.pardir)
    sys.path.append(dkb_dir)
    import pyDKB
except Exception as err:
    # 'except ... as' is valid for both Python 2.6+ and Python 3.
    sys.stderr.write("(ERROR) Failed to import pyDKB library: %s\n" % err)
    sys.exit(1)


def main(argv):
    """ Program body.

    Create a JSON processor stage, configure it from the command line
    arguments, run it with `process` as the message handler and terminate
    the program via `sys.exit()`: exit code is 0 on success and 2 when
    a dataflow or runtime error was raised.

    :param argv: command line arguments (without the program name)
    """
    stage = pyDKB.dataflow.stage.JSONProcessorStage()
    exit_code = 0
    try:
        stage.parse_args(argv)
        stage.process = process
        stage.run()
    except (pyDKB.dataflow.exceptions.DataflowException, RuntimeError) as err:
        # Multi-line messages get every continuation line prefixed
        # with '(==) '.
        if str(err):
            str_err = str(err).replace("\n", "\n(==) ")
            sys.stderr.write("(ERROR) %s\n" % str_err)
        exit_code = 2
    finally:
        # The stage is stopped whether or not processing succeeded.
        stage.stop()

    sys.exit(exit_code)


def process(stage, message):
    """ Handle a single input message.

    Take the message content, store the result of `dataset_format()`
    for its 'datasetname' value under the 'data_format' key and pass
    the updated content to the stage output as a new JSON message.

    :param stage: stage object used to output the result
    :param message: input message
    :return: True
    """
    content = message.content()
    formats = dataset_format(content.get('datasetname'))
    content["data_format"] = formats
    out_message = pyDKB.dataflow.messages.JSONMessage(content)
    stage.output(out_message)

    return True


def dataset_format(datasetname):
    """ Extract data format from datasetname.

    According to the dataset naming nomenclature:
    https://dune.bnl.gov/w/images/9/9e/Gen-int-2007-001_%28NOMENCLATURE%29.pdf
    for MC datasets:
        mcNN_subProject.datasetNumber.physicsShort.prodStep.dataType.Version
    for Real Data:
        DataNN_subProject.runNumber.streamName.prodStep.dataType.Version
    In both cases the dataType field (the fifth dot-separated one)
    is required.

    In case of complex data formats, like 'DAOD_SUSY5', the result contains
    both the full name of the format and its first '_'-separated part
    ('DAOD'), defining the general name of the data format.

    :param datasetname: dataset name (string) or None
    :return: list of data format names; None if the dataset name is empty
             or has no dataType field (less than five fields)
    """
    if not datasetname:
        return None
    fields = datasetname.split('.')
    # Names that do not follow the nomenclature have no dataType field;
    # report "no format" instead of failing with IndexError.
    if len(fields) < 5:
        return None
    ds_format = fields[4]
    if re.match(r"\w+_\w+", ds_format) is not None:
        return [ds_format, ds_format.split('_')[0]]
    return [ds_format]


if __name__ == '__main__':
    # Run as a script: pass the command line arguments
    # (without the program name) to the program body.
    cli_args = sys.argv[1:]
    main(cli_args)
1 change: 1 addition & 0 deletions Utils/Dataflow/093_datasetsFormat/input
Loading

0 comments on commit 8c5ca49

Please sign in to comment.