
Oracle-ES consistency #240

Merged: 28 commits, merged May 28, 2019.

Changes shown are from 5 of the 28 commits.

Commits:
9903938
Add consistency query to stage 009.
Evildoor Mar 27, 2019
42d55b3
Add script for consistency check to stage 069.
Evildoor Mar 29, 2019
f27f278
Get index name from config rather than code.
Evildoor Apr 2, 2019
af9f212
Check for index' existence before working.
Evildoor Apr 2, 2019
4c560a6
Update documentation.
Evildoor Apr 2, 2019
2296f6d
Add a very basic consistency check script.
Evildoor Apr 2, 2019
b8a2ab1
Generalize 069-consistency.
Evildoor Apr 4, 2019
acfe45a
Save and display the info about different tasks.
Evildoor Apr 5, 2019
e39efe2
Merge remote-tracking branch 'origin/master' into oracle-es-consistency
Evildoor Apr 5, 2019
8a71791
Move certain shell functions to library.
Evildoor Apr 5, 2019
90380a9
Remove DEBUG mode.
Evildoor Apr 17, 2019
7bac202
Move ES consistency script into a separate stage.
Evildoor Apr 17, 2019
12dd86e
Update a query description.
Evildoor Apr 18, 2019
944b5a2
Update and explain a magic number.
Evildoor Apr 18, 2019
26a1dfe
Reword es_connect() description.
Evildoor Apr 18, 2019
20875b1
Change log prefixes to standard ones.
Evildoor Apr 18, 2019
46cf0af
Fix pop() results handling.
Evildoor Apr 18, 2019
72d85a9
Update ES parameters handling.
Evildoor Apr 18, 2019
cacba11
Remove batching of inconsistent records.
Evildoor Apr 18, 2019
4d8eb83
Merge remote-tracking branch 'origin/master' into oracle-es-consistency
Evildoor Apr 19, 2019
181fb14
Add consistency data samples.
Evildoor Apr 19, 2019
7117242
Update the dataflow README.
Evildoor Apr 19, 2019
165c5d2
Ignore two additional fields.
Evildoor May 21, 2019
8f84d22
Change messages formatting.
Evildoor May 22, 2019
d195650
Add _parent field handling.
Evildoor May 22, 2019
612bf52
Remove service fields before checking.
Evildoor May 22, 2019
01ae258
Remove interpreter directives from lib files.
Evildoor May 22, 2019
8f86ddd
Simplify a field retrieval.
Evildoor May 28, 2019
2 changes: 2 additions & 0 deletions Utils/Dataflow/009_oracleConnector/README
@@ -10,6 +10,8 @@ them as NDJSON.
Currently works with specific set of queries only:
* prodsys2ES + datasets
* prodsys2ES
* consistency: simplified query that only obtains taskid and task_timestamp
for each task

The goal is to make it work with any number and combination of queries.

30 changes: 30 additions & 0 deletions Utils/Dataflow/009_oracleConnector/query/consistency.sql
@@ -0,0 +1,30 @@
-- Select all tasks for specified period of time
-- Query tables:
------------------------------------------------
-- ATLAS_DEFT.t_production_task
-- ATLAS_DEFT.t_production_step
-- ATLAS_DEFT.t_step_template
-- ATLAS_DEFT.t_ht_to_task
-- ATLAS_DEFT.t_hashtag
-- ATLAS_PANDA.jedi_datasets
--
-- All fields:
-- architecture, campaign, cloud, conditions_tags, core_count, description, end_time,
-- energy_gev, evgen_job_opts, geometry_version, hashtag_list, job_config, physics_list, processed_events,
-- phys_group, project, pr_id, requested_events, run_number, site, start_time, step_name, status, subcampaign,
-- taskid, taskname, task_timestamp, ticket_id, trans_home, trans_path, trans_uses, trigger_config, user_name, vo,
-- n_files_per_job, n_events_per_job, n_files_to_be_used,

-- RESTRICTIONS:
-- 1. taskID must be greater than 4 000 000 OR the task must date from after 12-03-2014
-- 2. we collect only PRODUCTION tasks OR only ANALYSIS tasks
-- ('pr_id > 300' or 'pr_id = 300')
SELECT DISTINCT
t.taskid,
TO_CHAR(t.timestamp, 'dd-mm-yyyy hh24:mi:ss') AS task_timestamp
FROM
ATLAS_DEFT.t_production_task t
WHERE
t.timestamp > :start_date AND
t.timestamp <= :end_date AND
t.pr_id %(production_or_analysis_cond)s 300
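For illustration only (not part of this PR): the query mixes Oracle bind variables (:start_date, :end_date) with a Python %-format placeholder (%(production_or_analysis_cond)s) that must be substituted before execution. A minimal sketch of how Stage 009 might run it, assuming cx_Oracle and placeholder connection details:

# Hypothetical sketch, assuming cx_Oracle; connection details are placeholders.
from datetime import datetime
import cx_Oracle

with open('query/consistency.sql') as f:
    sql = f.read()

# Substitute the Python-format placeholder first: '>' selects production tasks
# (pr_id > 300), '=' selects analysis tasks (pr_id = 300).
sql = sql % {'production_or_analysis_cond': '>'}

conn = cx_Oracle.connect('user/password@host:1521/service')  # placeholder DSN
cur = conn.cursor()
# :start_date and :end_date remain Oracle bind variables.
cur.execute(sql, {'start_date': datetime(2019, 3, 1),
                  'end_date': datetime(2019, 4, 1)})
for taskid, task_timestamp in cur:
    print('%s %s' % (taskid, task_timestamp))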
20 changes: 19 additions & 1 deletion Utils/Dataflow/069_upload2es/README
@@ -4,7 +4,7 @@

Description
-----------
Uploads prepared data to ElasticSearch.
load_data.sh uploads prepared data to ElasticSearch.

Input
-----
@@ -18,6 +18,24 @@ JSON documents, one per line:
...
}}}

Consistency
-----------
consistency.py checks that the data is present in ElasticSearch instead of
uploading it. Input comes from Stage 009 (in consistency mode) and for now only
two fields are needed:
{{{
{taskid, task_timestamp}
...
}}}
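
For illustration, a single input record might look like this (hypothetical
values; the timestamp follows the dd-mm-yyyy hh24:mi:ss format produced by the
stage 009 consistency query):
{{{
{"taskid": 1468216, "task_timestamp": "27-03-2019 12:34:56"}
}}}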

The consistency check can be run as follows:

./consistency.py --conf elasticsearch_config
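
The elasticsearch_config file is read by load_config() in consistency.py as
plain key=value lines; a minimal example (all values are placeholders) might
look like:
{{{
ES_HOST=localhost
ES_PORT=9200
ES_USER=dkb
ES_PASSWORD=secret
ES_INDEX=tasks_index
}}}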

For more information about running the check and its arguments, use:

./consistency.py -h

TODO
----
Make the stage aware of EOProcess/EOMessage markers
199 changes: 199 additions & 0 deletions Utils/Dataflow/069_upload2es/consistency.py
@@ -0,0 +1,199 @@
#!/bin/env python
'''
Script for checking the supplied task's presence in elasticsearch.

Currently it performs the check by comparing the supplied timestamp
with the one in elasticsearch.

Authors:
Vasilii Aulov (vasilii.aulov@cern.ch)
'''
import os
import sys
import traceback

from datetime import datetime

import elasticsearch


def log(msg, prefix='DEBUG'):
''' Add prefix and current time to message and write it to stderr. '''
prefix = '(%s)' % (prefix)
prefix = prefix.ljust(8)
sys.stderr.write('%s%s %s\n' % (prefix, datetime.now().isoformat(), msg))


try:
base_dir = os.path.dirname(__file__)
dkb_dir = os.path.join(base_dir, os.pardir)
sys.path.append(dkb_dir)
import pyDKB
from pyDKB.dataflow.stage import JSONProcessorStage
from pyDKB.dataflow.messages import JSONMessage
from pyDKB.dataflow.exceptions import DataflowException
except Exception, err:
log('Failed to import pyDKB library: %s' % err, 'ERROR')
sys.exit(1)


es = None


INDEX = None


def load_config(fname):
''' Open elasticsearch config and obtain parameters from it.

Set the global variable INDEX.

:param fname: config file's name
:type fname: str
'''
cfg = {
'ES_HOST': 'localhost',
'ES_PORT': '9200',
'ES_USER': '',
'ES_PASSWORD': '',
'ES_INDEX': ''
}
with open(fname) as f:
lines = f.readlines()
for l in lines:
if l.startswith('ES'):
key = False
value = False
try:
(key, value) = l.split()[0].split('=')
except ValueError:
pass
if key in cfg:
cfg[key] = value
global INDEX
INDEX = cfg['ES_INDEX']
return cfg


def es_connect(cfg):
''' Establish a connection to elasticsearch, as a global variable.

:param cfg: connection parameters
:type cfg: dict
'''
global es
if cfg['ES_USER'] and cfg['ES_PASSWORD']:
s = 'http://%s:%s@%s:%s/' % (cfg['ES_USER'],
cfg['ES_PASSWORD'],
cfg['ES_HOST'],
cfg['ES_PORT'])
else:
s = '%s:%s' % (cfg['ES_HOST'], cfg['ES_PORT'])
es = elasticsearch.Elasticsearch([s])


def get_field(index, taskid, field):
''' Get field value by given taskid.

:param index: index containing tasks
:type index: str
:param taskid: taskid of the task to look for
:type taskid: int or str
:param field: field name
:type field: str

:return: field value (None if the field is absent),
         or False if the task was not found
:rtype: str or None or bool
'''
try:
results = es.get(index=index, doc_type='_all', id=taskid,
_source=[field])
except elasticsearch.exceptions.NotFoundError:
return False
return results['_source'].get(field)


def process(stage, message):
''' Process a message.

Implementation of :py:meth:`.AbstractProcessorStage.process` for hooking
the stage into DKB workflow.

:param stage: stage instance
:type stage: pyDKB.dataflow.stage.ProcessorStage
:param message: input message with task info
:type message: pyDKB.dataflow.Message
'''
data = message.content()
if type(data) is not dict:
log('Incorrect data: ' + str(data), 'INPUT')
return False
taskid = data.get('taskid')
if taskid is None:
log('No taskid in data:' + str(data), 'INPUT')
return False
timestamp = data.get('task_timestamp')
if timestamp is None:
log('No timestamp supplied for taskid ' + str(taskid), 'INPUT')
return False

es_timestamp = get_field(INDEX, taskid, 'task_timestamp')
if es_timestamp is None:
log('No timestamp in ES for taskid ' + str(taskid), 'DIFF')
elif not es_timestamp:
log('Taskid %d not found in ES' % taskid, 'DIFF')
elif es_timestamp != timestamp:
log('Taskid %d has timestamp %s in ES, %s in Oracle' % (taskid,
es_timestamp,
timestamp),
'DIFF')
else:
log('Taskid %d is up to date in ES' % taskid, 'INFO')

return True


def main(args):
''' Parse command line arguments and run the stage.

:param args: command line arguments
:type args: list
'''

stage = JSONProcessorStage()
stage.add_argument('--conf', help='elasticsearch config', required=True)

exit_code = 0
exc_info = None
try:
stage.parse_args(args)
cfg = load_config(stage.ARGS.conf)
stage.process = process
es_connect(cfg)
if not es.indices.exists(INDEX):
Collaborator comment:

Just for information: this check would cause AuthorizationException in case of a remote connection via an Nginx proxy:

>>> es = elasticsearch.Elasticsearch('http://login:password@aiatlas171.cern.ch:9200')
>>> es.indices.exists('test_prodsys_rucio_ami')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python2.7/site-packages/elasticsearch/client/utils.py", line 76, in _wrapped
    return func(*args, params=params, **kwargs)
  File "/usr/lib/python2.7/site-packages/elasticsearch/client/indices.py", line 213, in exists
    params=params)
  File "/usr/lib/python2.7/site-packages/elasticsearch/transport.py", line 318, in perform_request
    status, headers_response, data = connection.perform_request(method, url, params, body, headers=headers, ignore=ignore, timeout=timeout)
  File "/usr/lib/python2.7/site-packages/elasticsearch/connection/http_urllib3.py", line 185, in perform_request
    self._raise_error(response.status, raw_data)
  File "/usr/lib/python2.7/site-packages/elasticsearch/connection/base.py", line 125, in _raise_error
    raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
elasticsearch.exceptions.AuthorizationException: TransportError(403, u'')

I updated the proxy configuration to allow HEAD requests (to readable locations), as there is no actual need to block them; so now there should be no problem with it.

Collaborator comment:

And (again, just for information), there was another way to achieve the goal:

    Check for index' existence before working.
    es.get() raises NotFoundError in both cases - when index does not exist and
    when document does not exist. Also, it's more reasonable to check index once
    since it's the same for all messages.

Even with NotFoundError it is possible to tell one situation from the other:

>>> try: es.get(index='test_prodsys_rucio_ami', doc_type='task', id=1468216)
... except Exception, err: pass
...
>>> err.info
{u'found': False, u'_type': u'task', u'_id': u'1468216', u'_index': u'test_prodsys_rucio_ami'}
>>>
>>> try: es.get(index='_no_such_index_', doc_type='task', id=14682166)
... except Exception, err: pass
>>> err.info
{u'status': 404, u'error': {u'index_uuid': u'_na_', u'index': u'tprodsys_rucio_ami', u'resource.type': u'index_expression', u'root_cause': [{u'index_uuid': u'_na_', u'index': u'tprodsys_rucio_ami', u'resource.type': u'index_expression', u'resource.id': u'tprodsys_rucio_ami', u'reason': u'no such index', u'type': u'index_not_found_exception'}], u'reason': u'no such index', u'type': u'index_not_found_exception', u'resource.id': u'tprodsys_rucio_ami'}}
>>> err.info['error']['reason']
u'no such index'

In case of that error -- or, maybe, in case of any error where info['error'] is defined (though I am not sure there can be any other) -- the exception could be re-raised to indicate that the process cannot continue, perhaps wrapped into DataflowException.

The only situation where the one-time check makes a difference is if the index is removed or the access policy changes during execution: the check passed at start-up, so any later NotFoundError will be taken as "record missed", no matter what. But I do not think this is likely to happen, so there is nothing wrong with the one-time check.

log('No such index: %s' % INDEX, 'ERROR')
exit_code = 4
else:
stage.run()
except (DataflowException, RuntimeError), err:
if str(err):
log(err, 'ERROR')
exit_code = 2
except Exception:
exc_info = sys.exc_info()
exit_code = 3
finally:
stage.stop()

if exc_info:
trace = traceback.format_exception(*exc_info)
for line in trace:
log(line, 'ERROR')

exit(exit_code)


if __name__ == '__main__':
main(sys.argv[1:])
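
For reference, a hypothetical invocation and the kind of log lines the checks above produce (assuming input NDJSON arrives on stdin; the exact I/O options depend on pyDKB's ProcessorStage, see ./consistency.py -h; task IDs and timestamps are made up):

./consistency.py --conf elasticsearch_config < tasks.ndjson
(INFO)  2019-05-28T10:15:00.123456 Taskid 1468216 is up to date in ES
(DIFF)  2019-05-28T10:15:00.234567 Taskid 1468217 has timestamp 01-04-2019 10:00:00 in ES, 02-04-2019 11:30:00 in Oracle
(DIFF)  2019-05-28T10:15:00.345678 Taskid 1468218 not found in ES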