In [105]:
from elasticsearch import Elasticsearch
import logging
import json
import time
from string import Template

In [106]:
ES_LOGIN = 'login'
ES_PASSWORD = 'password'
ES_PROTOCOL = 'http'
ES_HOST = 'bamboo.zoo.computing.kiae.ru'
ES_PORT = 9200
ES_INDEX = 'supp-notes'

In [107]:
logging.basicConfig(level=logging.ERROR)

es_url = '{protocol}://{login}:{password}@{host}:{port}'.format(protocol=ES_PROTOCOL,
                                                                login=ES_LOGIN,
                                                                password=ES_PASSWORD,
                                                                host=ES_HOST,
                                                                port=ES_PORT)
logging.debug("es_url: {}".format(es_url))

es = Elasticsearch([es_url])

DEBUG:root:es_url: http://login:password@bamboo.zoo.computing.kiae.ru:9200


In [108]:
def search(query, index=ES_INDEX):
    start = time.time()
    result = es.search(index=index, body=query)
    duration = time.time() - start
    logging.debug(f"Query took {duration:.3f} seconds.")
    return result


def search_all_indices(query):
    start = time.time()
    result = es.search(body=query)
    duration = time.time() - start
    logging.debug(f"Query took {duration:.3f} seconds.")
    return result

In [109]:
def nested_query1():
    """
    В  индексе supp-notes возьми _id = CDS_CERN-ATL-COM-PHYS-2017-1110
    В этом документе есть датасеты в поле PDFAnalyzer.mc_datasets
    Нужно найти эти датасеты в индексе prodsys_rucio_ami (тип данных: output_dataset)
    и выдать все их свойства.
    :return: Query results
    """
    q1 = """{
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "_id": "CDS_CERN-ATL-COM-PHYS-2017-1110"
            }
          }
          ]
      }
    }
    }"""
    
    q2 = """{
    "query": {
      "bool": {
        "must": [
          {
            "match_phrase": {
              "output": "$dataset"
            }
          }
          ]
      }
    }
    }"""
    q2_template = Template(q2)
    
    result1 = search(q1, index='supp-notes')
    mc_dataset_names = result1['hits']['hits'][0]['_source']['PDFAnalyzer']['mc_datasets']
    
    mc_datasets = []
    for mc_dataset in mc_dataset_names:
        mc_datasets.append(search(q2_template.substitute(dataset=mc_dataset) , index='prodsys_rucio_ami'))
        
    return mc_datasets


# nested_query2 нуждается в уточнении задачи.
def nested_query2():
    """
    Возьми любой датасет (type: output_dataset) из индекса prodsys_rucio_ami 
    и выдай по нему данные из type: task, 
    и попробуй найти для него соответствующие документы из индекса supp_notes. 
    """
    q1 = """{
    "query": {
      "bool": {
        "must": [
          {
            "match_phrase": {
              "output": "mc15_13TeV.361341.Sherpa_CT10_Wmunu_Pt700_1000_BFilter.merge.DAOD_HIGG8D1.e4133_s2608_s2183_r7725_r7676_p2949_tid10353756_00"
            }
          }
          ]
      }
    }
}"""
    
    
def nested_query3():
    """
    Возьми любую статью из индекса papers  
    [поле GLANCE.supporting_notes] 
    и верни для нее список всех документов из supp-notes
    """
    q1 = """{
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "_id": "CDS_ATLAS-HIGG-2016-14-003"
            }
          }
          ]
      }
    }
    }"""
    
    q2 = """{
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "_id": "$doc"
            }
          }
          ]
      }
    }
    }"""
    q2_template = Template(q2)
    
    result1 = search(q1, index='papers')
    docs_list = result1['hits']['hits'][0]['_source']['GLANCE']['supporting_notes']
    
    notes = []
    for doc in docs_list:
        notes.append(search(q2_template.substitute(doc=doc), index="supp-notes"))
        
    return notes

In [110]:
r = nested_query3()

DEBUG:urllib3.util.retry:Converted retries value: False -> Retry(total=False, connect=None, read=None, redirect=0, status=None)
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): bamboo.zoo.computing.kiae.ru
DEBUG:urllib3.connectionpool:http://bamboo.zoo.computing.kiae.ru:9200 "GET /papers/_search HTTP/1.1" 403 169
DEBUG:elasticsearch:> {
    "query": {
      "bool": {
        "must": [
          {
            "term": {
              "_id": "CDS_ATLAS-HIGG-2016-14-003"
            }
          }
          ]
      }
    }
    }
DEBUG:elasticsearch:< <html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx/1.10.2</center>
</body>
</html>



AuthorizationException: TransportError(403, '<html>\r\n<head><title>403 Forbidden</title></head>\r\n<body bgcolor="white">\r\n<center><h1>403 Forbidden</h1></center>\r\n<hr><center>nginx/1.10.2</center>\r\n</body>\r\n</html>\r\n')

In [111]:
r

[{'_shards': {'failed': 0, 'successful': 5, 'total': 5},
  'hits': {'hits': [{'_id': 'CDS_CERN-ATL-COM-PHYS-2016-1744',
     '_index': 'supp-notes',
     '_score': 1.0,
     '_source': {'CDS': {'abstract': 'This note describes a search for $Z{\\gamma}$ resonances and for the decays to $Z\\gamma$ of the 125~GeV Higgs boson using events in which the $Z$ boson decays to di-electrons or di-muons. The search uses the 36.5~\\ifb\\ dataset recorded by the ATLAS experiment in $pp$ collisions at the LHC in 2015 and 2016. The resonance search is performed by looking for localised excesses in the invariant mass of the three-body final state $\\ell\\ell\\gamma$, $\\ell=e,\\mu$, over a smoothly-falling background arising from Standard Model processes.',
       'accelerator_experiment': {'accelerator': 'CERN LHC',
        'experiment': 'ATLAS'},
       'base': '90',
       'collaboration': [],
       'collection': ['INTNOTEATLASPRIV', 'InternalNote'],
       'creation_date': '2016-12-01T09:38:01',
 