In [1]:
import elasticsearch
from elasticsearch_dsl import Search

In [2]:
es = elasticsearch.Elasticsearch(['localhost:9200'])

In [3]:
try:
    es.indices.delete('datamart')
except Exception:
    pass
es.indices.create(
    'datamart',
    {
        'mappings': {
            '_doc': {
                'properties': {
                    # dataset -> metadata is a parent -> child relationship
                    'kind': {
                        'type': 'join',
                        'relations': {
                            'dataset': ['metadata', 'feedback'],
                        },
                    },
                    # 'columns' is a nested field, we want to query individual columns
                    'columns': {
                        'type': 'nested',
                        'properties': {
                            'semantic_types': {
                                'type': 'keyword',
                                'index': True,
                            },
                        },
                    },
                },
            },
        },
    },
)

{'acknowledged': True, 'index': 'datamart', 'shards_acknowledged': True}

# Test data

In [4]:
# Discover some datasets
doc1 = es.index(
    'datamart',
    '_doc',
    {
        'kind': 'dataset',
        'discoverer': 'test-jupyter',
        'url': 'http://localhost/data.csv',
        'date': '2018-09-26T18:32:26Z',
    },
)['_id']
doc2 = es.index(
    'datamart',
    '_doc',
    {
        'kind': 'dataset',
        'discoverer': 'test-jupyter',
        'url': 'http://localhost/data2.csv',
        'date': '2018-09-26T19:18:32Z',
    },
)['_id']
doc1, doc2

('kpswGWYBYZwEx0JzH0C_', 'k5swGWYBYZwEx0JzH0DZ')

In [5]:
# Ingest metadata
es.index(
    'datamart',
    '_doc',
    {
        'kind': {'name': 'metadata', 'parent': doc1},
        'ingester': 'test-jupyter',
        'date': '2018-09-26T18:33:56Z',
        'keywords': ['weather', 'united states'],
        'columns': [
            {
                'name': 'lat',
                'type': 'real',
                'semantic_types': ['Real', 'Latitude'],
                'distribution': [
                    {
                        'type': 'range_float',
                        'min_float': 42.98,
                        'max_float': 43.42,
                    },
                    {
                        'type': 'normal',
                        'mean': 43.0,
                        'std_dev': 0.3,
                    },
                ],
            },
            {
                'name': 'long',
                'type': 'real',
                'semantic_types': ['Real', 'Longitude'],
                'distribution': [
                    {
                        'type': 'range_float',
                        'min_float': 0.41,
                        'max_float': 7.02,
                    },
                ],
            },
            {
                'name': 'temp',
                'type': 'real',
                'semantic_types': ['Real', 'Temperature', 'Fahrenheit'],
                'distribution': [
                    {
                        'type': 'range_float',
                        'min_float': 24.74,
                        'max_float': 29.18,
                    },
                ],
            },
        ],
    },
    routing=doc1,
)
es.index(
    'datamart',
    '_doc',
    {
        'kind': {'name': 'metadata', 'parent': doc2},
        'ingester': 'test-jupyter',
        'date': '2018-09-26T19:20:02Z',
        'keywords': ['weather', 'new york'],
        'columns': [
            {
                'name': 'date',
                'type': 'string',
                'semantic_types': ['Date'],
                'distribution': [
                    {
                        'type': 'range_string',
                        'min_string': '2001-01-01T00:01:02Z',
                        'max_string': '2004-12-30T23:57:17Z',
                    },
                ],
            },
            {
                'name': 'temp',
                'type': 'real',
                'semantic_types': ['Real', 'Temperature', 'Fahrenheit'],
                'distribution': [
                    {
                        'type': 'range_float',
                        'min_float': 24.74,
                        'max_float': 29.18,
                    },
                ],
            },
        ],
    },
    routing=doc2,
)

{'_id': 'lZswGWYBYZwEx0JzIEAz',
 '_index': 'datamart',
 '_primary_term': 1,
 '_seq_no': 1,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}

In [6]:
# Feedback from consumer?
es.index(
    'datamart',
    '_doc',
    {
        'kind': {'name': 'feedback', 'parent': doc2},
        'consumer': 'ta2-jupyter',
        'date': '2018-09-26T23:41:21Z',
        'problem': 'org.datadrivendiscovery.185_baseball_problem.1_0',
        'scores': {
            'mean_squared_error': 0.54633,
        },
    },
    routing=doc2,
)

{'_id': 'lpswGWYBYZwEx0JzIEBc',
 '_index': 'datamart',
 '_primary_term': 1,
 '_seq_no': 2,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': '_doc',
 '_version': 1,
 'result': 'created'}

# Search

In [7]:
# Get a dataset
es.get('datamart', '_doc', id=doc1)['_source']

{'date': '2018-09-26T18:32:26Z',
 'discoverer': 'test-jupyter',
 'kind': 'dataset',
 'url': 'http://localhost/data.csv'}

In [8]:
# Search for datasets
s = (
    Search(using=es, index='datamart', doc_type='_doc')
    .query('match', kind='dataset')
    .query('match', discoverer='test-jupyter')
)

list(s.execute())

[]

In [9]:
# Search for datasets via metadata
es.search(
    index='datamart',
    body={
        'query': {
            'bool': {
                'must': [
                    {
                        'has_child': {
                            'type': 'metadata',
                            # Metadata with specific keywords
                            'query': {
                                'term': {
                                    'keywords': 'weather',
                                },
                            },
                        },
                    },
                    {
                        'has_child': {
                            'type': 'metadata',
                            'query': {
                                'nested': {
                                    'path': 'columns',
                                    # Metadata with specific columns
                                    'query': {
                                        'terms': {'columns.semantic_types': ['Latitude']},
                                    },
                                },
                            },
                        },
                    },
                ],
            },
        },
    },
)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [], 'max_score': None, 'total': 0},
 'timed_out': False,
 'took': 19}