# Entity ElasticSearch Index

Here we build up an ElasticSearch cluster for linking to the FB5M knowledge base.

In [2]:
FB5M = '../../data/simple_qa/freebase-FB5M.txt'

# DOWNLOADED FROM: https://www.dropbox.com/s/yqbesl07hsw297w/FB5M.name.txt
MID_TO_NAME = '../../data/simple_qa/FB5M.name.txt'

Below we build an in memory representation of FB5M from the objects prespective.

In [3]:
from collections import defaultdict

import random
import os
import pprint

from tqdm import tqdm_notebook

object_to_fact = defaultdict(list)
for line in tqdm_notebook(open(FB5M, 'r'), total=12010500):
    split = line.split('\t')
    assert len(split) == 3, 'Malformed row'
    object_ = split[0].replace('www.freebase.com/m/', '').strip()
    property_ = split[1].replace('www.freebase.com/', '').strip()
    subjects = [url.replace('www.freebase.com/m/', '').strip() for url in split[2].split()]
    object_to_fact[object_].append({'property': property_, 'subjects': subjects })

pp = pprint.PrettyPrinter(indent=2)
print()
print('Number of Objects:', len(object_to_fact))
print('Sample:', pp.pformat(random.sample(object_to_fact.items(), 5)))



Number of Objects: 1972702
Sample: [ ( '0f_3rwm',
    [ {'property': 'common/topic/notable_types', 'subjects': ['0kpv11']},
      {'property': 'music/recording/artist', 'subjects': ['01799y']},
      {'property': 'music/recording/canonical_version', 'subjects': ['0rrh_v']},
      {'property': 'music/recording/tracks', 'subjects': ['0mwmnzs']},
      {'property': 'music/recording/releases', 'subjects': ['0g0y_c3']}]),
  ( '0bb7hzj',
    [ {'property': 'music/composition/form', 'subjects': ['074ft']},
      {'property': 'music/composition/lyricist', 'subjects': ['01vv7sc']},
      {'property': 'music/composition/language', 'subjects': ['02h40lc']},
      {'property': 'music/composition/composer', 'subjects': ['01vv7sc']}]),
  ( '0c6b6gg',
    [ {'property': 'people/person/nationality', 'subjects': ['0345h']},
      {'property': 'people/person/gender', 'subjects': ['05zppz']},
      {'property': 'people/person/place_of_birth', 'subjects': ['08966']},
      {'property': 'common/topic/not

Below build a map to translate from MID to name. For each MID, we have multiple aliases that'll be used for entity linking.

In [4]:
mid_to_name = defaultdict(list)
for line in tqdm_notebook(open(MID_TO_NAME), total=5507279):
    split = line.strip().split('\t')
    mid = split[0].replace('<fb:m.', '').replace('>', '')
    name = split[2].replace('"', '').replace("'", '')
    mid_to_name[mid].append(name)
print()
print('Number of entries:', len(mid_to_name))
print('Sample:', pp.pformat(random.sample(mid_to_name.items(), 10)))



Number of entries: 3972329
Sample: [ ('0nbbn_1', ['anywhere but here']),
  ('0898tx', ['kori bustard']),
  ('01bph2', ['jim mitchell']),
  ('08hj4n2', ['against timarchos']),
  ( '02lgcwr',
    [ 'cytoskeletal-associated protein tyrosine phosphatase',
      'protein tyrosine phosphatase , non-receptor type 14']),
  ( '035t1xr',
    [ 'the best of the durutti column ( disc 2 )',
      'the best of the durutti column']),
  ('0gdysjq', ['get down ( rappapella ) ( feat. stepz )']),
  ('0xhky38', ['scion cd sampler , volume 26 : the toxic avenger']),
  ('0z7yx20', ['ryan deacon']),
  ('0cbv3', ['richard doyly carte'])]


Lets check if our elasticsearch cluster is healthy; afterwards, we'll attempt to populate it with data.

In [5]:
from elasticsearch_dsl.connections import connections

# Define a default Elasticsearch client
connections.create_connection(hosts=['localhost'])

# Display cluster health
print('Health: %s', connections.get_connection().cluster.health())

Health: %s {'relocating_shards': 0, 'number_of_nodes': 1, 'unassigned_shards': 10, 'status': 'yellow', 'number_of_data_nodes': 1, 'cluster_name': 'elasticsearch', 'active_shards': 10, 'active_primary_shards': 10, 'timed_out': False, 'initializing_shards': 0, 'active_shards_percent_as_number': 50.0, 'number_of_in_flight_fetch': 0, 'number_of_pending_tasks': 0, 'task_max_waiting_in_queue_millis': 0, 'delayed_unassigned_shards': 0}


In [6]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch()
ENTITY_INDEX = 'fb5m_entities'

# check if last entity exists, then do not refetch
num_entities = 0
if client.indices.exists(index=ENTITY_INDEX):
    search = Search(using=client, index=ENTITY_INDEX)
    query = search.query("match_all")
    num_entities = query.count()
    print('Found %d documents in index "%s"' % (query.count(), ENTITY_INDEX))
else:
    print('%s index does not exist' % ENTITY_INDEX)

Found 1133800 documents in index "fb5m_entities"


In [7]:
from elasticsearch_dsl import DocType
from elasticsearch_dsl import Index
from elasticsearch_dsl import Integer
from elasticsearch_dsl import Nested
from elasticsearch_dsl import Search
from elasticsearch_dsl import String

class FreebaseEntity(DocType):
    mid = String(index='not_analyzed')
    names = Nested(required=True, properties={'name': String()})
    facts = Nested(required=True,
                   properties={'subjects': String(index='not_analyzed'),
                               'property': String(index='not_analyzed')})

    class Meta:
        index = ENTITY_INDEX

    def save(self, **kwargs):
        return super().save(**kwargs)

We checked if fb_5m exists, we checked that the cluster is healthy, and we setup a FreebaseEntity document schema. The last step in this process is to populate the cluster.

In [1]:
from datetime import datetime

import os

from elasticsearch.helpers import streaming_bulk

# save entities to elastic search
def get_entities():
    for mid, names in mid_to_name.items():
        if mid in object_to_fact:
            yield {
                'mid': mid,
                'names': names,
                'facts': object_to_fact[mid]
            }
#         else:
#             print('Lost MID:', mid, names)
    

def serialize_entity(mid, names, facts):
    """ serialize the instance into a dictionary so that it can be saved in elasticsearch. """
    names = [{'name': name} for name in names]
    return FreebaseEntity(
        mid=mid,
        facts=facts,
        names=names,
        meta={'id': mid}).to_dict(True)


def save_entities():
    """ efficiently save entities in bulk using `streaming_bulk` and `serialize_entity` """
    elasticsearch_connection = connections.get_connection()
    task_generator = (serialize_entity(**kwargs) for kwargs in get_entities())
    steaming_iterator = tqdm_notebook(streaming_bulk(elasticsearch_connection, task_generator,
                                            chunk_size=100, request_timeout=120))
    for ok, item in steaming_iterator:
        if not ok:
            print(item)


# save entities if not already saved
def create_index():
    input_ = input('WARNING - Delete %d entities? [YES/no] ' % num_entities)
    if input_ == 'YES':
        client.indices.delete(index=ENTITY_INDEX, ignore=[400, 404])
        # create the mappings in elasticsearch
        FreebaseEntity.init()
        save_entities()
    else:
        print('Not Deleting Index! Wohoo!')
    print('Done!')


create_index()
connection.close()

NameError: name 'num_entities' is not defined