In [7]:
import os
import sys
from collections import defaultdict

# Root directory of the local MITIE checkout.  The default is the location the
# notebook was originally written against; set the MITIE_PATH environment
# variable to use a different checkout without editing the notebook.
# os.path.join(..., '') guarantees a trailing path separator, which matters
# because file names are built elsewhere with plain `mitie_path + '...'`.
mitie_path = os.path.join(os.environ.get('MITIE_PATH', '/home/tsdaemon/sources/MITIE/'), '')

# Make the MITIE Python bindings importable.  The membership test keeps
# sys.path from growing by one duplicate entry each time this cell is re-run.
mitielib_dir = mitie_path + 'mitielib'
if mitielib_dir not in sys.path:
    sys.path.append(mitielib_dir)

# NOTE(review): the star import is kept so that every name the rest of the
# notebook may rely on stays available; the cells visible here use
# named_entity_extractor, tokenize, load_entire_file and
# binary_relation_detector.
from mitie import *

In [8]:
print "loading NER model..."
ner = named_entity_extractor(mitie_path + 'MITIE-models/english/ner_model.dat')
print "\nTags output by this NER model:", ner.get_possible_ner_tags()

# Load a text file and convert it into a list of words.  
tokens = tokenize(load_entire_file(mitie_path + 'sample_text.txt'))
print "Tokenized input:", tokens

loading NER model...

Tags output by this NER model: ['PERSON', 'LOCATION', 'ORGANIZATION', 'MISC']
Tokenized input: ['A', 'Pegasus', 'Airlines', 'plane', 'landed', 'at', 'an', 'Istanbul', 'airport', 'Friday', 'after', 'a', 'passenger', '"', 'said', 'that', 'there', 'was', 'a', 'bomb', 'on', 'board', '"', 'and', 'wanted', 'the', 'plane', 'to', 'land', 'in', 'Sochi', ',', 'Russia', ',', 'the', 'site', 'of', 'the', 'Winter', 'Olympics', ',', 'said', 'officials', 'with', 'Turkey', "'s", 'Transportation', 'Ministry', '.', 'Meredith', 'Vieira', 'will', 'become', 'the', 'first', 'woman', 'to', 'host', 'Olympics', 'primetime', 'coverage', 'on', 'her', 'own', 'when', 'she', 'fills', 'on', 'Friday', 'night', 'for', 'the', 'ailing', 'Bob', 'Costas', ',', 'who', 'is', 'battling', 'a', 'continuing', 'eye', 'infection', '.', '"', 'It', "'s", 'an', 'honor', 'to', 'fill', 'in', 'for', 'him', ',', '"', 'Vieira', 'said', 'on', 'TODAY', 'Friday', '.', '"', 'You', 'think', 'about', 'the', 'Olympics', ','

In [9]:
entities = ner.extract_entities(tokens)
print "\nEntities found:", entities
print "\nNumber of entities detected:", len(entities)


Entities found: [(xrange(1, 3), 'ORGANIZATION', 0.5662716633696536), (xrange(7, 8), 'LOCATION', 1.3961989551275185), (xrange(30, 31), 'LOCATION', 1.5617225956091672), (xrange(32, 33), 'LOCATION', 1.2932414954225302), (xrange(44, 45), 'LOCATION', 1.1356116407121106), (xrange(46, 48), 'ORGANIZATION', 0.9876062161909271), (xrange(49, 51), 'PERSON', 1.0676552859044377), (xrange(58, 59), 'MISC', 1.190955875245744), (xrange(73, 75), 'PERSON', 0.8800389619396217), (xrange(96, 97), 'PERSON', 1.2623335607783588), (xrange(107, 108), 'MISC', 0.7035980014268862), (xrange(116, 118), 'PERSON', 0.9773891906703803), (xrange(121, 122), 'PERSON', 1.3240247462100005), (xrange(139, 141), 'ORGANIZATION', 0.73640824406934), (xrange(143, 145), 'PERSON', 1.579659392025735), (xrange(150, 151), 'LOCATION', 0.9852057723609379), (xrange(159, 161), 'PERSON', 1.4906579997085982), (xrange(164, 166), 'PERSON', 1.5127579847728057), (xrange(169, 170), 'LOCATION', 1.3205684671616802), (xrange(187, 188), 'LOCATION', 0.7

In [10]:
# entities is a list of tuples, each containing an xrange that indicates which
# tokens are part of the entity, the entity tag, and an associate score.  The
# entities are also listed in the order they appear in the input text file.
# Here we just print the score, tag, and text for each entity to the screen.
# The larger the score the more confident MITIE is in its prediction.
for e in entities:
    range = e[0]
    tag = e[1]
    score = e[2]
    score_text = "{:0.3f}".format(score)
    entity_text = " ".join(tokens[i] for i in range)
    print "   Score: " + score_text + ": " + tag + ": " + entity_text

   Score: 0.566: ORGANIZATION: Pegasus Airlines
   Score: 1.396: LOCATION: Istanbul
   Score: 1.562: LOCATION: Sochi
   Score: 1.293: LOCATION: Russia
   Score: 1.136: LOCATION: Turkey
   Score: 0.988: ORGANIZATION: Transportation Ministry
   Score: 1.068: PERSON: Meredith Vieira
   Score: 1.191: MISC: Olympics
   Score: 0.880: PERSON: Bob Costas
   Score: 1.262: PERSON: Vieira
   Score: 0.704: MISC: Olympics
   Score: 0.977: PERSON: Bob Costas
   Score: 1.324: PERSON: Bob
   Score: 0.736: ORGANIZATION: NBC Olympics
   Score: 1.580: PERSON: Jim Bell
   Score: 0.985: LOCATION: Sochi
   Score: 1.491: PERSON: Josiah Franklin
   Score: 1.513: PERSON: Benjamin Franklin
   Score: 1.321: LOCATION: Boston
   Score: 0.766: LOCATION: Philadelphia
   Score: 0.978: PERSON: Benjamin Franklin
   Score: 1.205: PERSON: Benjamin Franklin
   Score: 1.143: LOCATION: Boston


In [12]:
# Next, try one of MITIE's binary relation detectors.  MITIE ships several
# kinds of relation detector, plus tools for training new ones; this notebook
# simply loads a bundled one, the "person born in place" relation detector.
birthplace_model_file = mitie_path + "MITIE-models/english/binary_relations/rel_classifier_people.person.place_of_birth.svm"
rel_detector = binary_relation_detector(birthplace_model_file)

# Pair every entity with the entity that follows it in the text, keeping only
# the token range (element 0) of each.  These neighboring pairs are the
# candidates that will be offered to the relation detector as possible
# examples of the "person born in place" relation.
neighboring_entities = [(left[0], right[0]) for left, right in zip(entities, entities[1:])]

In [13]:
# Also swap the entities and add those in as well.  We do this because "person
# born in place" mentions can appear in the text in as "place is birthplace of
# person".  So we must consider both possible orderings of the arguments.
neighboring_entities += [(r,l) for (l,r) in neighboring_entities]

# Now that we have our list, let's check each entity pair and see which one the
# detector selects.
for person, place in neighboring_entities:
    # Detection has two steps in MITIE. First, you convert a pair of entities
    # into a special representation.
    rel = ner.extract_binary_relation(tokens, person, place)
    # Then you ask the detector to classify that pair of entities.  If the
    # score value is > 0 then it is saying that it has found a relation.  The
    # larger the score the more confident it is.  Finally, the reason we do
    # detection in two parts is so you can reuse the intermediate rel in many
    # calls to different relation detectors without needing to redo the
    # processing done in extract_binary_relation().
    score = rel_detector(rel)
    # Print out any matching relations.
    if (score > 0):
        person_text     = " ".join(tokens[i] for i in person)
        birthplace_text = " ".join(tokens[i] for i in place)
        print person_text, "BORN_IN", birthplace_text

Benjamin Franklin BORN_IN Boston
Benjamin Franklin BORN_IN Boston
Benjamin Franklin BORN_IN Philadelphia


In [14]:
# The code above shows the basic details of MITIE's relation detection API.
# However, it is important to note that real world data is noisy any confusing.
# Not all detected relations will be correct.  Therefore, it's important to
# aggregate many relation detections together to get the best signal out of
# your data.  A good way to do this is to pick an entity you are in interested
# in (e.g. Benjamin Franklin) and then find all the relations that mention him
# and order them by most frequent to least frequent.  We show how to do this in
# the code below.
query = "Benjamin Franklin"
hits = defaultdict(int)

for person, place in neighboring_entities:
    rel = ner.extract_binary_relation(tokens, person, place)
    score = rel_detector(rel)
    if (score > 0):
        person_text     = " ".join(tokens[i] for i in person)
        birthplace_text = " ".join(tokens[i] for i in place)
        if (person_text == query):
            hits[birthplace_text] += 1

print "\nTop most common relations:"
for place, count in sorted(hits.iteritems(), key=lambda x:x[1], reverse=True):
    print count, "relations claiming", query, "was born in", place


Top most common relations:
2 relations claiming Benjamin Franklin was born in Boston
1 relations claiming Benjamin Franklin was born in Philadelphia
