In [91]:
import convex as cx

In [46]:
from hdt import HDTDocument
wikidata = HDTDocument("data/kb/wikidata2018_09_11.hdt")

In [7]:
questions = [ 
    "Which actor voiced the Unicorn in The Last Unicorn?",
    "And Alan Arkin was behind...?",
    "Who is the composer of the soundtrack?",
    "So who performed the songs?",
    "Genre of this band's music?",
    "By the way, who was the director?"
            ]

In [6]:
cx.string.create_question_words_list(questions[0])

['actor', 'voiced', 'unicorn', 'last', 'unicorn']

In [4]:
import spacy
spacy.require_gpu()
nlp = spacy.load("en_core_web_lg")

In [2]:
import spacy

nlp = spacy.load("en_core_web_lg")

token.text: Apple Apple PROPN NNP nsubj Xxxxx True False
token.text: is be AUX VBZ aux xx True True
token.text: looking look VERB VBG ROOT xxxx True False
token.text: at at ADP IN prep xx True True
token.text: buying buy VERB VBG pcomp xxxx True False
token.text: U.K. U.K. PROPN NNP compound X.X. False False
token.text: startup startup NOUN NN dobj xxxx True False
token.text: for for ADP IN prep xxx True True
token.text: $ $ SYM $ quantmod $ False False
token.text: 1 1 NUM CD compound d False False
token.text: billion billion NUM CD pobj xxxx True False


In [None]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [3]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [5]:
from spacy import displacy
displacy.serve(doc, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [11]:
doc = nlp(questions[0])

In [9]:
from spacy import displacy
displacy.serve(doc, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [20]:
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

doc = nlp(text)
displacy.render(doc, style="dep", jupyter=True)
displacy.render(doc, style="ent", jupyter=True)

In [17]:

displacy.serve(doc, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [21]:
doc = nlp(questions[0])
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [22]:
[chunk.text for chunk in doc.noun_chunks]

['Which actor', 'the Unicorn', 'The Last Unicorn']

In [45]:
for question in questions:
    print(question)
    doc = nlp(question)
    print([chunk.text for chunk in doc.noun_chunks])
    print([(token.text,token.head, token.tag_) for token in doc],"\n")

Which actor voiced the Unicorn in The Last Unicorn?
['Which actor', 'the Unicorn', 'The Last Unicorn']


AttributeError: 'spacy.tokens.span.Span' object has no attribute 'head'

In [29]:
doc = nlp(questions[0])
[(e.text, e.label_, e.kb_id_) for e in doc.ents]

[]

In [30]:
doc

Which actor voiced the Unicorn in The Last Unicorn?

In [31]:
print([(token.text, token.tag_) for token in doc])

[('Which', 'WDT'), ('actor', 'NN'), ('voiced', 'VBD'), ('the', 'DT'), ('Unicorn', 'NNP'), ('in', 'IN'), ('The', 'DT'), ('Last', 'JJ'), ('Unicorn', 'NNP'), ('?', '.')]


In [42]:
print([(token.text,token.head, token.tag_) for token in doc])
token.subtree

[(actor, 'WDT'), (voiced, 'NN'), (voiced, 'VBD'), (Unicorn, 'DT'), (voiced, 'NNP'), (voiced, 'IN'), (Unicorn, 'DT'), (Unicorn, 'JJ'), (in, 'NNP'), (voiced, '.')]


In [47]:
# Display some metadata about the HDT document itself
print("nb triples: %i" % wikidata.total_triples)
print("nb subjects: %i" % wikidata.nb_subjects)
print("nb predicates: %i" % wikidata.nb_predicates)
print("nb objects: %i" % wikidata.nb_objects)
print("nb shared subject-object: %i" % wikidata.nb_shared)

nb triples: 2935160017
nb subjects: 760717318
nb predicates: 23387
nb objects: 1190479283
nb shared subject-object: 643051538


In [48]:
entity = "http://www.wikidata.org/entity/"+"Q967268"
triples, cardinality = wikidata.search_triples(entity, "", "")

In [49]:
triples

<Iterator {http://www.wikidata.org/entity/Q967268 ?p ?o}>

In [50]:
cardinality

109

In [87]:
wikidata_id = "Q967268"
wikidata_url = "http://www.wikidata.org/entity/"+wikidata_id
triples, cardinality = wikidata.search_triples(wikidata_url, "http://schema.org/name", "")

In [88]:
triples

<Iterator {http://www.wikidata.org/entity/Q967268 http://schema.org/name ?o}>

In [89]:
cardinality

15

In [90]:
for triple in triples:
    #print(triple)
    obj = triple[2]
    #print(obj)
    if "@en" in obj:
        #print(obj)
        label = obj.split('"@en')[0].replace("\"", "")
        #label_dict[wikidata_id] = label
        print(label)

The Last Unicorn


In [95]:
cx.wd.top_k_limit = 5
cx.wd.name_to_wikidata_ids("The Last Unicorn")

['Q176198', 'Q967268', 'Q7746145', 'Q30060419', 'Q15628943']

In [93]:
cx.wd.wikidata_id_to_label("Q176198")

'The Last Unicorn'

In [96]:
cx.wd.get_all_statements_with_qualifier_as_subject("Q176198")

[]

In [99]:
cx.wd.get_statement_with_qualifier_as_object(wikidata_id)

False

In [98]:
cx.wd.is_entity_or_literal(wikidata_id)

True

In [100]:
cx.wd.get_all_statements_of_entity(wikidata_id)

[{'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P123'},
  'object': {'id': 'Q921536'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P1274'},
  'object': {'id': '7924'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P136'},
  'object': {'id': 'Q10992055'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P136'},
  'object': {'id': 'Q1350410'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P136'},
  'object': {'id': 'Q8261'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P156'},
  'object': {'id': 'Q1026792'},
  'qualifiers': []},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P166'},
  'object': {'id': 'Q20899118'},
  'qualifiers': [{'qualifier_predicate': {'id': 'P1545'},
    'qualifier_object': {'id': '55'}}]},
 {'entity': {'id': 'Q967268'},
  'predicate': {'id': 'P1705'},
  'object': {'id': 'The Last Unicorn@en'},
  'qualifiers': 

In [101]:
cx.wd.wikidata_id_to_label("P123")

'publisher'

In [103]:
cx.wd.name_to_wikidata_ids("The Last Unicorn")

['Q176198', 'Q967268', 'Q7746145', 'Q30060419', 'Q15628943']