In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')


In [3]:
text_minister = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."


In [4]:

# Process whole documents
text_Sebastian = (u"When Sebastian Thrun started working on self-driving cars at "
        u"Google in 2007, few people outside of the company took him "
        u"seriously. “I can tell you very senior CEOs of major American "
        u"car companies would shake my hand and turn away because I wasn’t "
        u"worth talking to,” said Thrun, now the co-founder and CEO of "
        u"online higher education startup Udacity, in an interview with "
        u"Recode earlier this week.")


In [5]:
text_deeplearning = (u"Yann LeCun, Yoshua Bengio and Geoffrey Hinton. Deep learning allows computational models that are composed of multiple processing layers to learn representations of data with multiple levels of abstraction. These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection and many other domains such as drug discovery and genomics. Deep learning discovers intricate structure in large data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each layer from the representation in the previous layer. Deep convolutional nets have brought about breakthroughs in processing images, video, speech and audio, whereas recurrent nets have shone light on sequential data such as text and speech.")

In [6]:
doc_Sebastian = nlp(text_Sebastian)

# Find named entities, phrases and concepts
for entity in doc_Sebastian.ents:
    print(entity.text, entity.label_)

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [7]:
doc_minister = nlp(text_minister)

# Find named entities, phrases and concepts
for entity in doc_minister.ents:
    print(entity.text, entity.label_)

1986 to 1988 DATE
38 CARDINAL
one CARDINAL
half an hour TIME
Jim Hacker PERSON
BBC Radio ORG
2010 DATE
UKTV Gold ORG
2013 DATE


In [8]:
doc_deeplearning = nlp(text_deeplearning)

# Find named entities, phrases and concepts
for entity in doc_deeplearning.ents:
    print(entity.text, entity.label_)

Yann LeCun PERSON
Yoshua Bengio ORG
Geoffrey Hinton PERSON


In [9]:
from spacy import displacy
displacy.render(doc_minister, style='ent', jupyter=True)


In [10]:
print(doc_minister)

The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013.


In [11]:
type(doc_minister)

spacy.tokens.doc.Doc

In [12]:
doc_minister.sents

<generator at 0x7f88383c1ea0>

In [13]:
list_minister = list(doc_minister.sents)

In [14]:
print(len(list_minister))

4


In [15]:
doc1 = nlp(u"I like apple")
doc2 = nlp(u"deep learning")
similarity = doc1.similarity(doc2)
print(doc1.text, doc2.text, similarity)

I like apple deep learning 0.3980880058564217


In [16]:
nlp= spacy.load('en_core_web_lg')

In [17]:
# print(nlp.vocab['minister'].vector)


In [18]:
dog = nlp.vocab["dog"]
cat = nlp.vocab["cat"]
apple = nlp.vocab["apple"]
orange = nlp.vocab["orange"]

In [19]:
dog.similarity(cat)


0.8016855

In [20]:
from scipy.spatial.distance import cosine

In [21]:
# why 1-cosine? cosine = similarity? 
def vector_similarity(x,y):
    return 1-cosine(x, y)

In [22]:
def make_guess_word(words):
#    [first, second, third] = words
#    return nlp.vocab[first].vector - nlp.vocab[second].vector + nlp.vocab[third].vector
    first_word = words[0]
    second_word = words[1]
    third_word = words[2]
    return nlp.vocab[first_word].vector - nlp.vocab[second_word].vector + nlp.vocab[third_word].vector





In [23]:
def get_similar_word(words, scope=nlp.vocab):

    guess_word = make_guess_word(words)

    similarities = []

    for word in scope:
        if not word.has_vector:
            continue

        similarity = vector_similarity(guess_word, word.vector)
        similarities.append((word, similarity))
    
    similarities = sorted(similarities, key=lambda item: -item[1])
    print([word[0].text for word in similarities[:10]])


In [24]:
words_man = ["king", "queen", "woman"]
get_similar_word(words_man)

['MAN', 'Man', 'mAn', 'MAn', 'MaN', 'man', 'mAN', 'WOMAN', 'womAn', 'WOman']
