In [1]:
import spacy

# Word vectors take up a lot of space, so the small model (en_core_web_sm) does not include them.
# To get word vectors, install the medium or the large English model; this notebook uses the large one.
# Before running this cell, make sure the model is installed:
#     python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse a short sentence and report, word by word, whether a vector is
# available and whether the word is flagged as out-of-vocabulary.
doc = nlp("dog ate banana")

for token in doc:
    print(f"{token.text} vector: {token.has_vector} OOV: {token.is_oov}")

dog vector: True OOV: False
ate vector: True OOV: False
banana vector: True OOV: False


In [4]:
# Inspect the dimensionality of the vector attached to the first token of `doc`.
doc[0].vector.shape

(300,)

In [6]:
# Parse the reference word once; a later cell compares other words against it.
# NOTE(review): results of nlp(...) are iterated token-by-token elsewhere in this
# notebook, so `base_token` appears to hold the whole parsed text rather than a
# single token — confirm; the name is kept because a later cell reads it.
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [10]:
# Compare several words against the reference stored in `base_token`
# (defined in an earlier cell, so that cell must have been run first).
doc = nlp("sandwich burger flour car tiger human wheat tea")

for token in doc:
    # One line per word: "<word> <-> <reference text>: <similarity score>"
    print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))

sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
flour <-> bread: 0.6901002041941764
car <-> bread: 0.06451533308853552
tiger <-> bread: 0.04764611675903374
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.6150360888607199
tea <-> bread: 0.4239781171001145


In [11]:
def similarity(base_word, words_to_compare):
    """Print how similar each word in ``words_to_compare`` is to ``base_word``.

    Both arguments are run through the notebook-level ``nlp`` pipeline. For
    every token of ``words_to_compare`` one line is printed in the form
    ``<token text> <-> <base text>: <score>``, where the score is whatever
    ``token.similarity(...)`` returns for the parsed ``base_word``.

    Args:
        base_word: Text of the reference word.
        words_to_compare: Text containing the words to compare against it.

    Returns:
        None. The results are only printed.
    """
    # Parse the reference first, then the candidates (same order as the calls matter
    # only if the pipeline is stateful; it is kept anyway).
    reference = nlp(base_word)
    candidates = nlp(words_to_compare)

    for candidate in candidates:
        label = f"{candidate.text} <-> {reference.text}:"
        score = candidate.similarity(reference)
        print(label, score)


In [14]:
# Use the helper to compare several names against "iphone"; it prints one line per name.
similarity("iphone", "android samsung huawei motorola")

android <-> iphone: 0.6740653922185394
samsung <-> iphone: 0.670859081425417
huawei <-> iphone: 0.42858754172104974
motorola <-> iphone: 0.4454689862093514


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Analogy check: how close is (king - man + woman) to queen?
# Fetch the four vectors straight from the model's vocabulary.
king, queen, man, woman = (
    nlp.vocab[word].vector for word in ("king", "queen", "man", "woman")
)

# Arithmetic on the raw word vectors.
result = king - man + woman

# Each vector is wrapped in a list so it is passed as a single-row 2-D input.
cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)