<a href="https://colab.research.google.com/github/PrabaKDataScience/DeepLearning/blob/main/NLP/Basics/07_spacy_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import spacy 

# Use the medium or large English model: the cells below read token vectors
# and compute similarities, which need a model that ships word vectors.
# On a fresh environment (e.g. Colab), uncomment the next line to download it:
# !python -m spacy download en_core_web_lg
MODEL_NAME = "en_core_web_lg"

nlp = spacy.load(MODEL_NAME)

In [21]:
# "soem" is misspelled (presumably on purpose) so that one token shows up
# as out-of-vocabulary in the listing below.
doc = nlp("I want soem oranges")

# Per token: its text, whether it has a vector, and whether it is out-of-vocabulary.
for token in doc:
    print(f"{token.text} {token.has_vector} {token.is_oov}")

I True False
want True False
soem False True
oranges True False


In [22]:
# The en_core_web_lg model represents each token as a 300-dimensional
# feature vector, so the first token's vector has shape (300,).
first_token = doc[0]
first_token.vector.shape

(300,)

In [23]:
# Embed a single word. With one-word input the vector can be read directly
# from the result; there is no need to index into it first.
base_token = nlp("bread")
base_vector = base_token.vector
base_vector.shape

(300,)

In [24]:
# Turn every word into its feature vector and report its cosine similarity
# to the base word.
doc = nlp("bread sandwich burger car tiger human wheat")
for token in doc:
    pair_label = f"{token.text} <-> {base_token.text}:"
    print(pair_label, token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451533308853552
tiger <-> bread: 0.04764611675903374
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.6150360888607199


In [19]:
# Look up the vectors of individual words straight from the vocabulary
# (beware: the model only knows the words it was trained on).
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Word-analogy arithmetic: king - man + woman.
result = king - man + woman

In [25]:
# Check how close the analogy vector is to the vector for "queen".

from sklearn.metrics.pairwise import cosine_similarity

analogy_similarity = cosine_similarity([result], [queen])
analogy_similarity

array([[0.61780137]], dtype=float32)

✨ The result is above 0.5. By convention, a similarity above 0.5 is considered a semantic match. ✨