<a href="https://colab.research.google.com/github/Q-nicorn1/Sentence-Word-Embedding-/blob/main/Doc2Vec%26SentenceBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Doc2Vec**

In [None]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' package; newer releases look for 'punkt_tab' instead, so fetch both
# to keep this cell working on a fresh kernel with either.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
import numpy as np

# Input corpus: four sentences using "glass"/"glasses" in different senses.
sentences = ["Did you watch the movie Glass?",
             "Be careful, I just broke a glass cup",
             "This window is made of glass.",
             "He doesn't usually wear glasses."]

# Lower-case every sentence and split it into tokens.
tokenized_sent = [word_tokenize(s.lower()) for s in sentences]
tokenized_sent

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['did', 'you', 'watch', 'the', 'movie', 'glass', '?'],
 ['be', 'careful', ',', 'i', 'just', 'broke', 'a', 'glass', 'cup'],
 ['this', 'window', 'is', 'made', 'of', 'glass', '.'],
 ['he', 'does', "n't", 'usually', 'wear', 'glasses', '.']]

In [None]:
# Install TF-Hub&libraries
!pip3 install --upgrade tensorflow-gpu
!pip3 install tensorflow-hub

Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.7.0-cp37-cp37m-manylinux2010_x86_64.whl (489.6 MB)
[K     |████████████████████████████████| 489.6 MB 19 kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.7.0


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap each tokenized sentence in a TaggedDocument whose single tag is the
# sentence's position in the corpus.
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized_sent)
]
tagged_data


[TaggedDocument(words=['did', 'you', 'watch', 'the', 'movie', 'glass', '?'], tags=[0]),
 TaggedDocument(words=['be', 'careful', ',', 'i', 'just', 'broke', 'a', 'glass', 'cup'], tags=[1]),
 TaggedDocument(words=['this', 'window', 'is', 'made', 'of', 'glass', '.'], tags=[2]),
 TaggedDocument(words=['he', 'does', "n't", 'usually', 'wear', 'glasses', '.'], tags=[3])]

In [None]:

# Train a small Doc2Vec model on the tagged sentences.
#   vector_size = Dimensionality of the feature vectors.
#   window      = The maximum distance between the current and predicted word
#                 within a sentence.
#   min_count   = Ignores all words with total frequency lower than this.
#   epochs      = Number of training passes over the corpus.
#   alpha       = The initial learning rate (not passed here, so the library
#                 default applies).
#   seed, workers = A fixed seed together with a single worker thread removes
#                 thread-scheduling jitter between runs; reproducibility across
#                 interpreter launches additionally depends on PYTHONHASHSEED.
model = Doc2Vec(tagged_data, vector_size = 15, window = 2, min_count = 1,
                epochs = 100, seed = 1, workers = 1)

## Print model vocabulary
# gensim 4 replaced `wv.vocab` with `wv.key_to_index` (accessing `vocab` there
# raises AttributeError); fall back to the old attribute on gensim 3.x.
model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab

{',': <gensim.models.keyedvectors.Vocab at 0x7f16e1ff0790>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec2d0>,
 '?': <gensim.models.keyedvectors.Vocab at 0x7f16e03ece10>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec090>,
 'be': <gensim.models.keyedvectors.Vocab at 0x7f16e03ecd50>,
 'broke': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec050>,
 'careful': <gensim.models.keyedvectors.Vocab at 0x7f16e1ff0410>,
 'cup': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec0d0>,
 'did': <gensim.models.keyedvectors.Vocab at 0x7f16e1ff0510>,
 'does': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec350>,
 'glass': <gensim.models.keyedvectors.Vocab at 0x7f16e03ece50>,
 'glasses': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec390>,
 'he': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec310>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec450>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec190>,
 'just': <gensim.models.keyedvectors.Vocab at 0x7f16e03ec250>,
 'made'

In [None]:
# Infer a vector for an unseen sentence, then rank the training sentences by
# their similarity to it.
test_doc = word_tokenize("This glass cup is fragile".lower())
test_doc_vector = model.infer_vector(test_doc)
# gensim 4 renamed `docvecs` to `dv` and deprecated the old name; fall back to
# `docvecs` on gensim 3.x, where `dv` does not exist.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
doc_vectors.most_similar(positive = [test_doc_vector])


[(0, -0.05732543021440506),
 (2, -0.07792545855045319),
 (1, -0.10838853567838669),
 (3, -0.13605502247810364)]

# **Sentence-BERT**

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.9 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 16.2 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 23.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 417 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint used for the sentence embeddings.
SBERT_MODEL_NAME = 'bert-base-nli-mean-tokens'

# Load the model.
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encode every sentence of the corpus with the SBERT model in one call.
sentence_embeddings = sbert_model.encode(sentences=sentences)

# Optional inspection of the first embedding (left disabled):
#print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))
#print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

In [None]:


# Cosine similarity between two vectors.
def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)``.

    Args:
        u: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        v: Second vector, same length as ``u``.

    Returns:
        The cosine of the angle between ``u`` and ``v``. If either vector has
        zero norm the angle is undefined; ``0.0`` is returned in that case
        instead of dividing by zero (which would yield ``nan`` and a
        RuntimeWarning).
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product



In [None]:
# Encode the query as a one-element batch and unpack its single row.
query = "This glass cup is fragile"
(query_vec,) = sbert_model.encode([query])

In [None]:
# Compare the query with every corpus sentence. The corpus was already encoded
# once into `sentence_embeddings`, so reuse those vectors instead of running
# the model again for each sentence inside the loop.
for sent, sent_vec in zip(sentences, sentence_embeddings):
  sim = cosine(query_vec, sent_vec)
  print("Sentence = ", sent, "; similarity = ", sim)

Sentence =  Did you watch the movie Glass? ; similarity =  0.2366257
Sentence =  Be careful, I just broke a glass cup ; similarity =  0.46764034
Sentence =  This window is made of glass. ; similarity =  0.6351793
Sentence =  He doesn't usually wear glasses. ; similarity =  0.6418279
