In [49]:
"""
This is a practice about how to use Word2Vec as a Embedding layer.
Teng Li
30.09.2021
"""

'\nThis is a practice about how to use Word2Vec as a Embedding layer.\nTeng Li\n30.09.2021\n'

In [1]:
'''
First of all Let's try to build a Embedding layer.
'''
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

word_to_ix = {"Li": 0, "Teng": 1} # init a dictionary
embeds = nn.Embedding(2, 3)  # 2 words in vocab, 3 dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["Li"]], dtype=torch.long)
Li_embed = embeds(lookup_tensor)
print(Li_embed)

tensor([[-0.6253,  1.0858, -0.7127]], grad_fn=<EmbeddingBackward>)


In [2]:
# we can see the weights of Embedding layers
print(embeds.weight)

Parameter containing:
tensor([[-0.6253,  1.0858, -0.7127],
        [-0.1916,  0.6963, -1.6411]], requires_grad=True)


In [3]:
# Now Let's try to init our Embedding layer with a known weights
w = torch.FloatTensor([[1.1, 2.8, 8],
                       [3, 4, 5]])
embeds = nn.Embedding.from_pretrained(w)
Li_embed = embeds(lookup_tensor)
print(Li_embed)

tensor([[1.1000, 2.8000, 8.0000]])


In [None]:
# now let's get the weights from trained model(word2vec here)
# first of all we need to download word2vec
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [4]:
import gensim
# then we can load word2vec
model = gensim.models.KeyedVectors.load_word2vec_format('/home/teng/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz',
                                                        binary=True,limit=1000)

In [5]:
# we can also lookup a vector of word
love_vec = model['love']
print(love_vec.shape)
#print(love_vec)
# lets see which words are most similar to 'love'
#love_similar = model.most_similar(['love'],topn=5)
#print(love_similar)
# which words are similar to 'king'-'man'+'woman'?
#print(model.most_similar(positive=['king','woman'],negative=['man'],topn=5))
# and 'student' - 'book'
#print(model.most_similar(negative=['student','book'],topn=5))

(300,)


In [16]:
# the weights of Embedding layer are
W2V_weights = torch.FloatTensor(model.vectors)
# so the our Embedding layer could be
embeds = nn.Embedding.from_pretrained(W2V_weights)
# the vocab(word >index) is 
Vocab = model.wv.vocab
# e.g the words are 
t = 'boy fall in love with girl'
words = t.split()
print(words)
# the index of words can be found by
L = len(words)
words_id = torch.LongTensor(L)
for l in range(L):
    word = words[l]
    words_id[l] = torch.tensor(Vocab[word].index)
print('words_id:',words_id)
# and the words_vec are
words_vec = embeds(words_id)
print(words_vec.size())

['boy', 'fall', 'in', 'love', 'with', 'girl']


  


KeyError: 'boy'

In [17]:
print(Vocab['like'])
#print(type(Vocab))
# now try to use index to find word
Vocab_list = list(Vocab.keys())
print(Vocab_list[87])

Vocab(count:913, index:87)
like


In [154]:
# But we can find that some words are not in this Vocab
"'s" not in Vocab

True

In [84]:
# let's check which words not in vocab
def check_word(data,vocab):
    oov = set()
    oov_num = 0
    sum_num = len(data)
    for word in data:
        if word not in vocab:
            oov.add(word)
            oov_num += 1
    print('word not in vocab:{:.2%}'.format(oov_num/sum_num))
    print(oov)
# e.g the sentence
t = 'a boy fall in love with a girl'
s = t.split()
check_word(s,Vocab)

word not in vocab:25.00%
{'a'}


In [130]:
#so we need to clean this dataset
#we will remove all the punctuation and embedd all the unknow words with zero vector. 
#first add 'unk' to our model
model['unk'] = np.zeros(300,dtype=np.float32)
#print(model['unk'])
print(Vocab['unk'])
print(Vocab['boy'])

Vocab(count:1, index:10000)
Vocab(count:8444, index:1556)


In [137]:
'&' in Vocab

True

In [49]:
'''
def word2index(docs,vocab):
    for i,doc in enumerate(docs):
        L = len(doc)
        words_id = torch.LongTensor(L)
        for l in range(L):
            word = doc[l]
            words_id[l] = torch.tensor(vocab[word].index)
        print(words_id)
        docs[i] = words_id
    return docs
'''


In [1]:
d = ['boy','love','girl']
     

In [10]:
def word2index(doc):
    L = len(words)
    words_id = torch.LongTensor(L)
    for l in range(L):
        word = words[l]
        words_id[l] = torch.tensor(Vocab[word].index)
    print('words_id:',words_id)
    return words_id

In [11]:
print(word2index(d))

words_id: tensor([1556,  707,    1,  746,    8, 1408])
tensor([1556,  707,    1,  746,    8, 1408])


In [59]:
a = word2index(d,Vocab)

tensor([1556,  746, 1408])
tensor([6147,    8, 1556])
tensor([ 28,   4,  11, 889])


In [52]:
print(a)

[tensor([1556,  746, 1408]), tensor([6147,    8, 1556])]


In [53]:
print(type(a))

<class 'list'>
