**Data preprocessing**

In [1]:
import io 
import os 
import sys 
import requests 
from collections import OrderedDict 
import math 
import random 
import numpy as np 
import paddle 
from paddle.nn import Embedding 
import paddle.nn.functional as F 
import paddle.nn as nn 

In [2]:
def download(corpus_url="https://dataset.bj.bcebos.com/word2vec/text8.txt", save_path="./text8.txt"):
    """Download the text8 corpus and store it on disk.

    Args:
        corpus_url: URL the corpus is fetched from.
        save_path: Local file the raw response bytes are written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    web_request = requests.get(corpus_url, timeout=60)
    # Fail loudly instead of silently saving an HTML error page as the corpus.
    web_request.raise_for_status()
    # The `with` block closes the file; no explicit close() is needed.
    with open(save_path, "wb") as f:
        f.write(web_request.content)

# Fetch the corpus to ./text8.txt (network access required).
download()

In [4]:
def load_text8(path="./text8.txt"):
    """Read the downloaded corpus into a single string.

    Args:
        path: Location of the corpus file written by the download step.

    Returns:
        The file content with leading/trailing newline characters removed.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip("\n")

# Load the raw text and preview its first 500 characters.
corpus = load_text8()
print(corpus[:500])

 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philoso


In [5]:
def data_preprocess(corpus):
    """Lower-case the raw text and split it into a list of word tokens.

    Args:
        corpus: The corpus as one string.

    Returns:
        A list of lower-cased tokens.
    """
    # split() without a separator splits on any run of whitespace, so repeated
    # spaces, tabs or newlines never yield empty or whitespace-polluted tokens.
    return corpus.lower().split()

# `corpus` is rebound from a string to a list of tokens here; preview the first 500.
corpus = data_preprocess(corpus) 
print(corpus[:500])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 'so

In [6]:
def build_dict(corpus):
    """Build the vocabulary tables from a tokenised corpus.

    Words are ranked by descending frequency and the rank is used as the id,
    so the most frequent word gets id 0. Words with equal frequency keep the
    order in which they first appear in the corpus.

    Args:
        corpus: Iterable of word tokens.

    Returns:
        A tuple ``(word2id_freq, word2id_dict, id2word_dict)`` mapping
        id -> frequency, word -> id and id -> word respectively.
    """
    counts = {}
    for token in corpus:
        counts[token] = counts.get(token, 0) + 1

    # Stable sort: ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    word2id_dict, word2id_freq, id2word_dict = {}, {}, {}
    for idx, (token, count) in enumerate(ranked):
        word2id_dict[token] = idx
        word2id_freq[idx] = count
        id2word_dict[idx] = token

    return word2id_freq, word2id_dict, id2word_dict

# Build the vocabulary tables and expose the vocabulary size as a notebook global.
word2id_freq, word2id_dict, id2word_dict = build_dict(corpus) 
vocab_size = len(word2id_freq) 
print("there are totally %d different words in the corpus" %vocab_size) 
# zip with range(50) limits the preview to the first 50 vocabulary entries.
for _, (word, word_id) in zip(range(50), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))

there are totally 253854 different words in the corpus
word the, its id 0, its word freq 1061396
word of, its id 1, its word freq 593677
word and, its id 2, its word freq 416629
word one, its id 3, its word freq 411764
word in, its id 4, its word freq 372201
word a, its id 5, its word freq 325873
word to, its id 6, its word freq 316376
word zero, its id 7, its word freq 264975
word nine, its id 8, its word freq 250430
word two, its id 9, its word freq 192644
word is, its id 10, its word freq 183153
word as, its id 11, its word freq 131815
word eight, its id 12, its word freq 125285
word for, its id 13, its word freq 118445
word s, its id 14, its word freq 116710
word five, its id 15, its word freq 115789
word three, its id 16, its word freq 114775
word was, its id 17, its word freq 112807
word by, its id 18, its word freq 111831
word that, its id 19, its word freq 109510
word four, its id 20, its word freq 108182
word six, its id 21, its word freq 102145
word seven, its id 22, its word

In [7]:
def convert_corpus_to_id(corpus, word2id_dict):
    """Translate every token of the corpus into its integer id.

    Args:
        corpus: Iterable of word tokens.
        word2id_dict: Mapping word -> id; every token must be a key.

    Returns:
        A list of ids in corpus order.
    """
    ids = []
    for token in corpus:
        ids.append(word2id_dict[token])
    return ids

# `corpus` is rebound once more: from a list of words to a list of word ids.
corpus = convert_corpus_to_id(corpus, word2id_dict) 
print(corpus[:50])

[5233, 3080, 11, 5, 194, 1, 3133, 45, 58, 155, 127, 741, 476, 10571, 133, 0, 27349, 1, 0, 102, 854, 2, 0, 15067, 58112, 1, 0, 150, 854, 3580, 0, 194, 10, 190, 58, 4, 5, 10712, 214, 6, 1324, 104, 454, 19, 58, 2731, 362, 6, 3672, 0]


In [8]:
def subsampling(corpus, word2id_freq):
    """Randomly drop tokens, dropping frequent words more often.

    A token with id ``w`` is discarded when a uniform draw from [0, 1] is
    smaller than ``1 - sqrt(1e-4 / word2id_freq[w] * len(corpus))``.

    Args:
        corpus: List of word ids.
        word2id_freq: Mapping id -> frequency.

    Returns:
        A new list with the surviving ids, in their original order.
    """
    total_tokens = len(corpus)

    def discard(word_id):
        # Draw first, then compute the threshold (one draw per token).
        draw = random.uniform(0, 1)
        drop_threshold = 1 - math.sqrt(1e-4 / word2id_freq[word_id] * total_tokens)
        return draw < drop_threshold

    kept = []
    for word_id in corpus:
        if discard(word_id):
            continue
        kept.append(word_id)
    return kept

# Subsample in place of the previous `corpus`; the result is random (no seed is set).
corpus = subsampling(corpus, word2id_freq) 
print("%d tokens in the corpus" % len(corpus)) 
print(corpus[:50])

8746467 tokens in the corpus
[5233, 3080, 3133, 741, 476, 10571, 27349, 1, 854, 15067, 58112, 854, 3580, 194, 58, 10712, 1324, 104, 454, 58, 2731, 362, 6, 3672, 708, 539, 11, 1423, 2757, 18, 686, 7088, 5233, 1052, 320, 248, 44611, 2877, 792, 5233, 200, 602, 1134, 2621, 8983, 279, 4147, 141, 6437, 4186]


In [9]:
def build_data(corpus, word2id_dict, word2id_freq, max_window_size = 3, negative_sample_num = 4):
    """Create skip-gram training triples with negative sampling.

    For every position a window size is drawn uniformly from
    ``[1, max_window_size]``. Each word inside the window yields one positive
    triple ``(center, context, 1)`` followed by ``negative_sample_num``
    negative triples ``(center, random_id, 0)`` whose id is not one of the
    window's context words.

    Args:
        corpus: List of word ids.
        word2id_dict: Mapping word -> id; its size bounds the negative ids.
        word2id_freq: Mapping id -> frequency (not used by this function).
        max_window_size: Upper bound of the random context window radius.
        negative_sample_num: Negative triples generated per positive triple.

    Returns:
        A list of ``(center_word, target_word, label)`` tuples.

    Note:
        Negative sampling retries until it finds an id outside the context
        window, so it does not terminate if every id of the vocabulary is
        inside the window.
    """
    # Derive the vocabulary size from the argument rather than from a notebook
    # global, so the function works regardless of cell execution order.
    vocab_size = len(word2id_dict)
    corpus_len = len(corpus)

    dataset = []

    for center_word_idx, center_word in enumerate(corpus):
        window_size = random.randint(1, max_window_size)

        # Clamp the window to the corpus boundaries.
        first = max(0, center_word_idx - window_size)
        last = min(corpus_len - 1, center_word_idx + window_size)
        positive_word_candidates = [corpus[idx] for idx in range(first, last + 1) if idx != center_word_idx]

        for positive_word in positive_word_candidates:
            dataset.append((center_word, positive_word, 1))

            sampled = 0
            while sampled < negative_sample_num:
                negative_word_candidate = random.randint(0, vocab_size - 1)
                # Reject ids that are genuine context words of this center.
                if negative_word_candidate not in positive_word_candidates:
                    dataset.append((center_word, negative_word_candidate, 0))
                    sampled += 1

    return dataset

# Only the first 20% of the subsampled corpus is turned into training triples.
corpus_light = corpus[:int(len(corpus)*0.2)]
dataset = build_data(corpus_light, word2id_dict, word2id_freq) 
# Preview the first 50 triples with ids mapped back to words.
for _, (center_word, target_word, label) in zip(range(50), dataset):
    print("center_word %s, target %s, label %d" % (id2word_dict[center_word], id2word_dict[target_word], label))

center_word anarchism, target originated, label 1
center_word anarchism, target hydroxyprogesterone, label 0
center_word anarchism, target volatilize, label 0
center_word anarchism, target drp, label 0
center_word anarchism, target maimana, label 0
center_word anarchism, target abuse, label 1
center_word anarchism, target jondo, label 0
center_word anarchism, target chock, label 0
center_word anarchism, target bmuld, label 0
center_word anarchism, target muirthemne, label 0
center_word originated, target anarchism, label 1
center_word originated, target electrocutions, label 0
center_word originated, target kuperjanov, label 0
center_word originated, target pueblos, label 0
center_word originated, target planetographic, label 0
center_word originated, target abuse, label 1
center_word originated, target standardised, label 0
center_word originated, target segregationalist, label 0
center_word originated, target discontinuous, label 0
center_word originated, target mississippian, label 

In [18]:
def build_batch(dataset, batch_size, epoch_num):
    """Yield mini-batches of training triples as numpy arrays.

    ``dataset`` is shuffled in place at the start of every epoch. Batches are
    filled continuously across epoch boundaries; whatever is left after the
    last epoch is yielded as one final, smaller batch.

    Args:
        dataset: List of ``(center_word, target_word, label)`` triples.
        batch_size: Number of triples per full batch.
        epoch_num: Number of passes over ``dataset``.

    Yields:
        ``(center_words, target_words, labels)`` where the first two are
        int64 arrays of shape ``(n, 1)`` and ``labels`` is a float32 array of
        shape ``(n,)``.
    """
    def as_arrays(center_rows, target_rows, label_values):
        return (
            np.array(center_rows).astype("int64"),
            np.array(target_rows).astype("int64"),
            np.array(label_values).astype("float32"),
        )

    centers, targets, labels = [], [], []
    for _ in range(epoch_num):
        random.shuffle(dataset)
        for center_word, target_word, label in dataset:
            centers.append([center_word])
            targets.append([target_word])
            labels.append(label)

            if len(centers) != batch_size:
                continue
            yield as_arrays(centers, targets, labels)
            centers, targets, labels = [], [], []

    # Flush the remainder that never reached a full batch.
    if centers:
        yield as_arrays(centers, targets, labels)

# Smoke test: pull batches of 128 from the generator and print only the first one.
for _, batch in zip(range(10), build_batch(dataset, 128, 3)):
    print(batch) 
    break 

(array([[143978],
       [  6194],
       [   435],
       [   312],
       [   585],
       [   690],
       [  1155],
       [   251],
       [  1842],
       [  4181],
       [   990],
       [  5724],
       [    41],
       [  3788],
       [     0],
       [  1503],
       [  2465],
       [   376],
       [  6976],
       [ 33276],
       [ 32033],
       [   258],
       [  1310],
       [   394],
       [   150],
       [    61],
       [    42],
       [ 52168],
       [   826],
       [159185],
       [  1506],
       [   474],
       [  6831],
       [   771],
       [     6],
       [  1983],
       [  8017],
       [  4552],
       [   105],
       [  1601],
       [    59],
       [   841],
       [  1330],
       [  1000],
       [  2038],
       [  1506],
       [   279],
       [  1888],
       [    65],
       [ 10180],
       [  3417],
       [    71],
       [    42],
       [   197],
       [  1164],
       [    66],
       [ 23540],
       [   390],
       [  213

**model construction**

In [19]:
class SkipGram(nn.Layer):
    """Skip-gram model with two embedding tables trained as a binary classifier.

    ``embedding`` holds the vectors of center words and ``embedding_out`` the
    vectors of target (context / negative) words. The score of a pair is the
    dot product of the two vectors.
    """

    def __init__(self, vocab_size, embedding_size, init_scale=0.1):
        """Create both embedding tables.

        Args:
            vocab_size: Number of rows of each embedding table.
            embedding_size: Dimension of each word vector.
            init_scale: Weights are initialised uniformly in
                ``[-init_scale, init_scale]``.
        """
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.embedding = Embedding(
            num_embeddings = self.vocab_size,
            embedding_dim = self.embedding_size,
            weight_attr = paddle.ParamAttr(
                initializer = paddle.nn.initializer.Uniform(
                    low = -init_scale,
                    high = init_scale
                )
            )
        )
        # Wrap the initializer in ParamAttr exactly like `embedding` above so
        # both tables are configured the same way.
        self.embedding_out = Embedding(
            num_embeddings = self.vocab_size,
            embedding_dim = self.embedding_size,
            weight_attr = paddle.ParamAttr(
                initializer = paddle.nn.initializer.Uniform(
                    low = -init_scale,
                    high = init_scale
                )
            )
        )

    def forward(self, center_words, target_words, label):
        """Score center/target pairs and compute the training loss.

        Args:
            center_words: Tensor of center word ids.
            target_words: Tensor of target word ids, same shape as
                ``center_words``.
            label: Tensor of 0/1 labels, one per pair.
                # NOTE(review): assumes ids arrive as [batch, 1] and labels as
                # [batch], as produced by build_batch — confirm for other callers.

        Returns:
            ``(pred, loss)``: the sigmoid of the pair scores and the mean
            binary cross-entropy between the raw scores and ``label``.
        """
        center_words_emb = self.embedding(center_words)
        target_words_emb = self.embedding_out(target_words)
        # Element-wise product summed over the last axis == dot product.
        word_sim = paddle.multiply(center_words_emb, target_words_emb)
        word_sim = paddle.sum(word_sim, axis=-1)
        # Flatten to one score per pair so it lines up with `label`.
        word_sim = paddle.reshape(word_sim, shape=[-1])
        pred = F.sigmoid(word_sim)
        # The loss takes the raw scores (logits), not `pred`.
        loss = F.binary_cross_entropy_with_logits(word_sim, label)
        loss = paddle.mean(loss)
        return pred, loss


In [20]:
# Training configuration.
batch_size = 512 
epoch_num = 3 
embedding_size = 200 
# Counter of optimizer steps taken; incremented in the training loop below.
step = 0 
learning_rate = 1e-3

def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` words whose vectors are most cosine-similar to a query word.

    Relies on the notebook-level ``word2id_dict`` and ``id2word_dict``.

    Args:
        query_token: Word to look up; must be a key of ``word2id_dict``.
        k: Number of neighbours to print (the query word itself is included).
        embed: Embedding weight exposing ``.numpy()``, one row per word id.
    """
    weights = embed.numpy()
    query_vec = weights[word2id_dict[query_token]]

    dot_products = np.dot(weights, query_vec)
    # The 1e-9 keeps the denominator away from zero for all-zero rows.
    norm_product = np.sqrt(np.sum(weights * weights, axis=1) * np.sum(query_vec * query_vec) + 1e-9)
    scores = (dot_products / norm_product).flatten()

    # argpartition picks the k best in arbitrary order; argsort then ranks them.
    top_k = np.argpartition(scores, -k)[-k:]
    top_k = top_k[np.argsort(-scores[top_k])]
    for word_id in top_k:
        print('for word %s, the similar word is %s' % (query_token, str(id2word_dict[word_id])))

# Build the model and an Adam optimizer over all of its parameters.
skip_gram_model = SkipGram(vocab_size, embedding_size) 
adam = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=skip_gram_model.parameters()) 

# One optimizer step per mini-batch produced by build_batch.
for center_words, target_words, label in build_batch(dataset, batch_size, epoch_num):
    center_words_var = paddle.to_tensor(center_words) 
    target_words_var = paddle.to_tensor(target_words) 
    label_var = paddle.to_tensor(label) 

    pred, loss = skip_gram_model(center_words_var, target_words_var, label_var) 
    loss.backward() 
    adam.step() 
    adam.clear_grad() 

    step += 1 
    # Report the loss every 1000 steps.
    if step % 1000 == 0:
        # NOTE(review): indexing [0] presumes the loss comes back as a 1-element array — confirm for the installed Paddle version.
        print("step %d, loss %.3f" % (step, loss.numpy()[0])) 
    # Every 10000 steps, print nearest neighbours of a few probe words.
    if step % 10000 == 0:
        get_similar_tokens('movie', 5, skip_gram_model.embedding.weight) 
        get_similar_tokens('one', 5, skip_gram_model.embedding.weight) 
        get_similar_tokens('chip', 5, skip_gram_model.embedding.weight)         

W0520 16:12:37.662643 11747 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0520 16:12:37.667212 11747 gpu_context.cc:306] device: 0, cuDNN Version: 7.6.


step 1000, loss 0.692
step 2000, loss 0.686
step 3000, loss 0.628
step 4000, loss 0.514
step 5000, loss 0.410
step 6000, loss 0.286
step 7000, loss 0.251
step 8000, loss 0.249
step 9000, loss 0.265
step 10000, loss 0.205
for word movie, the similar word is movie
for word movie, the similar word is fit
for word movie, the similar word is encoding
for word movie, the similar word is abundant
for word movie, the similar word is reached
for word one, the similar word is one
for word one, the similar word is nine
for word one, the similar word is people
for word one, the similar word is fleet
for word one, the similar word is scholars
for word chip, the similar word is chip
for word chip, the similar word is varying
for word chip, the similar word is independence
for word chip, the similar word is relations
for word chip, the similar word is attend
step 11000, loss 0.198
step 12000, loss 0.232
step 13000, loss 0.193
step 14000, loss 0.164
step 15000, loss 0.223
step 16000, loss 0.235
step 1

**GloVe**

In [22]:
from collections import Counter 
from itertools import chain 
import numpy as np 
import os 
import time 
from multiprocessing import pool, cpu_count 

In [None]:
class CoOccur:
    """Sparse, distance-weighted co-occurrence table stored as a dict of dicts."""

    def __init__(self):
        # cooccur[w1][w2] -> accumulated 1/distance weight of the ordered pair (w1, w2).
        self.cooccur = {}

    def pair(self, w1, w2, dis):
        """Record one co-occurrence of ``w2`` with ``w1``.

        Args:
            w1: First word (outer key).
            w2: Second word (inner key).
            dis: Distance between the two words; ``1.0 / dis`` is added to
                the pair's weight.
        """
        row = self.cooccur.setdefault(w1, {})
        row[w2] = row.get(w2, 0.0) + 1.0 / dis

    def check(self, w1, w2):
        """Look up the accumulated weight of the pair ``(w1, w2)``.

        This is a pure lookup: it never modifies the table.

        Returns:
            The stored weight, or ``None`` when the pair was never recorded.
        """
        row = self.cooccur.get(w1)
        if row is None:
            return None
        return row.get(w2)

    def get_pairs(self):
        # NOTE(review): this method is unfinished. `num_saved` and `cache_path`
        # are never assigned in __init__, so calling it raises AttributeError,
        # and the opened file is neither read nor closed.
        if self.num_saved > 0:
            for i in range(1, self.num_saved + 1):
                f = open(self.cache_path + '/buffer2bin_' + str(i) + '.bin')

**visualization**

In [2]:
from paddlenlp.embeddings import TokenEmbedding 

# Load a pretrained embedding table by name and print its summary.
token_embedding = TokenEmbedding(embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300") 

print(token_embedding)

100%|██████████| 694483/694483 [00:44<00:00, 15677.96it/s]
[2022-05-20 17:30:04,131] [    INFO] - Loading token embedding...
W0520 17:30:08.293259 20581 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0520 17:30:08.297292 20581 gpu_context.cc:306] device: 0, cuDNN Version: 7.6.
[2022-05-20 17:30:12,404] [    INFO] - Finish loading embedding vector.
[2022-05-20 17:30:12,407] [    INFO] - Token Embedding info:             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Shape :[635965, 300]


Object   type: TokenEmbedding(635965, 300, padding_idx=635964, sparse=False)             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Parameter containing:
Tensor(shape=[635965, 300], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [[-0.24200200,  0.13931701,  0.07378800, ...,  0.14103900,
          0.05592300, -0.08004800],
        [-0.08671700,  0.07770800,  0.09515300, ...,  0.11196400,
          0.03082200, -0.12893000],
        [-0.11436500,  0.12201900,  0.02833000, ...,  0.11068700,
          0.03607300, -0.13763499],
        ...,
        [ 0.02628800, -0.00008300, -0.00393500, ...,  0.00654000,
          0.00024600, -0.00662600],
        [ 0.00930271, -0.00925986, -0.00438002, ...,  0.01524009,
         -0.00773855,  0.00857220],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])


In [12]:
!pip show visualdl

Name: visualdl
Version: 2.2.3
Summary: Visualize Deep Learning
Home-page: UNKNOWN
Author: PaddlePaddle and Echarts team
Author-email: 
License: Apache License
Location: /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages
Requires: bce-python-sdk, flake8, flask, Flask-Babel, matplotlib, numpy, pandas, Pillow, pre-commit, protobuf, requests, shellcheck-py, six
Required-by: paddlehub


In [3]:
# Look up the embedding vector of a single token and print it.
test_token_embedding = token_embedding.search("中国") 
print(test_token_embedding)

[[ 0.260801  0.1047    0.129453 -0.257317 -0.16152   0.19567  -0.074868
   0.361168  0.245882 -0.219141 -0.388083  0.235189  0.029316  0.154215
  -0.354343  0.017746  0.009028  0.01197  -0.121429  0.096542  0.009255
   0.039721  0.363704 -0.239497 -0.41168   0.16958   0.261758  0.022383
  -0.053248 -0.000994 -0.209913 -0.208296  0.197332 -0.3426   -0.162112
   0.134557 -0.250201  0.431298  0.303116  0.517221  0.243843  0.022219
  -0.136554 -0.189223  0.148563 -0.042963 -0.456198  0.14546  -0.041207
   0.049685  0.20294   0.147355 -0.206953 -0.302796 -0.111834  0.128183
   0.289539 -0.298934 -0.096412  0.063079  0.324821 -0.144471  0.052456
   0.088761 -0.040925 -0.103281 -0.216065 -0.200878 -0.100664  0.170614
  -0.355546 -0.062115 -0.52595  -0.235442  0.300866 -0.521523 -0.070713
  -0.331768  0.023021  0.309111 -0.125696  0.016723 -0.0321   -0.200611
   0.057294 -0.128891 -0.392886  0.423002  0.282569 -0.212836  0.450132
   0.067604 -0.124928 -0.294086  0.136479  0.091505 -0.061723 -0

In [4]:
# Compare the cosine similarity of the first pair of tokens against the second pair.
score1 = token_embedding.cosine_sim("女孩", "女人")
score2 = token_embedding.cosine_sim("女孩", "书籍")
print("score1:", score1)
# Label fixed: it previously read "socre2".
print("score2:", score2)

score1: 0.7017183
socre2: 0.19189896


In [10]:
# range(0, 1) selects only id 0, so a single token is exported.
labels = token_embedding.vocab.to_tokens(list(range(0, 1))) 
test_token_embedding = token_embedding.search(labels) 
from visualdl import LogWriter 
# Write the vectors and their token labels to the ./token_hidi log directory.
with LogWriter(logdir='./token_hidi') as writer: 
    writer.add_embeddings(tag="test", mat=[i for i in test_token_embedding], metadata=labels)