In [1]:
import collections
import d2lzh as d2l
import math
from mxnet import autograd,gluon,nd
from mxnet.gluon import data as gdata,loss as gloss,nn
import random
import sys
import time
import zipfile
import numpy as np

In [2]:
def skip_gram(center,context_and_negatives,embed_v,embed_u):
    """Score center words against their context/negative candidates.

    center: indices looked up through `embed_v`.
    context_and_negatives: indices looked up through `embed_u`.
    embed_v, embed_u: callables mapping indices to vectors.

    Returns the batched product of the center vectors with the candidate
    vectors after swapping the candidates' axes 1 and 2.
    NOTE(review): assumes both lookups yield 3-D arrays, i.e. the index
    inputs are 2-D (batch, n) - confirm against the caller.
    """
    center_vecs=embed_v(center)
    candidate_vecs=embed_u(context_and_negatives)
    # Move the embedding axis of the candidates in front of the candidate axis
    # so that batch_dot contracts over the embedding dimension.
    candidates_t=candidate_vecs.swapaxes(1,2)
    return nd.batch_dot(center_vecs,candidates_t)

In [3]:
# Gluon sigmoid binary cross-entropy loss; train() calls it with the
# predictions, the labels and the padding mask.
loss=gloss.SigmoidBinaryCrossEntropyLoss()

In [4]:
def sigmd(x):
    """Return -log(sigmoid(x)) for a scalar x.

    Despite the name this is the negative log of the sigmoid, not the sigmoid
    itself. Mathematically -log(1/(1+e^-x)) == log(1+e^-x); it is evaluated
    in that softplus form so that math.exp is only ever called with a
    non-positive argument. The naive formula raises OverflowError for large
    negative x (around x < -709).
    """
    if x>=0:
        # e^-x <= 1 here, so exp cannot overflow.
        return math.log1p(math.exp(-x))
    # For x < 0: log(1+e^-x) = -x + log(1+e^x), and e^x < 1.
    return -x+math.log1p(math.exp(x))

In [5]:
def read_file_data(filePath):
    """Read a whitespace-tokenised corpus and build its vocabulary.

    Every line of the file is one sentence; tokens are separated by whitespace.

    filePath: path of the text file to read.

    Returns a 5-tuple:
        idx_to_token: list of distinct tokens in first-seen order.
        token_to_idx: dict mapping each token to its position in idx_to_token.
        dataset: the sentences with every token replaced by its index
            (blank lines become empty lists).
        counter: collections.Counter of token occurrences.
        num_tokens: total number of tokens in `dataset`.
    """
    # The context manager closes the handle even if reading raises; the
    # previous bare open() left the file open.
    with open(filePath) as f:
        raw_dataset=[line.split() for line in f]
    counter=collections.Counter(tk for st in raw_dataset for tk in st)
    # A minimum-frequency filter was tried here and is currently disabled:
    #counter=dict(filter(lambda x:x[1]>=5,counter.items()))
    idx_to_token=list(counter)
    token_to_idx={tk:idx for idx,tk in enumerate(idx_to_token)}
    # The membership test is a no-op while every token is in the vocabulary;
    # it only matters if the frequency filter above is re-enabled.
    dataset=[[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
    num_tokens=sum(len(st) for st in dataset)
    return idx_to_token,token_to_idx,dataset,counter,num_tokens

In [None]:
# Load the tokenised training corpus and build the vocabulary, then print the
# total token count and the first ten index-encoded sentences as a sanity check.
idx_to_token,token_to_idx,dataset,counter,num_tokens=read_file_data("data/wordTrain/train_cnn.lex")
print(num_tokens)
print(dataset[0:10])
token_to_idx

In [80]:
def discard(idx):
    """Randomly decide whether token index `idx` is dropped when subsampling.

    Reads the notebook globals `idx_to_token`, `counter` and `num_tokens`.
    Returns True when a uniform draw from [0,1] is below
    1 - sqrt(1e-4 / count(token) * num_tokens), so tokens with a higher count
    are dropped more often and sufficiently rare tokens are never dropped.
    """
    draw=random.uniform(0,1)
    token=idx_to_token[idx]
    keep_ratio=math.sqrt(1e-4/counter[token]*num_tokens)
    return draw<1-keep_ratio

In [81]:
# Drop tokens at random according to discard(); sentences may become empty.
subsampled_dataset=[[tk for tk in st if not discard(tk)] for st in dataset]
# Same data with the emptied sentences removed, for inspection.
subsampled_dataset2=[x for x in subsampled_dataset if x!=[]]
print(subsampled_dataset[0:10])
# Keep the post-subsampling count under its own name: discard() reads the
# global num_tokens, so overwriting it here would change the discard
# probabilities whenever this cell is run again.
num_subsampled_tokens=sum(len(st) for st in subsampled_dataset)
print(subsampled_dataset2[0:10])
print(num_subsampled_tokens)

[[], [3, 4], [], [], [], [], [13], [], [], []]
[[3, 4], [13], [19], [21], [25], [26], [26], [5], [10], [21]]
17166


In [82]:
def get_centers_and_contexts(dataset,max_windows_size):
    """Extract skip-gram (center, context) pairs from index-encoded sentences.

    dataset: iterable of sentences, each a list of token indices.
    max_windows_size: upper bound for the per-center window radius, which is
        drawn uniformly from 1..max_windows_size.

    Sentences with fewer than two tokens are skipped because they cannot form
    a pair. Returns (centers, contexts): `centers` is the flat list of center
    tokens and `contexts[i]` lists the tokens inside the window around
    `centers[i]`, excluding the center position itself.
    """
    centers=[]
    contexts=[]
    for sentence in dataset:
        length=len(sentence)
        if length<2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            radius=random.randint(1,max_windows_size)
            first=max(0,pos-radius)
            stop=min(length,pos+1+radius)
            contexts.append([sentence[j] for j in range(first,stop) if j!=pos])
    return centers,contexts

In [83]:
# Build the (center, contexts) training pairs from the subsampled sentences,
# using a maximum window size of 5.
all_centers,all_contexts=get_centers_and_contexts(subsampled_dataset,5)

In [84]:
def get_negatives(all_contexts,sampling_weights,K):
    """Draw K negative samples for every context word of every center.

    all_contexts: list of context lists, one per center word.
    sampling_weights: relative sampling weight of each token index; index i
        of this sequence is the weight of token i.
    K: number of negatives to draw per context word.

    Returns a list parallel to `all_contexts`; entry i holds
    len(all_contexts[i])*K token indices, none of which occurs in
    all_contexts[i].
    NOTE: if every index with a positive weight is contained in one context
    list, no candidate is ever accepted and the loop does not terminate.
    """
    all_negatives,neg_candidates,i=[],[],0
    population=list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set once per center instead of once per
        # candidate draw.
        excluded=set(contexts)
        wanted=len(contexts)*K
        negatives=[]
        while len(negatives)<wanted:
            if i==len(neg_candidates):
                # Candidates are drawn in bulk (1e5 at a time) and consumed
                # one by one; refill when the cache is exhausted.
                i,neg_candidates=0,random.choices(population,sampling_weights,k=int(1e5))
            neg,i=neg_candidates[i],i+1
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

In [85]:
# Negative-sampling weights: each token's count raised to the power 0.75,
# listed in vocabulary-index order.
sampling_weights=[counter[W]**0.75 for W in idx_to_token]
# Draw 5 negatives per context word.
all_negatives=get_negatives(all_contexts,sampling_weights,5)
#all_negatives

In [86]:
def batchify(data):
    """Collate (center, context, negatives) examples into padded arrays.

    data: sequence of (center, context, negatives) triples, where `context`
        and `negatives` are lists of token indices. It is traversed twice.

    Each example's context and negatives are concatenated and right-padded
    with 0 up to the longest combined length in the batch. Returns a 4-tuple
    of arrays built with nd.array:
        centers reshaped to a single column,
        the padded context+negative indices,
        masks (1 for real entries, 0 for padding),
        labels (1 for context positions, 0 for negatives and padding).
    """
    max_len=max(len(ctx)+len(neg) for _,ctx,neg in data)
    centers=[]
    contexts_negatives=[]
    masks=[]
    labels=[]
    for center,context,negatives in data:
        n_real=len(context)+len(negatives)
        n_pad=max_len-n_real
        centers.append(center)
        contexts_negatives.append(context+negatives+[0]*n_pad)
        masks.append([1]*n_real+[0]*n_pad)
        labels.append([1]*len(context)+[0]*(max_len-len(context)))
    center_column=nd.array(centers).reshape((-1,1))
    return (center_column,nd.array(contexts_negatives),nd.array(masks),nd.array(labels))

In [87]:
batch_size=512
# Use no loader worker processes on Windows, 4 elsewhere.
num_workers=0 if sys.platform.startswith('win32') else 4
# Bind the ArrayDataset to its own name: `dataset` already holds the
# index-encoded sentences that the subsampling cell reads, and rebinding it
# here would break that cell on any later re-run.
train_dataset=gdata.ArrayDataset(all_centers,all_contexts,all_negatives)
data_iter=gdata.DataLoader(train_dataset,batch_size,shuffle=True,batchify_fn=batchify,num_workers=num_workers)

In [88]:
# Two embedding tables over the same vocabulary, each of width embed_size.
# train() passes net[0] to skip_gram as embed_v (center words) and net[1] as
# embed_u (context and negative words).
embed_size=20
net=nn.Sequential()
net.add(nn.Embedding(input_dim=len(idx_to_token),output_dim=embed_size),
        nn.Embedding(input_dim=len(idx_to_token),output_dim=embed_size))

In [89]:
def train(net,lr,num_epochs):
    """Train the embedding layers in `net` with the Adam optimiser.

    Reads the notebook globals `data_iter`, `loss` and `batch_size`.

    net: block whose children 0 and 1 are passed to skip_gram as embed_v and
        embed_u; its parameters are re-initialised on every call.
    lr: learning rate handed to the trainer.
    num_epochs: number of passes over `data_iter`.

    After each epoch prints the summed loss divided by the number of loss
    entries, and the wall-clock time of the epoch.
    """
    ctx=d2l.try_gpu()
    # force_reinit=True re-initialises parameters even if they were
    # initialised by an earlier call.
    net.initialize(ctx=ctx,force_reinit=True)
    optimizer=gluon.Trainer(net.collect_params(),'adam',{'learning_rate':lr})
    for epoch in range(num_epochs):
        epoch_start=time.time()
        loss_total=0.0
        entry_count=0
        for batch in data_iter:
            center,context_negative,mask,label=(arr.as_in_context(ctx) for arr in batch)
            with autograd.record():
                pred=skip_gram(center,context_negative,net[0],net[1])
                masked_loss=loss(pred.reshape(label.shape),label,mask)
                # Rescale each row by (row length / number of unmasked
                # entries) - presumably so padding does not dilute the
                # per-example average.
                l=masked_loss*mask.shape[1]/mask.sum(axis=1)
            l.backward()
            optimizer.step(batch_size)
            loss_total+=l.sum().asscalar()
            entry_count+=l.size
        print('epoch %d, loss %.2f,time %.2fs'%(epoch+1,loss_total/entry_count,time.time()-epoch_start))

In [90]:
# Train for 100 epochs with a learning rate of 0.005.
train(net,0.005,100)

epoch 1, loss 0.69,time 0.23s
epoch 2, loss 0.64,time 0.22s
epoch 3, loss 0.55,time 0.20s
epoch 4, loss 0.46,time 0.20s
epoch 5, loss 0.43,time 0.20s
epoch 6, loss 0.43,time 0.22s
epoch 7, loss 0.43,time 0.21s
epoch 8, loss 0.42,time 0.21s
epoch 9, loss 0.42,time 0.24s
epoch 10, loss 0.42,time 0.21s
epoch 11, loss 0.42,time 0.19s
epoch 12, loss 0.42,time 0.22s
epoch 13, loss 0.41,time 0.22s
epoch 14, loss 0.41,time 0.19s
epoch 15, loss 0.40,time 0.21s
epoch 16, loss 0.40,time 0.19s
epoch 17, loss 0.40,time 0.20s
epoch 18, loss 0.39,time 0.20s
epoch 19, loss 0.39,time 0.22s
epoch 20, loss 0.38,time 0.21s
epoch 21, loss 0.38,time 0.20s
epoch 22, loss 0.38,time 0.20s
epoch 23, loss 0.38,time 0.22s
epoch 24, loss 0.38,time 0.20s
epoch 25, loss 0.37,time 0.21s
epoch 26, loss 0.37,time 0.22s
epoch 27, loss 0.37,time 0.20s
epoch 28, loss 0.37,time 0.20s
epoch 29, loss 0.37,time 0.21s
epoch 30, loss 0.37,time 0.20s
epoch 31, loss 0.36,time 0.19s
epoch 32, loss 0.36,time 0.21s
epoch 33, loss 0.

In [91]:
def get_similar_tokens(query_token,k,embed):
    """Print the k tokens whose embeddings are most cosine-similar to a query.

    Reads the notebook globals `token_to_idx` and `idx_to_token`.

    query_token: token to look up; must be a key of `token_to_idx`.
    k: number of neighbours to print.
    embed: embedding layer whose weight matrix supplies the vectors.

    The top k+1 indices are taken and the first is skipped - presumably the
    query token itself, which has the highest similarity to its own vector.
    """
    weights=embed.weight.data()
    query_vec=weights[token_to_idx[query_token]]
    dots=nd.dot(weights,query_vec)
    # 1e-9 keeps the denominator non-zero when a vector has zero norm.
    norms=(nd.sum(weights*weights,axis=1)*nd.sum(query_vec*query_vec)+1e-9).sqrt()
    cos=dots/norms
    ranked=nd.topk(cos,k=k+1,ret_typ='indices').asnumpy().astype('int32')
    for idx in ranked[1:]:
        print('cosine sim=%.3f:%s'%(cos[idx].asscalar(),(idx_to_token[idx])))

In [96]:
# Show the 20 nearest neighbours of 'union' in the first embedding table
# (net[0], the one train() uses for center words).
get_similar_tokens('union',20,net[0])

cosine sim=0.826:struct
cosine sim=0.817:enum_name
cosine sim=0.794:enum
cosine sim=0.767:typedef
cosine sim=0.706:struct_name
cosine sim=0.702:*
cosine sim=0.682:struct_var
cosine sim=0.562:fflush
cosine sim=0.537:void
cosine sim=0.526:sizeof
cosine sim=0.499:>=
cosine sim=0.499:NULL
cosine sim=0.492:malloc
cosine sim=0.491:;
cosine sim=0.483:default
cosine sim=0.479:gets
cosine sim=0.450:int
cosine sim=0.447:nums
cosine sim=0.446:system
cosine sim=0.441:case


In [None]:
#net.save('data/params/word.txt')

In [99]:
# Weight matrix of the first embedding layer (net[0]).
embedding_weight=net[0].weight.data()

In [34]:
# Display the extracted weights.
embedding_weight


[[-2.86826879e-01  1.37585759e+00  5.37829876e-01 ...  8.88895154e-01
   1.62632823e+00 -3.25835109e-01]
 [ 3.35133940e-01  4.04355735e-01  5.16022980e-01 ... -3.15721005e-01
   5.05286396e-01  2.00586870e-01]
 [ 7.76588678e-01 -6.52833104e-01  1.92166150e-01 ... -1.72246709e-01
   8.80316377e-01  1.22809805e-01]
 ...
 [ 4.56892431e-01  1.08795595e+00  4.52337384e-01 ...  1.32460713e+00
   3.06503594e-01 -1.73381716e-01]
 [-2.68639885e-02  4.54093441e-02  1.17027760e-02 ... -3.05141620e-02
   6.58679232e-02  1.15022808e-02]
 [ 3.63023579e-02 -7.32243061e-05 -1.62049942e-02 ... -3.26242186e-02
  -4.65029925e-02 -5.80759011e-02]]
<NDArray 70x20 @gpu(0)>

In [8]:
def save_embedding_params(filePath,idx_to_token,embedding_weight):
    """Persist the vocabulary and the embedding weights.

    filePath: string prefix the file names are appended to verbatim, so a
        directory must be given with its trailing separator.
    idx_to_token: token list, written with np.save to
        <filePath>idx_to_token.npy.
    embedding_weight: weights, written with nd.save to
        <filePath>embedding_weight.txt.

    Returns None.
    """
    np.save(filePath+'idx_to_token.npy',np.array(idx_to_token))
    nd.save(filePath+'embedding_weight.txt',embedding_weight)
def save_idx_to_token(filePath,idx_to_token):
    """Write the token list to <filePath>idx_to_token.npy with np.save.

    filePath is used as a plain string prefix, so a directory must be given
    with its trailing separator.
    """
    target=filePath+'idx_to_token.npy'
    np.save(target,np.array(idx_to_token))

In [102]:
def read_embedding_params(filePath):
    """Load the vocabulary and embedding weights saved under `filePath`.

    filePath: the same string prefix that was used when saving; file names
        are appended to it verbatim.

    Returns (idx_to_token, token_to_idx, embedding_weight):
        idx_to_token: the NumPy array read from <filePath>idx_to_token.npy
            (an array, not a Python list).
        token_to_idx: dict rebuilt from that array, token -> position.
        embedding_weight: whatever nd.load returns for
            <filePath>embedding_weight.txt.
            NOTE(review): mxnet documents nd.load as returning a list or dict
            of NDArrays, so this may be a one-element list rather than the
            bare weight matrix - confirm before indexing it.
    """
    idx_to_token=np.load(filePath+'idx_to_token.npy')
    # The reverse mapping is not stored on disk; derive it from the order.
    token_to_idx=dict(zip(idx_to_token,range(len(idx_to_token))))
    embedding_weight=nd.load(filePath+'embedding_weight.txt')
    return idx_to_token,token_to_idx,embedding_weight

In [100]:
# Persist the vocabulary and the trained weights; the trailing '/' matters
# because the prefix is concatenated with the file names.
save_embedding_params('data/params/word2Vec/',idx_to_token,embedding_weight)

In [103]:
# Read the saved files back to check the round trip.
temp_idx_to_token,temp_token_to_idx,temp_embedding_weight=read_embedding_params('data/params/word2Vec/')

In [104]:
# Compare the reloaded vocabulary (a NumPy array) with the in-memory list.
print(temp_idx_to_token)
print(idx_to_token)

['#define' 'var' 'nums' 'enum' 'enum_name' '{' ',' '}' 'struct' 'char' '['
 ']' ';' 'float' 'int' 'struct_name' 'struct_var' '*' 'void' 'func' '('
 ')' 'printf' 'words' 'while' 'switch' 'case' ':' 'call_func' 'break'
 'fflush' 'stdin' 'gets' '=' 'if' '<' '|' '>' 'else' 'return' 'FILE'
 'fopen' 'for' 'fread' '+' 'sizeof' '!' '.' 'fclose' 'NULL' '-' 'do'
 'continue' 'strcmp' '&' 'scanf' 'malloc' 'fwrite' 'double' 'default'
 'getch' 'feof' 'system' '<=' 'typedef' '>=' 'data_type' 'free' '%' '/'
 'memset' 'static' 'goto' 'extern' '#endif' '#ifndef' 'assert' 'union'
 '#undef' '#else']
['#define', 'var', 'nums', 'enum', 'enum_name', '{', ',', '}', 'struct', 'char', '[', ']', ';', 'float', 'int', 'struct_name', 'struct_var', '*', 'void', 'func', '(', ')', 'printf', 'words', 'while', 'switch', 'case', ':', 'call_func', 'break', 'fflush', 'stdin', 'gets', '=', 'if', '<', '|', '>', 'else', 'return', 'FILE', 'fopen', 'for', 'fread', '+', 'sizeof', '!', '.', 'fclose', 'NULL', '-', 'do', 'continue'

In [105]:
# Mapping rebuilt by read_embedding_params from the reloaded vocabulary.
#temp_token_to_idx={tk:idx for idx,tk in enumerate(temp_idx_to_token)}
print(temp_token_to_idx)

{'#define': 0, 'var': 1, 'nums': 2, 'enum': 3, 'enum_name': 4, '{': 5, ',': 6, '}': 7, 'struct': 8, 'char': 9, '[': 10, ']': 11, ';': 12, 'float': 13, 'int': 14, 'struct_name': 15, 'struct_var': 16, '*': 17, 'void': 18, 'func': 19, '(': 20, ')': 21, 'printf': 22, 'words': 23, 'while': 24, 'switch': 25, 'case': 26, ':': 27, 'call_func': 28, 'break': 29, 'fflush': 30, 'stdin': 31, 'gets': 32, '=': 33, 'if': 34, '<': 35, '|': 36, '>': 37, 'else': 38, 'return': 39, 'FILE': 40, 'fopen': 41, 'for': 42, 'fread': 43, '+': 44, 'sizeof': 45, '!': 46, '.': 47, 'fclose': 48, 'NULL': 49, '-': 50, 'do': 51, 'continue': 52, 'strcmp': 53, '&': 54, 'scanf': 55, 'malloc': 56, 'fwrite': 57, 'double': 58, 'default': 59, 'getch': 60, 'feof': 61, 'system': 62, '<=': 63, 'typedef': 64, '>=': 65, 'data_type': 66, 'free': 67, '%': 68, '/': 69, 'memset': 70, 'static': 71, 'goto': 72, 'extern': 73, '#endif': 74, '#ifndef': 75, 'assert': 76, 'union': 77, '#undef': 78, '#else': 79}


In [106]:
# Original in-memory mapping, for comparison with the reloaded one.
print(token_to_idx)

{'#define': 0, 'var': 1, 'nums': 2, 'enum': 3, 'enum_name': 4, '{': 5, ',': 6, '}': 7, 'struct': 8, 'char': 9, '[': 10, ']': 11, ';': 12, 'float': 13, 'int': 14, 'struct_name': 15, 'struct_var': 16, '*': 17, 'void': 18, 'func': 19, '(': 20, ')': 21, 'printf': 22, 'words': 23, 'while': 24, 'switch': 25, 'case': 26, ':': 27, 'call_func': 28, 'break': 29, 'fflush': 30, 'stdin': 31, 'gets': 32, '=': 33, 'if': 34, '<': 35, '|': 36, '>': 37, 'else': 38, 'return': 39, 'FILE': 40, 'fopen': 41, 'for': 42, 'fread': 43, '+': 44, 'sizeof': 45, '!': 46, '.': 47, 'fclose': 48, 'NULL': 49, '-': 50, 'do': 51, 'continue': 52, 'strcmp': 53, '&': 54, 'scanf': 55, 'malloc': 56, 'fwrite': 57, 'double': 58, 'default': 59, 'getch': 60, 'feof': 61, 'system': 62, '<=': 63, 'typedef': 64, '>=': 65, 'data_type': 66, 'free': 67, '%': 68, '/': 69, 'memset': 70, 'static': 71, 'goto': 72, 'extern': 73, '#endif': 74, '#ifndef': 75, 'assert': 76, 'union': 77, '#undef': 78, '#else': 79}


In [9]:
# Write only the vocabulary file under the word2Vec parameter prefix.
save_idx_to_token('data/params/word2Vec/',idx_to_token)