In [2]:
import collections
import math
import random
import sys
import time
import os
import numpy as np

import tensorflow as tf 

sys.path.append("..")

In [6]:
path = 'D:/machine learning/Deep Learning/basic/data/ptb_train.txt'

# Read the PTB training file into memory; every line of the file is one sentence.
with open(path, 'r') as f:
    lines = list(f)

# `st` is short for "sentence"; each sentence is split on whitespace into tokens.
raw_dataset = [st.split() for st in lines]

'# sentences: %d' % len(raw_dataset)  # displays '# sentences: 42068'

'# sentences: 42068'

In [7]:
# Peek at the first three sentences: print each one's token count and its first five tokens.
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [8]:
# To keep the computation simple, keep only the tokens that occur at least
# 5 times in the dataset.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

# Vocabulary in first-seen order, plus the reverse lookup token -> index.
idx_to_token = list(counter)
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}

# Re-encode every sentence as a list of indices, silently dropping the
# tokens that were filtered out of the vocabulary above.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset]
num_tokens = sum(len(st) for st in dataset)
'# tokens: %d' % num_tokens  # displays '# tokens: 887100'

'# tokens: 887100'

In [13]:
def discard(idx):
    """Randomly decide whether one occurrence of token index `idx` is dropped.

    The drop probability is ``1 - sqrt(1e-4 / f)``, where ``f`` is the token's
    relative frequency (its count in `counter` divided by `num_tokens`), so
    frequent tokens are dropped more often. When that value is negative the
    token is never dropped.

    Reads the notebook-level `counter`, `idx_to_token` and `num_tokens`.

    Returns:
        True if this occurrence should be discarded, False otherwise.
    """
    draw = random.uniform(0, 1)
    token_count = counter[idx_to_token[idx]]
    drop_threshold = 1 - math.sqrt(1e-4 / token_count * num_tokens)
    return draw < drop_threshold

# Subsample every sentence: keep a token occurrence only when discard() says not to drop it.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]

# Token count after subsampling; the exact number changes between runs because discard() is random.
'# tokens: %d' % sum(map(len, subsampled_dataset))  # e.g. '# tokens: 376200'

'# tokens: 375279'

In [14]:
def compare_counts(token):
    """Report how often `token` occurs before and after subsampling.

    Counts occurrences of the token's index in the notebook-level `dataset`
    (before) and `subsampled_dataset` (after).

    Args:
        token: a token string that is present in `token_to_idx`.

    Returns:
        A string of the form '# <token>: before=<n>, after=<m>'.
    """
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

compare_counts('the') # e.g. '# the: before=50770, after=2013'; the "after" count varies between runs because discard() is random

'# the: before=50770, after=2101'

In [15]:
compare_counts('join') # '# join: before=45, after=45' in the recorded run: every occurrence of this token was kept

'# join: before=45, after=45'

In [16]:
# Extract center words and their context words
def get_centers_and_contexts(dataset, max_window_size):
    """Pair every token with the tokens inside a randomly sized window around it.

    Args:
        dataset: iterable of sentences, each a sequence of token indices.
        max_window_size: upper bound (inclusive) of the window radius; a radius
            is drawn uniformly from 1..max_window_size for every center word.

    Returns:
        (centers, contexts): `centers` is the flat list of center tokens and
        `contexts[i]` is the list of context tokens for `centers[i]`.
        Sentences with fewer than two tokens are skipped, because they cannot
        form a single center/context pair.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        if length < 2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            # Everything inside the window except the center word itself.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

In [17]:
# Toy dataset of two "sentences" holding 7 and 3 token indices respectively.
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
# The window radius is drawn at random from 1..2 for each center word,
# so the printed contexts differ between runs.
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2]
center 2 has contexts [1, 3]
center 3 has contexts [2, 4]
center 4 has contexts [2, 3, 5, 6]
center 5 has contexts [4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


## 负采样

In [19]:
# Center words and their context words for the whole subsampled corpus,
# using a maximum window radius of 5.
all_centers, all_contexts = get_centers_and_contexts(
    subsampled_dataset, max_window_size=5)

def get_negatives(all_contexts, sampling_weights, K):
    """Sample noise (negative) words for every list of context words.

    Args:
        all_contexts: list of context-word lists, one per center word.
        sampling_weights: sampling weight of each word index; index ``i`` is
            drawn with probability proportional to ``sampling_weights[i]``.
        K: number of noise words to draw per context word.

    Returns:
        A list parallel to `all_contexts`; entry ``i`` holds
        ``len(all_contexts[i]) * K`` word indices, none of which appears in
        ``all_contexts[i]``.

    Note:
        The loop only stops once enough non-context words have been drawn, so
        it never terminates if every word with non-zero weight is a context
        word of some entry.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set and the target size once per center word
        # instead of once per candidate inside the sampling loop.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # The candidate buffer is exhausted: draw a fresh batch of
                # word indices according to sampling_weights. Drawing many
                # at once is much cheaper than one random.choices call per
                # noise word.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Noise-word sampling weight for each vocabulary index: its corpus count raised to the power 0.75.
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
# Draw K=5 noise words per context word for every center word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [20]:
## Reading the data
def batchify(data):
    """Pad every example to a common length and return them as a tf.data.Dataset.

    Args:
        data: an iterable of (center, context, negative) triples whose items
            expose ``.numpy()``. It is iterated twice: once to find the
            longest context+negative length, once to build the padded rows.

    Returns:
        A ``tf.data.Dataset`` of (center, contexts_negatives, mask, label)
        slices, where per example:
          * center has shape (1,);
          * contexts_negatives is context + negative, zero-padded to max_len;
          * mask is 1 on real entries and 0 on padding;
          * label is 1 on context words and 0 on noise words and padding.
    """
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        ctx_ids = context.numpy().tolist()
        neg_ids = negative.numpy().tolist()
        cur_len = len(ctx_ids) + len(neg_ids)
        pad_len = max_len - cur_len
        centers.append(center.numpy().tolist())
        contexts_negatives.append(ctx_ids + neg_ids + [0] * pad_len)
        masks.append([1] * cur_len + [0] * pad_len)
        labels.append([1] * len(ctx_ids) + [0] * (max_len - len(ctx_ids)))
    center_column = tf.reshape(tf.convert_to_tensor(centers), shape=(-1, 1))
    return tf.data.Dataset.from_tensor_slices((
        center_column,
        tf.convert_to_tensor(contexts_negatives),
        tf.convert_to_tensor(masks),
        tf.convert_to_tensor(labels),
    ))

In [21]:
def generator():
    """Yield one (center, contexts, negatives) tuple per training example.

    Reads the notebook-level `all_centers`, `all_contexts` and `all_negatives`
    in lockstep.
    """
    yield from zip(all_centers, all_contexts, all_negatives)

batch_size = 512

# NOTE: from here on `dataset` is rebound to a tf.data.Dataset; it no longer
# refers to the index-encoded sentences built earlier in the notebook.
dataset = tf.data.Dataset.from_generator(
    generator=generator,
    output_types=(tf.int32, tf.int32, tf.int32))
# batchify pads every example to a common length, then the examples are
# shuffled across the whole corpus and grouped into mini-batches.
dataset = dataset.apply(batchify)
dataset = dataset.shuffle(len(all_centers))
dataset = dataset.batch(batch_size)

# Print the shape of each field of the first mini-batch only.
for batch in dataset:
    for name, data in zip(
            ['centers', 'contexts_negatives', 'masks', 'labels'], batch):
        print(name, 'shape:', data.shape)
    break

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


KeyboardInterrupt: 

## 跳字模型

In [22]:
# Embedding layer: a lookup table with 20 rows (input_dim) of 4-dimensional vectors (output_dim).
embed = tf.keras.layers.Embedding(input_dim=20, output_dim=4)
# Build the layer explicitly so its weights can be inspected before it is ever called.
embed.build(input_shape=(1,20))
# Last expression of the cell: displays the layer's weights.
embed.get_weights()

[array([[-0.02507738, -0.01369326,  0.01640219, -0.00742242],
        [ 0.03018421,  0.04063046, -0.00604192,  0.03191039],
        [ 0.04554789,  0.04168626,  0.01220803,  0.02346456],
        [ 0.01055904, -0.02125403, -0.03358085, -0.01004518],
        [-0.00559691,  0.03274361, -0.02518579, -0.0050643 ],
        [ 0.01338284,  0.03316281, -0.0087778 ,  0.02121096],
        [ 0.01834339, -0.04635412,  0.03149806, -0.04572862],
        [-0.00384893, -0.02024952,  0.01004634, -0.0406546 ],
        [-0.02283722, -0.0446862 , -0.01199006, -0.02620161],
        [-0.0326875 ,  0.01253093, -0.03732591,  0.00347003],
        [-0.03857158, -0.04178375, -0.04371009,  0.00623209],
        [ 0.03880937, -0.00146786, -0.03287393,  0.02225739],
        [ 0.02983563, -0.00144194,  0.00422386, -0.01547488],
        [ 0.03517422, -0.03790203,  0.00538973, -0.02753559],
        [ 0.04717023,  0.01509229, -0.01029483, -0.03325013],
        [ 0.03433505,  0.03404799,  0.00240324, -0.04189185],
        

In [23]:
# An embedding lookup takes integer row indices, so build the index tensor
# with an integer dtype rather than float32.
x = tf.convert_to_tensor([[1, 2, 3], [4, 5, 6]], dtype=tf.int32)
embed(x)

# Batched matrix multiplication: (2, 1, 4) @ (2, 4, 6) multiplies the two
# trailing matrix dimensions independently for each of the 2 batch entries.
X = tf.ones((2, 1, 4))
Y = tf.ones((2, 4, 6))
tf.matmul(X, Y).shape

InternalError: Blas xGEMMBatched launch failed : a.shape=[2,1,4], b.shape=[2,4,6], m=1, n=6, k=4, batch_size=2 [Op:BatchMatMulV2] name: MatMul/

In [None]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Skip-gram forward pass: score each candidate word against its center word.

    Args:
        center: indices of the center words.
            (assumes shape (batch, 1), as produced by batchify - TODO confirm at the call site)
        contexts_and_negatives: indices of the context and noise words.
            (assumes shape (batch, max_len) - TODO confirm at the call site)
        embed_v: callable mapping `center` to center-word vectors.
        embed_u: callable mapping `contexts_and_negatives` to candidate vectors.

    Returns:
        The batched product of the center vectors with the transposed
        candidate vectors (the last two axes of the candidate tensor are swapped).
    """
    center_vecs = embed_v(center)
    candidate_vecs = embed_u(contexts_and_negatives)
    candidate_vecs_t = tf.transpose(candidate_vecs, perm=[0, 2, 1])
    return tf.matmul(center_vecs, candidate_vecs_t)

class SigmoidBinaryCrossEntropyLoss(tf.keras.losses.Loss):
    """Sigmoid binary cross-entropy with an optional per-element mask.

    Calling the instance bypasses the base-class reduction logic: the
    element-wise loss is multiplied by the mask and averaged over axis 1,
    giving one value per row.

    NOTE: despite the parameter names, the call order is
    ``loss(labels, logits, mask)``: `inputs` is used as the labels and
    `targets` as the logits.
    """

    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def __call__(self, inputs, targets, mask=None):
        """Compute the masked loss per row.

        Args:
            inputs: the 0/1 labels (cast to float32).
            targets: the raw prediction scores / logits (cast to float32).
            mask: optional tensor with the same shape; positions where it is
                0 contribute nothing to the loss. ``None`` means every
                position counts.

        Returns:
            The mean of the (masked) element-wise loss over axis 1. Masked
            positions still count in the denominator of that mean.
        """
        inputs = tf.cast(inputs, dtype=tf.float32)
        targets = tf.cast(targets, dtype=tf.float32)
        # Pass labels/logits by keyword so the argument roles are explicit and
        # do not depend on the positional order of the TensorFlow function.
        res = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=inputs, logits=targets)
        # Using tf.nn.weighted_cross_entropy_with_logits to apply the mask did
        # not have the intended effect, so the loss is multiplied element-wise
        # by the mask instead: positions where the mask is 0 incur no loss.
        if mask is not None:
            res = res * tf.cast(mask, dtype=tf.float32)
        return tf.reduce_mean(res, axis=1)

loss = SigmoidBinaryCrossEntropyLoss()

# Example prediction scores: two rows of four candidate words each.
pred = tf.convert_to_tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]],dtype=tf.float32)
# In the label variable, 1 marks a context word and 0 marks a noise word
label = tf.convert_to_tensor([[1, 0, 0, 0], [1, 1, 0, 0]],dtype=tf.float32)
mask = tf.convert_to_tensor([[1, 1, 1, 1], [1, 1, 1, 0]],dtype=tf.float32)  # mask variable: 0 marks a position to ignore
# The loss averages over every column (masked ones included), so rescale each row by
# (row width / number of unmasked entries) to get the mean over unmasked positions only.
loss(label, pred, mask) * mask.shape[1] / tf.reduce_sum(mask,axis=1)