### W2V 基于TensorFlow的基础实现

#### 准备阶段

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from tensorflow.contrib.tensorboard.plugins import projector




# Give a folder path as an argument with '--log_dir' to save
# TensorBoard summaries. Default is a 'log' folder next to the launched script.

# Directory that contains the launched script (taken from sys.argv[0]).
current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'log'),
    help='The log directory for TensorBoard summaries.')
# parse_known_args collects unrecognised command-line arguments in `unparsed`
# instead of aborting on them.
FLAGS, unparsed = parser.parse_known_args()

# Create the directory for TensorBoard variables if there is not.
if not os.path.exists(FLAGS.log_dir):
  os.makedirs(FLAGS.log_dir)


#### 读取数据

In [8]:
# Step 1: Download the data.
# Base URL; maybe_download() appends the requested file name to it.
url = 'http://mattmahoney.net/dc/'


# pylint: disable=redefined-outer-name
def maybe_download(filename, expected_bytes):
    """Return the local path of `filename`, fetching it first if it is absent.

    The file is looked up in the system temp directory and retrieved from
    `url + filename` when it is not there yet. If its size on disk differs
    from `expected_bytes`, the actual size is printed and an Exception is
    raised.
    """
    target = os.path.join(gettempdir(), filename)
    if not os.path.exists(target):
        # urlretrieve returns (local path, headers); only the path is needed.
        target, _ = urllib.request.urlretrieve(url + filename, target)

    actual_bytes = os.stat(target).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception('Failed to verify ' + target +
                        '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return target


# filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated words of the first member of a zip file."""
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Convert the raw bytes to a native string before splitting on whitespace.
    text = tf.compat.as_str(raw_bytes)
    return text.split()


# vocabulary = read_data(filename)
# `vocabulary` is a list whose items are the individual, already split words.
#print('Data size', len(vocabulary))
#print ("Data Samples",vocabulary[:20])

# A small hard-coded word list stands in for the downloaded corpus; the
# maybe_download()/read_data() calls above are commented out.
vocabulary = [
    'anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first',
    'used', 'against', 'early', 'working', 'class', 'radicals', 'including',
    'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the',
    'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the',
    'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to',
    'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to',
    'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also',
    'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self',
    'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived',
    'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king',
    'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief',
    'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished',
    'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what',
    'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 'social',
    'movements', 'that', 'advocate', 'the', 'elimination', 'of',
    'authoritarian', 'institutions', 'particularly', 'the', 'state', 'the',
    'word', 'anarchy', 'as', 'most', 'anarchists', 'use', 'it', 'does', 'not',
    'imply', 'chaos', 'nihilism', 'or', 'anomie', 'but', 'rather', 'a',
    'harmonious', 'anti', 'authoritarian', 'society', 'in', 'place', 'of',
    'what', 'are', 'regarded', 'as', 'authoritarian', 'political',
    'structures', 'and', 'coercive', 'economic', 'institutions', 'anarchists',
    'advocate', 'social', 'relations', 'based', 'upon', 'voluntary',
    'association', 'of', 'autonomous', 'individuals', 'mutual', 'aid', 'and',
    'self', 'governance', 'while', 'anarchism', 'is', 'most', 'easily',
    'defined', 'by', 'what', 'it', 'is', 'against', 'anarchists', 'also',
    'offer', 'positive', 'visions', 'of', 'what', 'they', 'believe', 'to',
    'be', 'a', 'truly', 'free', 'society'
]

#### 建立词典

In [12]:
# Step 2: Build the dictionary and replace rare words with UNK token.
# Number of dictionary entries including the UNK slot: build_dataset() keeps
# the (vocabulary_size - 1) most frequent words and maps the rest to UNK.
vocabulary_size = 50


def build_dataset(words, n_words):
    """Encode `words` as integer ids over the most frequent vocabulary.

    The `n_words - 1` most common words receive ids 1, 2, ... in order of
    decreasing frequency; id 0 is the 'UNK' bucket for every other word.

    Returns a 4-tuple:
      data: list with the id of each item of `words`, in order.
      count: list of word/frequency pairs, starting with ['UNK', <number of
        words that were encoded as 0>].
      dictionary: dict mapping word -> id.
      reversed_dictionary: dict mapping id -> word.
    """
    # Slot 0 is reserved for UNK; its tally is only known after encoding.
    count = [['UNK', -1]]
    frequent = collections.Counter(words).most_common(n_words - 1)
    count.extend(frequent)

    # Ids are handed out in the order the entries appear in `count`.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    print("构建的常用词的词典：","\n", dictionary)

    # Words missing from the dictionary fall back to id 0 (UNK).
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    # Invert the word -> id mapping.
    reversed_dictionary = {code: token for token, code in dictionary.items()}
    return data, count, dictionary, reversed_dictionary


# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - map of words(strings) to count of occurrences
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)

#del vocabulary  # Frees memory on large corpora; can be dropped for small data.
# NOTE(review): the next two lines print the same slice under different labels.
print('Sample count', count[:5])
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Read cursor into `data`; generate_batch() advances it between calls.
data_index = 0

构建的常用词的词典： 
 {'UNK': 0, 'the': 1, 'of': 2, 'a': 3, 'anarchism': 4, 'as': 5, 'is': 6, 'and': 7, 'to': 8, 'anarchists': 9, 'what': 10, 'used': 11, 'that': 12, 'society': 13, 'it': 14, 'also': 15, 'are': 16, 'authoritarian': 17, 'term': 18, 'against': 19, 'revolution': 20, 'in': 21, 'means': 22, 'positive': 23, 'by': 24, 'self': 25, 'defined': 26, 'word': 27, 'political': 28, 'be': 29, 'social': 30, 'advocate': 31, 'institutions': 32, 'most': 33, 'originated': 34, 'abuse': 35, 'first': 36, 'early': 37, 'working': 38, 'class': 39, 'radicals': 40, 'including': 41, 'diggers': 42, 'english': 43, 'sans': 44, 'culottes': 45, 'french': 46, 'whilst': 47, 'still': 48, 'pejorative': 49}
Sample count [['UNK', 73], ('the', 12), ('of', 9), ('a', 6), ('anarchism', 5)]
Most common words (+UNK) [['UNK', 73], ('the', 12), ('of', 9), ('a', 6), ('anarchism', 5)]
Sample data [4, 34, 5, 3, 18, 2, 35, 36, 11, 19] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


#### 创建BATCH

In [13]:
# The batches below are built with the skip-gram scheme.

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram batch from the module-level `data` sequence.

    A window of `2 * skip_window + 1` ids slides over `data`; the id in the
    middle of the window is the input and `num_skips` randomly chosen other
    positions of the window supply its labels.

    Args:
      batch_size: number of (input, label) pairs; must be a multiple of
        `num_skips`.
      num_skips: pairs generated per window; at most `2 * skip_window`.
      skip_window: number of positions taken on each side of the center.

    Returns:
      (batch, labels): int32 arrays of shape (batch_size,) and
      (batch_size, 1).

    Side effects:
      Reads and updates the module-level cursor `data_index`.
    """
    global data_index

    # Sanity checks on the arguments (raise AssertionError when violated).
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    # Uninitialised output arrays; every row is filled in the loop below.
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    # Fixed-length deque: appending on the right drops the leftmost item.
    window = collections.deque(maxlen=span)

    # Restart from the beginning when a full window no longer fits.
    if data_index + span > len(data):
        data_index = 0
    window.extend(data[data_index:data_index + span])
    data_index += span

    # Window positions other than the center; the same for every window.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    row = 0
    for _ in range(batch_size // num_skips):
        for pos in random.sample(context_positions, num_skips):
            batch[row] = window[skip_window]
            labels[row, 0] = window[pos]
            row += 1
        if data_index != len(data):
            # Slide the window one position to the right.
            window.append(data[data_index])
            data_index += 1
        else:
            # Reached the end of the data: refill from the start.
            window.extend(data[0:span])
            data_index = span

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


# Generate a small demo batch and print each (input -> label) pair as both ids
# and words. num_skips is the number of labels drawn per input word.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
          reverse_dictionary[labels[i, 0]])

34 originated -> 5 as
34 originated -> 4 anarchism
5 as -> 34 originated
5 as -> 3 a
3 a -> 18 term
3 a -> 5 as
18 term -> 2 of
18 term -> 3 a


In [16]:
# Show the raw id arrays returned by generate_batch() for the demo batch.
print (batch)
print (labels)

[34 34  5  5  3  3 18 18]
[[ 5]
 [ 4]
 [34]
 [ 3]
 [18]
 [ 5]
 [ 2]
 [ 3]]


#### 创建模型

In [17]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
# Number of negative examples to sample. Capped below the vocabulary size:
# the candidate sampler behind tf.nn.nce_loss draws unique classes by default
# and cannot return more classes than exist.
num_sampled = min(64, vocabulary_size - 1)

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
# Only pick dev samples in the head of the distribution. Capped at the
# vocabulary size so every sampled id has an embedding row and an entry in
# reverse_dictionary.
valid_window = min(100, vocabulary_size)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():

    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):

            # Embedding matrix of shape [vocabulary_size, embedding_size],
            # initialised uniformly in [-1, 1).
            embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                  1.0))

            # Rows of `embeddings` selected by the word ids of the input
            # batch; trained word vectors are read out the same way.
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            # Output weights, initialised from a truncated normal distribution.
            nce_weights = tf.Variable(
                tf.truncated_normal(
                    [vocabulary_size, embedding_size],
                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

    # Loss function: NCE (noise-contrastive estimation), averaged over the
    # batch.
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=train_labels,
                inputs=embed,
                num_sampled=num_sampled,
                num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary, so that its decrease can be
    # followed in TensorBoard.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    # L2 norm of every embedding row (the denominator of the normalisation).
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm  # Unit-length rows.
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)

    print("normalized_embeddings:",normalized_embeddings)
    print("valid_embeddings:",valid_embeddings)

    # Dot products of unit vectors: one row of similarities to the whole
    # vocabulary per validation word.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    print("similarity:",similarity)
    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    # NOTE(review): the original comment says the final vector of a word is its
    # row in the saved weight matrix -- presumably `embeddings`; confirm.
    saver = tf.train.Saver()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
normalized_embeddings: Tensor("truediv:0", shape=(50, 128), dtype=float32)
valid_embeddings: Tensor("embedding_lookup:0", shape=(16, 128), dtype=float32)
similarity: Tensor("MatMul:0", shape=(16, 50), dtype=float32)


#### 训练模型

In [None]:
# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    # Running sum of the loss since the last report.
    average_loss = 0
    for step in xrange(num_steps):

        # Generate one batch of training samples.
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                    skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        # It is handed to session.run() and, for the last step, to the writer.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val = session.run(
            [optimizer, merged, loss],
            feed_dict=feed_dict,
            run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            # At step 0 the sum holds a single loss value, so it is not divided.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()

            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Sort by decreasing similarity; position 0 is skipped --
                # presumably the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    # One word per line, in id order.
    with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(reverse_dictionary[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)

writer.close()

#### 数据可视化

In [None]:
# NOTE(review): this cell sits under a "data visualization" heading but its
# code repeats the training cell verbatim; running it re-initialises the
# variables (init.run()) and trains again -- confirm whether that is intended.
# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    # Running sum of the loss since the last report.
    average_loss = 0
    for step in xrange(num_steps):

        # Generate one batch of training samples.
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                    skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        # It is handed to session.run() and, for the last step, to the writer.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val = session.run(
            [optimizer, merged, loss],
            feed_dict=feed_dict,
            run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            # At step 0 the sum holds a single loss value, so it is not divided.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()

            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Sort by decreasing similarity; position 0 is skipped --
                # presumably the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    # One word per line, in id order.
    with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(reverse_dictionary[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)

writer.close()