In [35]:
import tensorflow as tf
import os
import sklearn.datasets
import numpy as np
import re
import collections
import random
from sklearn import metrics
import jieba
import sklearn

In [33]:
# ipynb多行输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [37]:
# 用于输出依赖包对应的版本
jieba.__version__
tf.__version__
sklearn.__version__
np.__version__
# '0.39'
# '1.9.0'
# '0.21.2'
# '1.16.2'

'0.39'

'1.9.0'

'0.21.2'

'1.16.2'

In [2]:
# 写入停用词
with open(r'stopwords.txt','r') as f:
    english_stopwords = f.read().split('\n')

In [4]:
def separate_dataset(trainset, ratio = 0.5):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        # 提取每一条文本数据，并过滤None值行文本；
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        # 打乱某类样本顺序；
        data_ = random.sample(data_, int(len(data_) * ratio))
        # 去除停用词
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        # 提取所有的词
        datastring += data_
        # 为每一个样本补上标签
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

def clearstring(string):
    # 清洗样本，并去停用词
    # 去除非中文字符
    string = re.sub(r'^[\u4e00-\u9fa5a-zA-Z0-9]', '', string)
    string = list(jieba.cut(string, cut_all=False))
    string = filter(None, string)
    string = [y.strip() for y in string if y.strip() not in english_stopwords]
    string = ' '.join(string)
    return string.lower()


def str_idx(corpus, dic, maxlen, UNK = 3):
    # 词典索引
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X


In [5]:
trainset = sklearn.datasets.load_files(container_path = 'dataset', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset,1.0)
print(trainset.target_names)
print(len(trainset.data))
print(len(trainset.target))

['.ipynb_checkpoints', 'Chinese']
14342
14342


In [7]:
import collections

# 构建词语词典
def build_dataset(words, n_words, atleast=1):
    # 四种填充词
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    # 过滤那些只有一个字的字符
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    # 构建词的索引
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    for word in words:
        # 如果字典中没有出现的词，用unk表示
        index = dictionary.get(word, 3)
        data.append(index)
    # 翻转字典
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reversed_dictionary

In [8]:
split = (' '.join(trainset.data)).split()
# 去重后的所有单词集合，组成词典
vocabulary_size = len(list(set(split)))
# data为所有词的词索引，词典，反向词典
data, dictionary, rev_dictionary = build_dataset(split, vocabulary_size)

In [9]:
len(dictionary)

45533

In [10]:
# 构建字符级词典
def build_char_dataset(words):
    # 四种填充词
    count = []
    dictionary = dict()
    # 构建词的索引
    for word in words:
        dictionary[word] = len(dictionary)
    data = list()
    for word in words:
        # 如果字典中没有出现的词，用unk表示
        index = dictionary.get(word, 3)
        data.append(index)
    # 翻转字典
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, dictionary, reversed_dictionary

In [11]:
# 构建所有中文汉字的字符级别的字典,3912个汉字
char_split = list(set(list(''.join(trainset.data))))
length = len(char_split)
char_data, char_dictionary, char_rev_dictionary = build_char_dataset(char_split)

In [16]:
# 给中文词语编码
class Vocabulary:
    def __init__(self, dictionary, rev_dictionary):
        self._dictionary = dictionary
        self._rev_dictionary = rev_dictionary
    
    # 起始符
    @property
    def start_string(self):
        return self._dictionary['GO']
    
    # 结束符
    @property
    def end_string(self):
        return self._dictionary['EOS']
    
    # 未知单词
    @property
    def unk(self):
        return self._dictionary['UNK']

    @property
    def size(self):
        return len(self._dictionary)
    
    # 查询词语的数值索引
    def word_to_id(self, word):
        return self._dictionary.get(word, self.unk)
    
    # 通过索引反查词语
    def id_to_word(self, cur_id):
        return self._rev_dictionary.get(cur_id, self._rev_dictionary[3])
    
    # 将数字索引解码成字符串并拼接起来
    def decode(self, cur_ids):
        return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
    
    # 将字符串编码成数字索引
    def encode(self, sentence, reverse = False, split = True):

        if split:
            sentence = sentence.split()
        # 将文本转化为数字索引
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence]
        
        # 为所有的文本加上起始符和结束符，双向编码都支持
        if reverse:
            return np.array(
                [self.end_string] + word_ids + [self.start_string],
                dtype = np.int32,
            )
        else:
            return np.array(
                [self.start_string] + word_ids + [self.end_string],
                dtype = np.int32,
            )


In [17]:
# 英文，数字，字符用自制转码，中文字符，从0开始进行编码,考虑到中文短语的长度一般不超过8，故可取max_length=10
class UnicodeCharsVocabulary(Vocabulary):
    def __init__(self, dictionary, rev_dictionary,char_dictionary, char_rev_dictionary, max_word_length, **kwargs):
        super(UnicodeCharsVocabulary, self).__init__(
            dictionary, rev_dictionary, **kwargs
        )
        # 最大单词长度
        self._max_word_length = max_word_length
        self._char_dictionary = char_dictionary
        self._char_rev_dictionary = char_rev_dictionary
        self.bos_char = 3912
        self.eos_char = 3913
        self.bow_char = 3914
        self.eow_char = 3915
        self.pad_char = 3916
        self.unk_char = 3917
        # 单词的数量
        num_words = self.size
        
        # 构建字符级别的词典表，[num_words,max_word_length]
        self._word_char_ids = np.zeros(
            [num_words, max_word_length], dtype = np.int32
        )
        
        # 构建bos和eos的mask，初始化一个_max_word_length的张量，全部用3916填充，第一个字符位用3914，第三个字符位用3915，
        # 第二个字符作为输入进行传入
        def _make_bos_eos(c):
            r = np.zeros([self._max_word_length], dtype = np.int32)
            r[:] = self.pad_char
            r[0] = self.bow_char
            r[1] = c
            r[2] = self.eow_char
            return r
        
        # 张量化
        self.bos_chars = _make_bos_eos(self.bos_char)
        self.eos_chars = _make_bos_eos(self.eos_char)
        
        # 遍历字典中的每个单词，并将每个单词都进行字符级别的编码
        for i, word in enumerate(self._dictionary.keys()):
            self._word_char_ids[i] = self._convert_word_to_char_ids(word)
        # 对于起始符GO和结束符EOS进行编码
        self._word_char_ids[self.start_string] = self.bos_chars
        self._word_char_ids[self.end_string] = self.eos_chars

    @property
    def word_char_ids(self):
        return self._word_char_ids

    @property
    def max_word_length(self):
        return self._max_word_length
    
    # 将单词转化为字符级别的索引
    def _convert_word_to_char_ids(self, word):
        # 对输入的单词进行张量化，用3916，pad进行全数填充
        code = np.zeros([self.max_word_length], dtype = np.int32)
        code[:] = self.pad_char
        # 截取maxlen-2个字符,并将所有字符转化为自定义字符集

        word_encoded = [self._char_dictionary.get(item,self.unk_char) for item in list(word)][:(self.max_word_length - 2)]
        # 第一个字符位为3914
        code[0] = self.bow_char
        # 遍历单词的每一个字符,k从1开始
        for k, chr_id in enumerate(word_encoded, start = 1):
            code[k] = chr_id
        # 在单词的末尾补充一个单词末尾结束符3915
        code[len(word_encoded) + 1] = self.eow_char
        return code
    
    # 将词语转化为自定义字符编码
    def word_to_char_ids(self, word):
        if word in self._dictionary:
            return self._word_char_ids[self._dictionary[word]]
        else:
            return self._convert_word_to_char_ids(word)
        
    # 将句子转化为自定义字符编码
    def encode_chars(self, sentence, reverse = False, split = True):
        if split:
            sentence = sentence.split()
        chars_ids = [self.word_to_char_ids(cur_word) for cur_word in sentence]

        if reverse:
            return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
        else:
            return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])


def _get_batch(generator, batch_size, num_steps, max_word_length):
    # generator: 生成器
    # batch_size: 每个批次的字符串的数量
    # num_steps: 窗口大小
    # max_word_length: 最大单词长度，一般设置为50
    # 初始化batch_size个字符串
    cur_stream = [None] * batch_size

    no_more_data = False
    while True:
        # 初始化词语矩阵[batch_size,num_steps]
        inputs = np.zeros([batch_size, num_steps], np.int32)
        # 初始化字符级矩阵
        if max_word_length is not None:
            char_inputs = np.zeros(
                [batch_size, num_steps, max_word_length], np.int32
            )
        else:
            char_inputs = None
        # 初始化预测词语的矩阵[batch_size,num_steps]
        targets = np.zeros([batch_size, num_steps], np.int32)
        for i in range(batch_size):
            cur_pos = 0
            while cur_pos < num_steps:
                if cur_stream[i] is None or len(cur_stream[i][0]) <= 1:
                    try:
                        # 每一步都获取词索引，字符集编码器
                        cur_stream[i] = list(next(generator))
                    except StopIteration:
                        no_more_data = True
                        break
                # how_many 取当前总num_steps与文本词向量数量的较小值，累加
                how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos)
                next_pos = cur_pos + how_many
                
                # 赋值输入对应的词索引范围和字符级别索引范围
                inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many]
                if max_word_length is not None:
                    char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][
                        :how_many
                    ]
                # targets 我们的目标是预测下一个词来优化emlo，所以我们以向右滑动的1个词作为target，作为预测对象
                targets[i, cur_pos:next_pos] = cur_stream[i][0][
                    1 : how_many + 1
                ]

                cur_pos = next_pos
                
                # 处理完之前那段，重新处理下一段，每段的长度取决于howmany,这里是window的宽度。
                cur_stream[i][0] = cur_stream[i][0][how_many:]
                if max_word_length is not None:
                    cur_stream[i][1] = cur_stream[i][1][how_many:]

        if no_more_data:
            break

        X = {
            'token_ids': inputs,
            'tokens_characters': char_inputs,
            'next_token_id': targets,
        }

        yield X


class LMDataset:
    def __init__(self, string, vocab, reverse = False):
        self._vocab = vocab
        self._string = string
        self._reverse = reverse
        self._use_char_inputs = hasattr(vocab, 'encode_chars')
        self._i = 0
        # 总文本的数量
        self._nids = len(self._string)

    def _load_string(self, string):
        if self._reverse:
            string = string.split()
            string.reverse()
            string = ' '.join(string)
        
        # 将一段文本解析成词索引，会在起始和末尾增加一个标志位
        ids = self._vocab.encode(string, self._reverse)
        
        # 将一段文本解析成字符级编码
        if self._use_char_inputs:
            chars_ids = self._vocab.encode_chars(string, self._reverse)
        else:
            chars_ids = None
        # 返回由词索引和字符集编码的元组
        return list(zip([ids], [chars_ids]))[0]
    
    # 生成器，循环生成每个样本的词索引和字符编码
    def get_sentence(self):
        while True:
            if self._i == self._nids:
                self._i = 0
            ret = self._load_string(self._string[self._i])
            self._i += 1
            yield ret

    @property
    def max_word_length(self):
        if self._use_char_inputs:
            return self._vocab.max_word_length
        else:
            return None
        
    # batch生成器，每次只拿batch_size个数据，要多少数据就即时处理多少数据
    def iter_batches(self, batch_size, num_steps):
        for X in _get_batch(
            self.get_sentence(), batch_size, num_steps, self.max_word_length
        ):
            yield X

    @property
    def vocab(self):
        return self._vocab

# 双向编码
class BidirectionalLMDataset:
    def __init__(self, string, vocab):
        # 正向编码和反向编码
        self._data_forward = LMDataset(string, vocab, reverse = False)
        self._data_reverse = LMDataset(string, vocab, reverse = True)

    def iter_batches(self, batch_size, num_steps):
        max_word_length = self._data_forward.max_word_length

        for X, Xr in zip(
            _get_batch(
                self._data_forward.get_sentence(),
                batch_size,
                num_steps,
                max_word_length,
            ),
            _get_batch(
                self._data_reverse.get_sentence(),
                batch_size,
                num_steps,
                max_word_length,
            ),
        ):
            # 拼接成一个6个item的字典，前三个为正向，后三个为反向
            for k, v in Xr.items():
                X[k + '_reverse'] = v

            yield X

In [18]:
# maxlens=10，很明显没有超过8的词语，其中两个用来做填充符
uni = UnicodeCharsVocabulary(dictionary, rev_dictionary,char_dictionary,char_rev_dictionary, 10)

In [21]:
bi = BidirectionalLMDataset(trainset.data, uni)

In [23]:
# 每次只输入16个样本数据
batch_size = 16
# 训练用的词典大小
n_train_tokens = len(dictionary)
# 语言模型参数配置项
options = {
    # 开启双向编码机制
    'bidirectional': True,
    # 字符级别的CNN，字符级别词嵌入128维，一共配置7种类型的滤波器，每个词最大长度为50，编码的有效数量为3918个，设置两条高速通道
    'char_cnn': {
        'activation': 'relu',
        'embedding': {'dim': 128},
        'filters': [
            [1, 32],
            [2, 32],
            [3, 64],
            [4, 128],
            [5, 256],
            [6, 512],
            [7, 1024],
        ],
        'max_characters_per_token': 10,
        'n_characters': 3918,
        'n_highway': 2,
    },
    # 随机失活率设置为0.1
    'dropout': 0.1,
    # lstm单元，设置三层，嵌入维度为512维
    'lstm': {
        # 截断值
        'cell_clip': 3,
        'dim': 512,
        'n_layers': 2,
        'projection_dim': 256,
        # 裁剪到[-3,3]之间
        'proj_clip': 3,
        'use_skip_connections': True,
    },
    # 一共迭代100轮
    'n_epochs': 100,
    # 训练词典的大小
    'n_train_tokens': n_train_tokens,
    # 每个batch的大小
    'batch_size': batch_size,
    # 所有词的数量
    'n_tokens_vocab': uni.size,
    # 推断区间为20
    'unroll_steps': 20,
    'n_negative_samples_batch': 0.001,
    'sample_softmax': True,
    'share_embedding_softmax': False,
}

In [24]:
# 构建ELMO语言模型
class LanguageModel:
    def __init__(self, options, is_training):
        self.options = options
        self.is_training = is_training
        self.bidirectional = options.get('bidirectional', False)

        self.char_inputs = 'char_cnn' in self.options

        self.share_embedding_softmax = options.get(
            'share_embedding_softmax', False
        )
        if self.char_inputs and self.share_embedding_softmax:
            raise ValueError(
                'Sharing softmax and embedding weights requires ' 'word input'
            )

        self.sample_softmax = options.get('sample_softmax', False)
        # 建立模型
        self._build()
        # 配置学习率
        lr = options.get('learning_rate', 0.2)
        # 配置优化器
        self.optimizer = tf.train.AdagradOptimizer(
            learning_rate = lr, initial_accumulator_value = 1.0
        ).minimize(self.total_loss)

    def _build_word_embeddings(self):
        # 建立词嵌入
        # 加载所有的词
        n_tokens_vocab = self.options['n_tokens_vocab']
        batch_size = self.options['batch_size']
        # 上下文推断的窗口大小，这里关联20个单词
        unroll_steps = self.options['unroll_steps']
        # 词嵌入维度128
        projection_dim = self.options['lstm']['projection_dim']
        # 词索引
        self.token_ids = tf.placeholder(
            tf.int32, shape = (None, unroll_steps), name = 'token_ids'
        )
        self.batch_size = tf.shape(self.token_ids)[0]
        with tf.device('/cpu:0'):
            
            # 对单词进行256维的单词编码，初始化数据服从(-1,1)的正态分布
            self.embedding_weights = tf.get_variable(
                'embedding',
                [n_tokens_vocab, projection_dim],
                dtype = tf.float32,
                initializer = tf.random_uniform_initializer(-1.0, 1.0),
            )
            # 20个词对应的词嵌入
            self.embedding = tf.nn.embedding_lookup(
                self.embedding_weights, self.token_ids
            )
        
        # 启用双向编码机制
        if self.bidirectional:
            self.token_ids_reverse = tf.placeholder(
                tf.int32,
                shape = (None, unroll_steps),
                name = 'token_ids_reverse',
            )
            with tf.device('/cpu:0'):
                self.embedding_reverse = tf.nn.embedding_lookup(
                    self.embedding_weights, self.token_ids_reverse
                )

    def _build_word_char_embeddings(self):

        batch_size = self.options['batch_size']
        unroll_steps = self.options['unroll_steps']
        projection_dim = self.options['lstm']['projection_dim']

        cnn_options = self.options['char_cnn']
        filters = cnn_options['filters']
        # 求和所有的滤波器数量
        n_filters = sum(f[1] for f in filters)
        # 最大单词字符长度
        max_chars = cnn_options['max_characters_per_token']
        # 字符级别嵌入维度，128
        char_embed_dim = cnn_options['embedding']['dim']
        # 所有字符的类型，一共261种
        n_chars = cnn_options['n_characters']
        
        # 配置激活函数
        if cnn_options['activation'] == 'tanh':
            activation = tf.nn.tanh
        elif cnn_options['activation'] == 'relu':
            activation = tf.nn.relu
        
        # [batch_size,unroll_steps,max_chars]
        self.tokens_characters = tf.placeholder(
            tf.int32,
            shape = (None, unroll_steps, max_chars),
            name = 'tokens_characters',
        )
        self.batch_size = tf.shape(self.tokens_characters)[0]
        with tf.device('/cpu:0'):
            # 字符级别词嵌入，嵌入维度128维
            self.embedding_weights = tf.get_variable(
                'char_embed',
                [n_chars, char_embed_dim],
                dtype = tf.float32,
                initializer = tf.random_uniform_initializer(-1.0, 1.0),
            )
            self.char_embedding = tf.nn.embedding_lookup(
                self.embedding_weights, self.tokens_characters
            )

            if self.bidirectional:
                self.tokens_characters_reverse = tf.placeholder(
                    tf.int32,
                    shape = (None, unroll_steps, max_chars),
                    name = 'tokens_characters_reverse',
                )
                self.char_embedding_reverse = tf.nn.embedding_lookup(
                    self.embedding_weights, self.tokens_characters_reverse
                )
                
        # 构建卷积层网络，用于字符级别的CNN卷积
        def make_convolutions(inp, reuse):
            with tf.variable_scope('CNN', reuse = reuse) as scope:
                convolutions = []
                # 这里构建7层卷积网络
                for i, (width, num) in enumerate(filters):
                    if cnn_options['activation'] == 'relu':
                        w_init = tf.random_uniform_initializer(
                            minval = -0.05, maxval = 0.05
                        )
                    elif cnn_options['activation'] == 'tanh':
                        w_init = tf.random_normal_initializer(
                            mean = 0.0,
                            stddev = np.sqrt(1.0 / (width * char_embed_dim)),
                        )
                    w = tf.get_variable(
                        'W_cnn_%s' % i,
                        [1, width, char_embed_dim, num],
                        initializer = w_init,
                        dtype = tf.float32,
                    )
                    b = tf.get_variable(
                        'b_cnn_%s' % i,
                        [num],
                        dtype = tf.float32,
                        initializer = tf.constant_initializer(0.0),
                    )
                    # 卷积，uroll_nums,characters_nums采用1*1，1*2，...，1*7的卷积核，采用valid卷积策略；
                    # width上，(uroll_nums-1/1)+1=uroll_nums
                    # height上，(characters_nums-7/1)+1，捕捉词与词之间的相关性
                    conv = (
                        tf.nn.conv2d(
                            inp, w, strides = [1, 1, 1, 1], padding = 'VALID'
                        )
                        + b
                    )
                    # 最大池化，每个词的字符编码
                    conv = tf.nn.max_pool(
                        conv,
                        [1, 1, max_chars - width + 1, 1],
                        [1, 1, 1, 1],
                        'VALID',
                    )
                    conv = activation(conv)
                    # 删除第三维，输入为[batch_size,uroll_nums,1,nums]
                    # 输出为[batch_size,uroll_nums,nums]
                    conv = tf.squeeze(conv, squeeze_dims = [2])
                    
                    # 收集每个卷积层，并进行拼接
                    convolutions.append(conv)

            return tf.concat(convolutions, 2)

        reuse = tf.get_variable_scope().reuse
        # inp [batch_size,uroll_nums,characters_nums,embedding_size]
        embedding = make_convolutions(self.char_embedding, reuse)
        # [batch_size,20,2048] #经过验证无误
        # 增加一维[1,batch_size,uroll_nums,nums++]
        self.token_embedding_layers = [embedding]
        if self.bidirectional:
            embedding_reverse = make_convolutions(
                self.char_embedding_reverse, True
            )
        # 高速网络的数量
        n_highway = cnn_options.get('n_highway')
        use_highway = n_highway is not None and n_highway > 0
        # use_proj 为True
        use_proj = n_filters != projection_dim
        
        # 本来已经第三维是2048维了，确保数据输出格式
        if use_highway or use_proj:
            embedding = tf.reshape(embedding, [-1, n_filters])
            if self.bidirectional:
                embedding_reverse = tf.reshape(
                    embedding_reverse, [-1, n_filters]
                )

        if use_proj:
            # 使用投影，将滤波器再投影到一个projection_dim维的向量空间内
            assert n_filters > projection_dim
            with tf.variable_scope('CNN_proj') as scope:
                W_proj_cnn = tf.get_variable(
                    'W_proj',
                    [n_filters, projection_dim],
                    initializer = tf.random_normal_initializer(
                        mean = 0.0, stddev = np.sqrt(1.0 / n_filters)
                    ),
                    dtype = tf.float32,
                )
                b_proj_cnn = tf.get_variable(
                    'b_proj',
                    [projection_dim],
                    initializer = tf.constant_initializer(0.0),
                    dtype = tf.float32,
                )
        # 高速通道网络
        def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
            carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
            transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
            return carry_gate * transform_gate + (1.0 - carry_gate) * x

        if use_highway:
            # 高速网络的维度为2048维
            highway_dim = n_filters

            for i in range(n_highway):
                with tf.variable_scope('CNN_high_%s' % i) as scope:
                    W_carry = tf.get_variable(
                        'W_carry',
                        [highway_dim, highway_dim],
                        initializer = tf.random_normal_initializer(
                            mean = 0.0, stddev = np.sqrt(1.0 / highway_dim)
                        ),
                        dtype = tf.float32,
                    )
                    b_carry = tf.get_variable(
                        'b_carry',
                        [highway_dim],
                        initializer = tf.constant_initializer(-2.0),
                        dtype = tf.float32,
                    )
                    W_transform = tf.get_variable(
                        'W_transform',
                        [highway_dim, highway_dim],
                        initializer = tf.random_normal_initializer(
                            mean = 0.0, stddev = np.sqrt(1.0 / highway_dim)
                        ),
                        dtype = tf.float32,
                    )
                    b_transform = tf.get_variable(
                        'b_transform',
                        [highway_dim],
                        initializer = tf.constant_initializer(0.0),
                        dtype = tf.float32,
                    )

                embedding = high(
                    embedding, W_carry, b_carry, W_transform, b_transform
                )
                if self.bidirectional:
                    embedding_reverse = high(
                        embedding_reverse,
                        W_carry,
                        b_carry,
                        W_transform,
                        b_transform,
                    )
                # 扩展一层和两层经过高速网络的参数
                self.token_embedding_layers.append(
                    tf.reshape(
                        embedding, [self.batch_size, unroll_steps, highway_dim]
                    )
                )
        
        # 经过一层线性变换[bacth_size,unroll_nums,projection_dim]
        if use_proj:
            embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
            if self.bidirectional:
                embedding_reverse = (
                    tf.matmul(embedding_reverse, W_proj_cnn) + b_proj_cnn
                )
            # 只经过线性变换的网络参数
            self.token_embedding_layers.append(
                tf.reshape(
                    embedding, [self.batch_size, unroll_steps, projection_dim]
                )
            )
        
        # 确保矩阵尺寸相同
        if use_highway or use_proj:
            shp = [self.batch_size, unroll_steps, projection_dim]
            embedding = tf.reshape(embedding, shp)
            if self.bidirectional:
                embedding_reverse = tf.reshape(embedding_reverse, shp)
                
        # 经过线性变化的embdedding [bacth_size,unroll_nums,projection_dim]
        # self.token_embedding_layers 由四个嵌入层参数组成
        # [bacth_size,unroll_nums,nums++] 原始词嵌入
        # [bacth_size,unroll_nums,highway_dim] 经过第一层高速网络的词嵌入
        # [bacth_size,unroll_nums,highway_dim] 经过第二层高速网络的词嵌入
        # [bacth_size,unroll_nums,projection_dim] 经过低微线性投影的词嵌入
        # print(embedding)
        # print(self.token_embedding_layers)
        self.embedding = embedding
        if self.bidirectional:
            self.embedding_reverse = embedding_reverse
    
    # 构建模型
    def _build(self):
        # 所有词的数量
        n_tokens_vocab = self.options['n_tokens_vocab']
        batch_size = self.options['batch_size']
        # window长度
        unroll_steps = self.options['unroll_steps']
        
        # lstm编码长度
        lstm_dim = self.options['lstm']['dim']
        projection_dim = self.options['lstm']['projection_dim']
        # lstm的层数
        n_lstm_layers = self.options['lstm'].get('n_layers', 1)
        dropout = self.options['dropout']
        # 保有率
        keep_prob = 1.0 - dropout
        
        # 如果是字符级别的输入，则建立词，字符嵌入，否则建立词嵌入，实际上使用前者
        if self.char_inputs:
            self._build_word_char_embeddings()
        else:
            self._build_word_embeddings()
        
        # 存储lstm的状态
        self.init_lstm_state = []
        self.final_lstm_state = []
        
        # 双向
        # lstm_inputs单元为[batch_size,uroll_nums,projection_dim]双向单元
        if self.bidirectional:
            lstm_inputs = [self.embedding, self.embedding_reverse]
        else:
            lstm_inputs = [self.embedding]

        cell_clip = self.options['lstm'].get('cell_clip')
        proj_clip = self.options['lstm'].get('proj_clip')

        
        use_skip_connections = self.options['lstm'].get('use_skip_connections')
        print(lstm_inputs)
        lstm_outputs = []
        for lstm_num, lstm_input in enumerate(lstm_inputs):
            lstm_cells = []
            for i in range(n_lstm_layers):
                # 在进行LSTM编码后再接入一个num_proj的全连接层，[batch_size,projection_dim]
                # [batch_size,num_proj]
                lstm_cell = tf.nn.rnn_cell.LSTMCell(
                    # 隐含层的单元数
                    lstm_dim,
                    num_proj = lstm_dim // 2,
                    cell_clip = cell_clip,
                    proj_clip = proj_clip,
                )

                if use_skip_connections:
                    if i == 0:
                        pass
                    else:
                        # 将上一个单元的输出，和当前输入映射到下一个单元
                        lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)
                
                # 添加随机失活层
                if self.is_training:
                    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                        lstm_cell, input_keep_prob = keep_prob
                    )

                lstm_cells.append(lstm_cell)
            
            # 构建多层LSTM
            if n_lstm_layers > 1:
                lstm_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)
            else:
                lstm_cell = lstm_cells[0]

            with tf.control_dependencies([lstm_input]):
                # 初始化状态
                self.init_lstm_state.append(
                    lstm_cell.zero_state(self.batch_size, tf.float32)
                )
                if self.bidirectional:
                    with tf.variable_scope('RNN_%s' % lstm_num):
                        # 从最后一步开始，获取最后一步的输出，和最终的隐含状态,确保正反向LSTM单元可以拼接起来
                        _lstm_output_unpacked, final_state = tf.nn.static_rnn(
                            lstm_cell,
                            # 将每个词对应的张量进行分离并作为LSTM的输入
                            tf.unstack(lstm_input, axis = 1),
                            initial_state = self.init_lstm_state[-1],
                        )
                else:
                    _lstm_output_unpacked, final_state = tf.nn.static_rnn(
                        lstm_cell,
                        tf.unstack(lstm_input, axis = 1),
                        initial_state = self.init_lstm_state[-1],
                    )
                self.final_lstm_state.append(final_state)
            # [batch_size,num_proj]
#             print(final_state)
            # 将一个隐含层的输出拼接起来 [batch_size,20,256]
            lstm_output_flat = tf.reshape(
                tf.stack(_lstm_output_unpacked, axis = 1), [-1, projection_dim]
            )
            print(lstm_output_flat)
            tf.add_to_collection(
                'lstm_output_embeddings', _lstm_output_unpacked
            )

            lstm_outputs.append(lstm_output_flat)
        self._build_loss(lstm_outputs)
    
    # 构建损失函数
    def _build_loss(self, lstm_outputs):
        batch_size = self.options['batch_size']
        unroll_steps = self.options['unroll_steps']
        
        # 所有词的数量
        n_tokens_vocab = self.options['n_tokens_vocab']

        def _get_next_token_placeholders(suffix):
            name = 'next_token_id' + suffix
            id_placeholder = tf.placeholder(
                tf.int32, shape = (None, unroll_steps), name = name
            )
            return id_placeholder

        self.next_token_id = _get_next_token_placeholders('')
        # 每次抽取[batch_size,unroll_nums]个词
        print(self.next_token_id)
        if self.bidirectional:
            self.next_token_id_reverse = _get_next_token_placeholders(
                '_reverse'
            )
        # softmax的维度为projection_dim（256）
        softmax_dim = self.options['lstm']['projection_dim']
        # 与词嵌入的权重共享
        if self.share_embedding_softmax:
            self.softmax_W = self.embedding_weights

        # 初始化softmax的参数
        with tf.variable_scope('softmax'), tf.device('/cpu:0'):
            softmax_init = tf.random_normal_initializer(
                0.0, 1.0 / np.sqrt(softmax_dim)
            )
            # softmax分布到每一个词中
            if not self.share_embedding_softmax:
                self.softmax_W = tf.get_variable(
                    'W',
                    [n_tokens_vocab, softmax_dim],
                    dtype = tf.float32,
                    initializer = softmax_init,
                )
            self.softmax_b = tf.get_variable(
                'b',
                [n_tokens_vocab],
                dtype = tf.float32,
                initializer = tf.constant_initializer(0.0),
            )

        self.individual_losses = []

        if self.bidirectional:
            next_ids = [self.next_token_id, self.next_token_id_reverse]
        else:
            next_ids = [self.next_token_id]
        
        self.output_scores = tf.identity(lstm_outputs, name = 'softmax_score')
        
        for id_placeholder, lstm_output_flat in zip(next_ids, lstm_outputs):
            next_token_id_flat = tf.reshape(id_placeholder, [-1, 1])
            with tf.control_dependencies([lstm_output_flat]):
                if self.is_training and self.sample_softmax:
                    losses = tf.nn.sampled_softmax_loss(
                        self.softmax_W,
                        self.softmax_b,
                        next_token_id_flat,
                        lstm_output_flat,
                        int(
                            self.options['n_negative_samples_batch']
                            * self.options['n_tokens_vocab']
                        ),
                        self.options['n_tokens_vocab'],
                        num_true = 1,
                    )

                else:
                    output_scores = (
                        tf.matmul(
                            lstm_output_flat, tf.transpose(self.softmax_W)
                        )
                        + self.softmax_b
                    )

                    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits = self.output_scores,
                        labels = tf.squeeze(
                            next_token_id_flat, squeeze_dims = [1]
                        ),
                    )

            self.individual_losses.append(tf.reduce_mean(losses))

        if self.bidirectional:
            self.total_loss = 0.5 * (
                self.individual_losses[0] + self.individual_losses[1]
            )
        else:
            self.total_loss = self.individual_losses[0]

In [26]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = LanguageModel(options, True)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use the `axis` argument instead
[<tf.Tensor 'Reshape_5:0' shape=(?, 20, 256) dtype=float32>, <tf.Tensor 'Reshape_6:0' shape=(?, 20, 256) dtype=float32>]
Tensor("Reshape_7:0", shape=(?, 256), dtype=float32)
Tensor("Reshape_8:0", shape=(?, 256), dtype=float32)
Tensor("next_token_id:0", shape=(?, 20), dtype=int32)
[<tf.Tensor 'Reshape_7:0' shape=(?, 256) dtype=float32>, <tf.Tensor 'Reshape_8:0' shape=(?, 256) dtype=float32>]
Tensor("softmax_score:0", shape=(2, ?, 256), dtype=float32)


In [27]:
from tqdm import tqdm

def _get_feed_dict_from_X(X, model, char_inputs, bidirectional):
    feed_dict = {}
    if not char_inputs:
        token_ids = X['token_ids']
        feed_dict[model.token_ids] = token_ids
    else:
        char_ids = X['tokens_characters']
        feed_dict[model.tokens_characters] = char_ids
    if bidirectional:
        if not char_inputs:
            feed_dict[model.token_ids_reverse] = X['token_ids_reverse']
        else:
            feed_dict[model.tokens_characters_reverse] = X['tokens_characters_reverse']
    next_id_placeholders = [[model.next_token_id, '']]
    if bidirectional:
        next_id_placeholders.append([model.next_token_id_reverse, '_reverse'])

    for id_placeholder, suffix in next_id_placeholders:
        name = 'next_token_id' + suffix
        feed_dict[id_placeholder] = X[name]

    return feed_dict

In [28]:
bidirectional = options.get('bidirectional', False)
batch_size = options['batch_size']
unroll_steps = options['unroll_steps']
n_train_tokens = options.get('n_train_tokens')
n_tokens_per_batch = batch_size * unroll_steps
n_batches_per_epoch = int(n_train_tokens / n_tokens_per_batch)
n_batches_total = options['n_epochs'] * n_batches_per_epoch

init_state_tensors = model.init_lstm_state
final_state_tensors = model.final_lstm_state

char_inputs = 'char_cnn' in options
if char_inputs:
    max_chars = options['char_cnn']['max_characters_per_token']
    feed_dict = {
        model.tokens_characters: np.zeros(
            [batch_size, unroll_steps, max_chars], dtype = np.int32
        )
    }

else:
    feed_dict = {model.token_ids: np.zeros([batch_size, unroll_steps])}

if bidirectional:
    if char_inputs:
        feed_dict.update(
            {
                model.tokens_characters_reverse: np.zeros(
                    [batch_size, unroll_steps, max_chars], dtype = np.int32
                )
            }
        )
    else:
        feed_dict.update(
            {
                model.token_ids_reverse: np.zeros(
                    [batch_size, unroll_steps], dtype = np.int32
                )
            }
        )

init_state_values = sess.run(init_state_tensors, feed_dict = feed_dict)

In [29]:
data_gen = bi.iter_batches(batch_size, unroll_steps)
pbar = tqdm(range(n_batches_total), desc = 'train minibatch loop')
for p in pbar:
    batch = next(data_gen)
    feed_dict = {t: v for t, v in zip(init_state_tensors, init_state_values)}
    feed_dict.update(_get_feed_dict_from_X(batch, model, char_inputs, bidirectional))
    score, loss, _, init_state_values = sess.run([model.output_scores,
                                           model.total_loss, model.optimizer, final_state_tensors],
            feed_dict = feed_dict)
    pbar.set_postfix(cost = loss)

train minibatch loop:   0%|          | 35/14200 [00:34<3:57:25,  1.01s/it, cost=5.21]

KeyboardInterrupt: 

In [17]:
word_embed = model.softmax_W.eval()

In [18]:
from scipy.spatial.distance import cdist
from sklearn.neighbors import NearestNeighbors

In [19]:
word = '杨过'
nn = NearestNeighbors(10, metric = 'cosine').fit(word_embed)
distances, idx = nn.kneighbors(word_embed[dictionary[word]].reshape((1, -1)))
word_list = []
for i in range(1, idx.shape[1]):
    word_list.append([rev_dictionary[idx[0, i]], 1 - distances[0, i]])
word_list

[['comedy', 0.42140477895736694],
 ['film', 0.3776022791862488],
 ['one', 0.3622022271156311],
 ['story', 0.36113226413726807],
 ['watch', 0.3598611354827881],
 ['movie', 0.3553805351257324],
 ['like', 0.354833722114563],
 ['almost', 0.3544100522994995],
 ['time', 0.35355472564697266]]