# 11.3实战：动手写Word2Vec

#### 实验说明：该代码实现的应用比较简单，利用Tensorflow2.x框架搭建Skip-gram模型以实现文本向量化，并计算输出与测试词最相近的10个词。


## 1. 导入依赖模块

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import jieba
import re
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.manifold import TSNE

## 2. 读取停用词

#### 加载停用词表，去掉每行中的回车符、换行符和首尾空格，并分别打印去重前后的停用词数量

In [2]:
# Read the stop-word list: one entry per line, with CR/LF characters removed
# and surrounding whitespace stripped.
with open("/home/Word2VecTest/data/stop_words.txt", "r", encoding="utf-8") as f_stopwords:
    stop_words = [
        raw_line.replace("\r", "").replace("\n", "").strip()
        for raw_line in f_stopwords
    ]
print(len(stop_words))  # number of entries as read, duplicates included

# Collapse duplicates into a set, which is what the membership tests use.
stop_words = set(stop_words)
print(len(stop_words))  # number of distinct entries

1910
1909


## 3. 文本预处理

#### 首先定义一个列表用于存储分词后的全部词语，然后逐行读取文本数据集，去掉每行中的回车符、换行符和首尾空格，接着用jieba进行分词，只保留由汉字组成的词并过滤停用词，最后将结果写入分词文件，并打印总词数和去重后的词数。

In [3]:
# Segment the corpus with jieba, keep only runs of CJK characters that are not
# stop words, collect them in raw_word_list and write one space-joined line per
# non-empty input line to the segmented output file.
raw_word_list = []
rules = u"([\u4e00-\u9fa5]+)"
pattern = re.compile(rules)

# Both files are managed by `with`, so they are closed even if segmentation
# raises. The input is opened first so that a missing corpus does not leave a
# freshly truncated output file behind.
with open("/home/Word2VecTest/data//The_Smiling_Proud_Wanderer.txt", "r", encoding="utf-8") as f_reader:
    with open("/home/Word2VecTest/data/Seg_The_Smiling_Proud_Wanderer.txt", "w", encoding="utf-8") as f_writer:
        for line in f_reader:
            line = line.replace("\r", "").replace("\n", "").strip()
            if not line:
                continue
            # jieba.cut yields tokens; joining them with spaces lets the regex
            # pick out each run of Chinese characters as one word.
            segmented = " ".join(jieba.cut(line))
            word_list = [
                word for word in pattern.findall(segmented)
                if word not in stop_words
            ]
            if word_list:
                raw_word_list.extend(word_list)
                f_writer.write(" ".join(word_list) + "\n")

print(len(raw_word_list))       # total number of kept tokens
print(len(set(raw_word_list)))  # number of distinct kept tokens

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.778 seconds.
Prefix dict has been built successfully.


272481
41043


## 4. 文本编码，通过汉字找到相应的编码，通过编码找到相应的汉字

#### 根据编码找到相应的词

In [None]:
# Build the word <-> id mappings and encode the corpus as a list of ids.
vocabulary_size = len(set(raw_word_list))
words = raw_word_list

# count[0] is the placeholder entry for unknown words; the remaining entries
# are the (word, frequency) pairs of the vocabulary_size - 1 most common words.
count = [['UNK', '-1']]
count += collections.Counter(words).most_common(vocabulary_size - 1)
print("count",len(count))

# Each word receives the current size of the dictionary as its id, so ids
# follow the order of `count` (id 0 is 'UNK').
dictionary = {}
for entry in count:
    dictionary[entry[0]] = len(dictionary)

# Words missing from the dictionary are encoded as 0 and tallied separately.
data = [dictionary.get(token, 0) for token in words]
unk_count = sum(1 for token in words if token not in dictionary)
count[0][1] = unk_count

# Inverse mapping: id -> word.
reverse_dictionary = {index: token for token, index in dictionary.items()}
del words

# Sanity check; the lookup raises KeyError if there is no word with id 1000.
print(reverse_dictionary[1000])
print(data[:200])

count 41043
江南
[1983, 13891, 20228, 20229, 20230, 7412, 4195, 13892, 20231, 20232, 20233, 13893, 13894, 478, 1810, 10701, 5699, 20234, 5700, 20235, 427, 2433, 10701, 906, 20236, 7413, 20237, 20238, 20239, 3332, 2781, 8739, 251, 2954, 20240, 8740, 7414, 2093, 7415, 10702, 4615, 2434, 1984, 13895, 2094, 3861, 10703, 7416, 13896, 13897, 598, 10703, 8741, 20241, 10703, 509, 1260, 1030, 4615, 2434, 20242, 20243, 132, 20244, 2434, 367, 203, 510, 20245, 13898, 13899, 13900, 4196, 13901, 7417, 839, 5701, 2186, 2435, 20246, 7418, 20247, 10704, 20248, 367, 203, 510, 20249, 2187, 20250, 20251, 105, 6485, 20252, 20253, 5702, 20254, 1622, 10705, 8742, 100, 689, 13902, 13903, 6486, 485, 20255, 2619, 74, 7419, 1559, 587, 1622, 100, 330, 48, 10706, 839, 36, 203, 10707, 4616, 3333, 1811, 20256, 839, 200, 2782, 690, 213, 2783, 20257, 20258, 3334, 4617, 20259, 5091, 551, 3135, 8743, 324, 132, 3335, 10708, 1984, 13904, 1119, 2784, 2785, 4618, 13905, 3565, 2436, 2436, 2188, 2955, 434, 1887, 10709, 20260, 1

## 5. 模型搭建与测试

#### 这部分不仅包含Skip-gram模型的搭建，还包括参数设置：skip_window设置为2，即中心词与上下文词之间的最大距离；num_skips设置为4，即每个中心词生成4个训练样本；batch_size设置为128等。模型训练过程中会定期输出与测试词最相近的10个词。

In [None]:
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram batch from the module-level ``data`` sequence.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``
    starting at the global ``data_index``. For every window position the
    centre id is paired with ``num_skips`` distinct context ids drawn at
    random from the same window. ``data_index`` is advanced as ids are
    consumed and wraps around modulo ``len(data)``.

    Args:
        batch_size: number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: pairs generated per centre word; must be between 1 and
            ``2 * skip_window``.
        skip_window: number of ids considered on each side of the centre.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding centre ids and context ids respectively.

    Raises:
        ValueError: if the arguments violate the constraints above. Without
            this check an oversized ``num_skips`` could never find enough
            distinct context positions, and a ``batch_size`` that is not a
            multiple of ``num_skips`` would leave uninitialised rows in the
            returned arrays.
    """
    global data_index
    if num_skips < 1 or num_skips > 2 * skip_window:
        raise ValueError(
            "num_skips must be between 1 and 2 * skip_window, got "
            "num_skips=%d, skip_window=%d" % (num_skips, skip_window))
    if batch_size % num_skips != 0:
        raise ValueError(
            "batch_size (%d) must be a multiple of num_skips (%d)"
            % (batch_size, num_skips))

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Number of ids visible at once: the centre plus skip_window on each side.
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Window offsets that may serve as context (everything but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window by one id; the deque drops the oldest entry.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# Draw one sample batch and print its first ten pairs as
# "<centre id> <centre word> --> <context id> <context word>".
batch, labels = generate_batch(batch_size=128, num_skips=4, skip_window=2)

for center_id, context_id in zip(batch[:10], labels[:10, 0]):
    print(center_id, reverse_dictionary[center_id], "-->", context_id, reverse_dictionary[context_id])

# Skip-gram model hyper-parameters.
batch_size = 128       # (centre, context) pairs per training batch
embedding_size = 300   # second dimension of the embedding / NCE weight tables
skip_window = 2        # ids considered on each side of the centre word
num_skips = 4          # pairs generated per centre word
valid_window = 100     # NOTE(review): not referenced in the visible code
num_sample = 64        # passed to tf.nn.nce_loss as num_sampled
learning_rate = 0.01   # handed to the SGD optimizer

# Validation set: words whose nearest neighbours are reported, and their ids.
# The lookup raises KeyError if any of these words is not in `dictionary`.
valid_word = ['令狐冲', '左冷禅', '林平之', '岳不群', '桃根仙']
valid_example = [dictionary[name] for name in valid_word]

# Restart the corpus cursor before defining the skip-gram batch generator.
data_index = 0


def next_batch(batch_size, num_skips, skip_window):
    """Generate one training batch for the skip-gram model.

    Reads the module-level ``data`` sequence starting at the global
    ``data_index`` and pairs each centre id with ``num_skips`` context ids
    sampled from the surrounding window.

    Args:
        batch_size: number of pairs to produce; asserted to be a multiple of
            ``num_skips``.
        num_skips: pairs per centre word; asserted to be at most
            ``2 * skip_window``.
        skip_window: number of ids on each side of the centre word.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)``.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    centers = np.ndarray(shape=(batch_size), dtype=np.int32)
    contexts = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Window length: the centre word plus skip_window ids on either side.
    window = 2 * skip_window + 1
    total = len(data)

    # Start over from the beginning when a full window no longer fits.
    if data_index + window > total:
        data_index = 0
    buffer = collections.deque(data[data_index:data_index + window], maxlen=window)
    data_index += window

    # Offsets inside the window that are eligible as context positions.
    neighbours = [offset for offset in range(window) if offset != skip_window]
    row = 0
    for _ in range(batch_size // num_skips):
        for offset in random.sample(neighbours, num_skips):
            centers[row] = buffer[skip_window]
            contexts[row, 0] = buffer[offset]
            row += 1
        if data_index != total:
            # Slide the window forward by one id.
            buffer.append(data[data_index])
            data_index += 1
        else:
            # Reached the end of the data: refill from the start.
            buffer.extend(data[0:window])
            data_index = window

    # Step back a little so that no words are skipped at the end of a batch.
    data_index = (data_index + total - window) % total
    return centers, contexts

# Make sure the following variables are placed on the CPU
# (per the original author's note, some of the ops are not GPU-compatible).
with tf.device('/cpu:0'):
    # Embedding table: one row per vocabulary entry, drawn from a normal distribution.
    embedding = tf.Variable(tf.random.normal(shape=[vocabulary_size, embedding_size]))
    # Weights and biases used by the NCE loss.
    nce_weights = tf.Variable(tf.random.normal(shape=[vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros(shape=[vocabulary_size]))

def get_embedding(x):
    """Look up the rows of the global ``embedding`` table selected by ``x``.

    Args:
        x: ids to look up (passed straight to ``tf.nn.embedding_lookup``).

    Returns:
        The tensor returned by ``tf.nn.embedding_lookup(embedding, x)``.
    """
    with tf.device('/cpu:0'):
        return tf.nn.embedding_lookup(embedding, x)

def nce_loss(x_embed, y):
    """Return the mean NCE loss of a batch.

    Args:
        x_embed: embedded inputs, forwarded as ``inputs`` to ``tf.nn.nce_loss``.
        y: target ids; cast to int64 before being used as ``labels``.

    Returns:
        The mean of ``tf.nn.nce_loss`` computed with the global
        ``nce_weights`` / ``nce_biases``, ``num_sample`` sampled classes and
        ``vocabulary_size`` classes in total.
    """
    with tf.device('/cpu:0'):
        targets = tf.cast(y, tf.int64)
        per_example = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=targets,
            inputs=x_embed,
            num_sampled=num_sample,
            num_classes=vocabulary_size,
        )
        return tf.reduce_mean(per_example)

# Evaluation
def evaluate(x_embed):
    """Return the cosine similarity between query embeddings and all embeddings.

    Args:
        x_embed: 2-D tensor of query embeddings, one query per row
            (``tf.matmul`` below requires rank 2).

    Returns:
        A tensor with one row per query and one column per row of the global
        ``embedding`` table, holding the cosine similarity of each pair.
    """
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed, tf.float32)
        # Normalise every query row by its own L2 norm. Reducing without an
        # axis would divide all rows by the norm of the whole batch, which is
        # not a cosine similarity.
        x_embed_norm = x_embed / tf.sqrt(
            tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))
        # tf.sqrt takes (x, name); no dtype argument belongs here.
        embedding_norm = embedding / tf.sqrt(
            tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
        cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)
        return cosine_sim_op


# Optimizer: stochastic gradient descent with the configured learning rate.
optimizer = tf.optimizers.SGD(learning_rate)


def run_optimization(x, y):
    """Run one optimisation step on a batch.

    Computes the NCE loss of ``(x, y)`` under a gradient tape and applies the
    resulting gradients to ``embedding``, ``nce_weights`` and ``nce_biases``.

    Args:
        x: input ids of the batch.
        y: target ids of the batch.
    """
    trainable = [embedding, nce_weights, nce_biases]
    with tf.device('/cpu:0'):
        # Record the forward pass so it can be differentiated automatically.
        with tf.GradientTape() as tape:
            batch_loss = nce_loss(get_embedding(x), y)

        # Gradients of the loss with respect to each trainable variable.
        grads = tape.gradient(batch_loss, trainable)

        # Update the variables using those gradients.
        optimizer.apply_gradients(zip(grads, trainable))


# Ids of the words whose nearest neighbours are reported during training.
x_test = np.array(valid_example)
num_steps = 2000000
top_k = 10

# avg_loss holds the mean loss of the most recently completed reporting
# interval; loss_sum / loss_steps accumulate the interval in progress and are
# reset after every report so that intervals do not bleed into each other.
avg_loss = 0.0
loss_sum = 0.0
loss_steps = 0

# Train for the given number of steps.
for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    run_optimization(batch_inputs, batch_labels)
    loss = nce_loss(get_embedding(batch_inputs), batch_labels)
    # Accumulate plain floats rather than chaining tensors across steps.
    loss_sum += float(loss)
    loss_steps += 1

    if step % 5000 == 0:
        avg_loss = loss_sum / loss_steps
        loss_sum = 0.0
        loss_steps = 0
        # The loss computed above is for this very batch; it is not recomputed.
        print("step: %i, loss: %f" % (step, loss))

    # Report the nearest neighbours of the validation words.
    if step % 10000 == 0:
        sim = evaluate(get_embedding(x_test)).numpy()
        for i, word_id in enumerate(valid_example):
            val_word = reverse_dictionary[word_id]
            # Sort by descending similarity and skip the first hit, which is
            # expected to be the query word itself.
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            sim_str = "与" + val_word + "最近的前10词是"
            # Iterate over what argsort returned, so a vocabulary smaller than
            # top_k + 1 does not raise IndexError.
            sim_str += "".join(" %s," % reverse_dictionary[idx] for idx in nearest)
            print(sim_str)


20228 金庸 --> 13891 作者
20228 金庸 --> 20229 灭门
20228 金庸 --> 1983 笑傲江湖
20228 金庸 --> 20230 风熏
20229 灭门 --> 20228 金庸
20229 灭门 --> 7412 柳
20229 灭门 --> 20230 风熏
20229 灭门 --> 13891 作者
20230 风熏 --> 20228 金庸
20230 风熏 --> 4195 花香
step: 0, loss: 605.019836
与令狐冲最近的前10词是 禅院, 余一片, 报答, 一手交货, 真有假, 五十两, 中加, 退后, 一了百了, 有定,
与左冷禅最近的前10词是 很长, 收殓, 大有分别, 绞索, 老起, 陷愈, 驻足, 小兵, 此剑, 左肘,
与林平之最近的前10词是 滋补, 黑狱, 单请, 本寺, 辰, 性命不保, 造反派, 起伏不定, 易发见, 食物,
与岳不群最近的前10词是 苏, 稍加, 石几, 邻家, 冲进去, 立地成佛, 有道是, 倚傍, 同在, 尝尝,
与桃根仙最近的前10词是 以小人之心, 上下左右, 俘虏, 刮去, 长城, 越众, 从练, 乐队, 把手, 括,
step: 5000, loss: 489.685852
step: 10000, loss: 423.761383
与令狐冲最近的前10词是 岳不群, 弟子, 田伯光, 师父, 决在, 中加, 盈盈, 允, 林平之, 受伤,
与左冷禅最近的前10词是 很长, 绞索, 收殓, 陷愈, 此剑, 大有分别, 小兵, 学到, 好人, 疯子,
与林平之最近的前10词是 滋补, 造反派, 令狐冲, 黑狱, 土语, 六个, 结亲, 来处, 气助, 摇摇欲坠,
与岳不群最近的前10词是 令狐冲, 他点, 苏, 石几, 以强凌弱, 激得, 稍加, 招斗, 听得见, 中任击,
与桃根仙最近的前10词是 以小人之心, 上下左右, 俘虏, 长城, 从练, 把手, 一蒸, 刮去, 越众, 道袍,
step: 15000, loss: 374.860016
step: 20000, loss: 403.257141
与令狐冲最近的前10词是 弟子, 岳不群, 师父, 剑法, 岳灵珊, 盈盈, 林平之, 瞧, 恒山, 田伯光,
与左冷禅最近的前10词是 