In [19]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.layers import Embedding, Dot, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [20]:
# Step 1: 准备数据
corpus = [
    "tensorflow is a deep learning framework",
    "word embeddings are useful in many NLP tasks",
    "skip gram is a word embedding technique",
    "efficient estimation of word representations"
]

In [26]:
# Step 2: 文本预处理 - 将文本转换为数字序列
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)  # 根据语料库进行词汇映射
word2id = tokenizer.word_index  # 词到ID的映射
id2word = {v: k for k, v in word2id.items()}  # ID到词的映射
sequences = tokenizer.texts_to_sequences(corpus)  # 生成序列化后的句子

# Step 3: 定义模型的超参数
vocab_size = len(word2id) + 1  # 词汇表的大小
window_size = 2  # Skip-Gram窗口大小
embedding_dim = 100  # 词向量的维度

# Step 4: 生成 Skip-Gram 训练数据
def generate_training_data(sequences, window_size, vocab_size):
    all_pairs = []
    for tokens in sequences:
        pairs, _ = skipgrams(tokens, vocab_size, window_size=window_size)  # 生成Skip-Gram数据
        all_pairs.extend(pairs)
    return np.array(all_pairs)

pairs = generate_training_data(sequences, window_size, vocab_size)
targets, contexts = pairs[:, 0], pairs[:, 1]  # 提取目标词和上下文词

# Step 5: 创建 TensorFlow 数据集
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices((targets, contexts))  # 构建TensorFlow数据集
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)  # 打乱并分批数据

# Step 6: 构建 Skip-Gram 模型
target_input = tf.keras.layers.Input(shape=(), dtype='int32')  # 目标词输入
context_input = tf.keras.layers.Input(shape=(), dtype='int32')  # 上下文词输入

embedding = Embedding(vocab_size, embedding_dim, name='embedding')  # 嵌入层，显式指定名称

target_embedding = embedding(target_input)  # 获取目标词嵌入向量
context_embedding = embedding(context_input)  # 获取上下文词嵌入向量

dot_product = Dot(axes=-1)([target_embedding, context_embedding])  # 计算点积（相似度分数）

output = Reshape((1,))(dot_product)  # 调整输出形状

skipgram_model = Model([target_input, context_input], output)  # 构建模型
skipgram_model.compile(loss='binary_crossentropy', optimizer=Adam())  # 编译模型

# Step 7: 训练模型
labels = np.ones(len(targets))  # 所有正样本标签为1
skipgram_model.fit([targets, contexts], labels, epochs=10, batch_size=BATCH_SIZE)  # 开始训练模型

# Step 8: 提取训练好的词嵌入
word_embeddings = skipgram_model.get_layer('embedding').get_weights()[0]  # 提取词向量

# Step 9: 查找相似词
def get_similar_words(word, word_embeddings, word2id, id2word, top_n=5):
    word_vec = word_embeddings[word2id[word]]  # 获取指定词的词向量
    sim_scores = np.dot(word_embeddings, word_vec)  # 计算余弦相似度
    sim_ids = np.argsort(sim_scores)[-top_n-1:-1][::-1]  # 找出最相似的词
    return [id2word[i] for i in sim_ids]



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Example of finding similar words
similar_words = get_similar_words("estimation", word_embeddings, word2id, id2word)
print("Words similar to 'estimation':", similar_words)

Words similar to 'estimation': ['of', 'efficient', 'word', 'framework', 'are']
