In [1]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train on gensim's bundled example corpus `common_texts`:
# a list of sentences, where each sentence is a list of word tokens.
sentences = common_texts

# Fit a skip-gram Word2Vec model with negative sampling.
model = Word2Vec(
    sentences=sentences,   # training corpus
    # --- algorithm ---
    sg=1,                  # 1 selects the skip-gram training algorithm
    hs=0,                  # 0 disables hierarchical softmax; negative sampling is used
    # --- embedding shape / context ---
    vector_size=100,       # dimensionality of each word vector
    window=5,              # context window size
    # --- vocabulary and training run ---
    min_count=1,           # ignore words whose frequency is below this value
    epochs=10,             # number of training iterations over the corpus
    workers=4,             # number of training threads
)

In [2]:
# Look up the learned embedding of a single word.
vector = model.wv.get_vector('computer')
vector

array([-0.00515814, -0.00667079, -0.0077797 ,  0.00831379, -0.00198307,
       -0.00685749, -0.00415592,  0.00514602, -0.00287019, -0.00375104,
        0.00162202, -0.00277732, -0.00158494,  0.00107489, -0.00297904,
        0.00852242,  0.00391237, -0.00996253,  0.0062619 , -0.00675674,
        0.00076972,  0.00440586, -0.00510525, -0.00211145,  0.00809846,
       -0.00424535, -0.00763907,  0.00926132, -0.00215629, -0.00472117,
        0.00857395,  0.00428492,  0.00432643,  0.00928793, -0.00845619,
        0.00525725,  0.0020401 ,  0.00418982,  0.00169853,  0.00446578,
        0.00448794,  0.00610677, -0.00320328, -0.00457741, -0.00042667,
        0.00253467, -0.00326437,  0.00605995,  0.00415566,  0.00776745,
        0.00257022,  0.00811967, -0.00138772,  0.0080809 ,  0.00371838,
       -0.00805029, -0.00393506, -0.00247279,  0.00489484, -0.00087248,
       -0.00283195,  0.00783659,  0.00932633, -0.00161552, -0.00516115,
       -0.00470349, -0.00484783, -0.00960636,  0.00137253, -0.00

In [3]:
# Top-5 nearest neighbours of 'computer', as (word, similarity) pairs.
similar_words = model.wv.most_similar(positive=['computer'], topn=5)
similar_words

[('system', 0.21617144346237183),
 ('survey', 0.04468921199440956),
 ('interface', 0.015375791117548943),
 ('time', 0.0020086215808987617),
 ('trees', -0.032843150198459625)]

In [5]:
# Similarity score between two words (cosine similarity of their vectors).
similarity = model.wv.similarity('computer', 'human')
similarity

-0.07424271

在 Word2Vec 模型中，词与词的相似度用余弦相似度来衡量，取值范围为 -1 到 1：越接近 1，表示两个词向量的方向越一致、两个词越相似；接近 0 表示两个词基本无关；越接近 -1，表示两个词向量的方向越相反。因此不能只看绝对值的大小，而应该连同符号一起看相似度值本身。例如上面得到的 -0.074 接近 0，说明在这份语料上模型认为 'computer' 与 'human' 基本无关。

## 拼接测试

In [7]:
sentences = common_texts


def train_skipgram(sentences, vector_size):
    """Train a skip-gram Word2Vec model with a configurable embedding width.

    All hyperparameters other than ``vector_size`` are fixed, so models
    produced by this helper differ only in the dimensionality of their vectors.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised training corpus.
    vector_size : int
        Dimensionality of the word vectors to learn.

    Returns
    -------
    Word2Vec
        The trained model.
    """
    return Word2Vec(
        sentences=sentences,      # training corpus
        vector_size=vector_size,  # dimensionality of each word vector
        window=5,                 # context window size
        min_count=1,              # ignore words whose frequency is below this value
        workers=4,                # number of training threads
        sg=1,                     # use the skip-gram training algorithm
        hs=0,                     # no hierarchical softmax; negative sampling instead
        epochs=10,                # number of training iterations over the corpus
    )


# Two models trained on the same corpus with identical settings except for
# the embedding width (8 vs. 16 dimensions).
model1 = train_skipgram(sentences, vector_size=8)
model2 = train_skipgram(sentences, vector_size=16)

In [8]:
# Show model1's vector for 'human' (model1 was trained with vector_size=8).
model1.wv['human']

array([-0.03884076,  0.08497037,  0.02039345,  0.00237396,  0.04342046,
        0.00272222,  0.12023532,  0.06325755], dtype=float32)

In [9]:
# Show model2's vector for 'human' (model2 was trained with vector_size=16).
model2.wv['human']

array([-0.04428703, -0.01560663,  0.03214534, -0.02290773, -0.05856287,
        0.02391712,  0.03052799, -0.04017852,  0.00755349, -0.01296798,
        0.00015252, -0.06177193,  0.01682503, -0.02968816,  0.00679779,
       -0.0098514 ], dtype=float32)

现在需要构造一个 model3（下文代码中对应的变量名为 `combined_model`）：它同样是一个 Word2Vec 对象，但不需要进行训练，其中每个单词的向量都是该单词在 model1 与 model2 中向量的拼接（model1 的向量在前，model2 的向量在后）。
> 已知 model1 与 model2 的词表（单词集合）相同。

In [11]:
# Every word in model1's vocabulary (model2 is stated to contain the same words).
all_words = model1.wv.index_to_key
all_words

['system',
 'graph',
 'trees',
 'user',
 'minors',
 'eps',
 'time',
 'response',
 'survey',
 'computer',
 'interface',
 'human']

In [16]:
import numpy as np

# Both models are expected to share one vocabulary. Fail loudly if a word is
# absent from model2 instead of silently skipping it, which would only show
# up later as a bare KeyError in a per-word lookup.
missing_words = [word for word in all_words if word not in model2.wv]
if missing_words:
    raise ValueError(f"words missing from model2: {missing_words}")

# Map each word to its model1 vector followed by its model2 vector.
concatenated_vectors = {
    word: np.concatenate((model1.wv[word], model2.wv[word]))
    for word in all_words
}

concatenated_vectors

{'system': array([-0.00670284,  0.00295539,  0.06379187,  0.11261591, -0.11628687,
        -0.08896011,  0.08073591,  0.11216235, -0.00335142,  0.0014777 ,
         0.03189594,  0.05630796, -0.05814344, -0.04448006,  0.04036795,
         0.05608118, -0.03134643, -0.02352107,  0.04612815, -0.0095842 ,
        -0.02835383,  0.04096282, -0.030376  , -0.01135011], dtype=float32),
 'graph': array([-0.06269285, -0.04704215,  0.09225631, -0.01916839, -0.05670767,
         0.08192565, -0.060752  , -0.02270022,  0.01797862,  0.00619921,
        -0.05178259, -0.05905511,  0.04569854,  0.03168914,  0.04223558,
         0.00476791,  0.03969306, -0.02128354, -0.00591501,  0.03605358,
        -0.04701024, -0.02460065, -0.04694739, -0.00581276], dtype=float32),
 'trees': array([ 0.03595725,  0.01239842, -0.10356519, -0.11811022,  0.09139708,
         0.06337827,  0.08447117,  0.00953582,  0.05961324, -0.04574479,
        -0.01458605, -0.01211088,  0.05048398, -0.0370681 ,  0.00028227,
        -0.0297

In [14]:
# Create an untrained Word2Vec object whose vector width is the sum of the
# widths of model1 and model2.
combined_model = Word2Vec(vector_size=model1.vector_size + model2.vector_size, min_count=1)

# Register the vocabulary by presenting all words as one pseudo-sentence;
# min_count=1 keeps every word even though each appears only once.
combined_model.build_vocab([all_words])

# Overwrite the initial vectors with the concatenated ones in a single bulk
# call (rows are matched to words by key, not by position in the vocabulary).
combined_words = list(all_words)
combined_matrix = np.vstack([concatenated_vectors[word] for word in combined_words])
combined_model.wv.add_vectors(combined_words, combined_matrix, replace=True)

# Sanity check: every stored vector must equal its concatenated source vector.
for word in combined_words:
    np.testing.assert_array_equal(combined_model.wv[word], concatenated_vectors[word])

In [15]:
# Show the combined vector for 'human'; it should read as model1's vector
# followed by model2's vector.
combined_model.wv['human']

array([-0.03884076,  0.08497037,  0.02039345,  0.00237396,  0.04342046,
        0.00272222,  0.12023532,  0.06325755, -0.04428703, -0.01560663,
        0.03214534, -0.02290773, -0.05856287,  0.02391712,  0.03052799,
       -0.04017852,  0.00755349, -0.01296798,  0.00015252, -0.06177193,
        0.01682503, -0.02968816,  0.00679779, -0.0098514 ], dtype=float32)

In [17]:
# Spot-check a second word from the combined model.
combined_model.wv['minors']

array([ 0.11922649, -0.09148958, -0.02917211, -0.02422176,  0.10096796,
       -0.0741362 ,  0.00056453, -0.05942167, -0.00942505,  0.01543622,
       -0.00555017,  0.03458539, -0.01714361,  0.01412541,  0.03409871,
        0.05216221, -0.00908588, -0.05755089,  0.02731595,  0.00357366,
        0.04651193, -0.00508302, -0.01649009, -0.05470631], dtype=float32)