In [4]:
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set.
data = pd.read_csv('train.csv')

# Drop missing comments BEFORE casting to str: casting NaN to str yields the
# literal text 'nan', which would otherwise be segmented and trained on as if
# it were a real review.
comments = data['comment'].dropna().astype(str)

# Punctuation characters removed from each comment prior to segmentation.
# A single translation table deletes all of them in one pass over the text.
PUNCTUATION_TO_STRIP = "，!！。~；？?【】#"
PUNCTUATION_TABLE = str.maketrans('', '', PUNCTUATION_TO_STRIP)

# Segment every comment with jieba; the result is a list of token lists,
# which is the sentence format Word2Vec consumes.
corpus = [jieba.lcut(text.translate(PUNCTUATION_TABLE)) for text in comments]

# Train the word-vector model (sg=0) with 300-dimensional vectors, a context
# window of 5, ignoring words seen fewer than 3 times.
# NOTE(review): with workers=4 the learned vectors are not bit-for-bit
# reproducible between runs — confirm whether that matters for this analysis.
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)

# Show the model summary (vocabulary size, vector size, learning rate).
print('模型参数：',model,'\n')

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [5]:
# Best-matching words: query the model with '点赞' and '不错' as positive
# terms and '难吃' as the negative term, then print the ranked matches.
best_matches = model.wv.most_similar(
    positive=['点赞', '不错'],
    negative=['难吃'],
)
print('最匹配的词是：', best_matches, '\n')

最匹配的词是： [('位置', 0.9540828466415405), ('好找', 0.9428838491439819), ('招待', 0.9424126744270325), ('高', 0.9363126754760742), ('团购', 0.9329617619514465), ('挺', 0.9301753044128418), ('价格', 0.9298818111419678), ('灯光', 0.9288555383682251), ('可以', 0.9278314709663391), ('性价比', 0.927315354347229)] 



In [6]:
# Semantic similarity between the words '推荐' and '好吃'.
similarity_score = model.wv.similarity('推荐', '好吃')
print('相似度为=', similarity_score, '\n')

相似度为= 0.81863433 



In [7]:
# Print the embedding vector stored for the word '地道'.
# Use subscript syntax on the keyed vectors rather than calling the
# __getitem__ dunder explicitly; the lookup performed is the same.
print(model.wv['地道'])

[ 1.35703348e-02  1.22677833e-01  1.38020869e-02  6.21715300e-02
 -4.47329879e-02 -8.41619894e-02  9.17025506e-02  2.77095944e-01
  1.20303482e-02 -4.68467064e-02 -7.79428380e-03 -1.12819500e-01
 -3.21396850e-02 -9.29615833e-03 -1.24600761e-01 -4.59714755e-02
  1.07373610e-01  3.52654862e-03  5.31495363e-02 -5.86814545e-02
 -4.93588038e-02 -9.47326142e-03  8.52976553e-03  3.02542616e-02
  7.25650415e-02 -4.07054611e-02 -1.54082224e-01  4.53116708e-02
 -3.05468868e-02 -1.05339810e-01  9.47830305e-02 -5.15808351e-02
  1.77773740e-02 -5.63142262e-03 -7.54123405e-02  1.94546673e-02
  7.68072754e-02 -1.48627937e-01  4.69528846e-02  3.09324376e-02
 -7.72314668e-02  2.54727621e-02  4.00686152e-02 -1.04017988e-01
  6.61017597e-02  1.05344057e-01  4.39798161e-02  2.57632579e-03
 -1.39479211e-03  1.02561019e-01  3.70883085e-02 -1.84263960e-02
 -2.95909662e-02  4.05327231e-02 -2.40861066e-02  1.02712564e-01
  5.30877635e-02  8.75907950e-03  3.17290463e-02  2.07202677e-02
 -5.20491377e-02 -2.97432

In [8]:
# Train a second model on the same corpus with sg=1 (Skip-Gram); every other
# hyper-parameter matches the first model so the two can be compared.
model_sg = Word2Vec(
    sentences=corpus,
    sg=1,
    vector_size=300,
    window=5,
    min_count=3,
    workers=4,
)
print('Skip-Gram 模型参数:', model_sg)

Skip-Gram 模型参数: Word2Vec<vocab=4036, vector_size=300, alpha=0.025>


In [9]:
# Fetch the Skip-Gram embedding for '环境', then show the vector itself
# followed by its shape.
skipgram_vectors = model_sg.wv
word_vector = skipgram_vectors['环境']
print("'环境'的词向量:\n", word_vector)
vector_shape = word_vector.shape
print("\n词向量形状:", vector_shape)

'环境'的词向量:
 [ 0.15315431  0.14262643 -0.03031068  0.23232795 -0.16332182 -0.1830963
  0.00384196  0.36959606 -0.29592523 -0.18213448  0.01841759 -0.09658574
  0.04179713  0.12806967 -0.25404412  0.01876115  0.4834244   0.18326269
  0.18925227 -0.3776035  -0.07657053 -0.19816615 -0.07336395  0.14512284
 -0.11772875  0.07928833 -0.062277    0.04419234 -0.04423282 -0.16473135
  0.20949122  0.05885816  0.08398551  0.16317149 -0.14948703  0.03620358
 -0.02681712 -0.2027599  -0.10531355 -0.1418368   0.05756729 -0.08234941
  0.16588287 -0.09137684  0.00554461  0.24998705 -0.03087207 -0.13904864
 -0.00593499  0.03606149 -0.01250828 -0.1371793  -0.17903632  0.14437746
 -0.05915244  0.13119099 -0.10220528 -0.22243343  0.02767742 -0.1701795
 -0.05168625  0.05734797  0.00217722  0.01875622 -0.16279069  0.17822647
  0.0245349  -0.11402038 -0.30296367 -0.05272479 -0.02026683 -0.10340034
 -0.06860677 -0.14360061  0.29509404  0.00906897  0.06509542  0.165376
 -0.12898435  0.02492286 -0.12682591 -0.0116

In [10]:
# Ask the Skip-Gram model for the three words closest to '好吃' and list
# each one with its similarity score rounded to four decimals.
similar_words = model_sg.wv.most_similar('好吃', topn=3)
print("与'好吃'最接近的3个词:")
for neighbor, score in similar_words:
    print(f"{neighbor}: {score:.4f}")

与'好吃'最接近的3个词:
棒: 0.8489
入味: 0.8446
好看: 0.8395


In [11]:
# Compare '好吃' against two other words with the Skip-Gram model and print
# both similarity scores to four decimals.
sim1, sim2 = (
    model_sg.wv.similarity('好吃', other_word)
    for other_word in ('美味', '蟑螂')
)
print(f"'好吃' vs '美味': {sim1:.4f}")
print(f"'好吃' vs '蟑螂': {sim2:.4f}")

'好吃' vs '美味': 0.8132
'好吃' vs '蟑螂': 0.2842


In [12]:
# Vector arithmetic on the Skip-Gram embeddings: '餐厅' and '聚会' are the
# positive terms, '安静' the negative term; only the single best match is
# requested and only its word (not its score) is printed.
result = model_sg.wv.most_similar(
    positive=['餐厅', '聚会'],
    negative=['安静'],
    topn=1,
)
top_word = result[0][0]
print("向量运算结果:", top_word)

向量运算结果: 四号
