# 数据预处理

### 数据读取

In [1]:
# 数据读取
with open("../data/nlp/corpus.zh", "r", encoding="utf-8") as f:
    data = f.read()
data[:20]

'第918(1994)号决议\n1994年5'

### 分词

In [2]:
import jieba

text = jieba.lcut(data, cut_all=False)
# 保存分词结果
with open('../data/nlp/jiebacut_corpus.txt', 'w', encoding='utf-8') as f:
    for word in text:
        f.write(word + " ")
text[:20]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\15710\AppData\Local\Temp\jieba.cache
Loading model cost 0.750 seconds.
Prefix dict has been built successfully.


['第',
 '918',
 '(',
 '1994',
 ')',
 '号',
 '决议',
 '\n',
 '1994',
 '年',
 '5',
 '月',
 '17',
 '日',
 '安全',
 '理事会',
 '第',
 '3377',
 '次',
 '会议']

In [3]:
# 读取jieba分词结果
with open("../data/nlp/jiebacut_corpus.txt", "r", encoding="utf-8") as f:
    jieba_text = f.read().split()
jieba_text[:10]

['第', '918', '(', '1994', ')', '号', '决议', '1994', '年', '5']

In [4]:
# pkuseg
import pkuseg
seg = pkuseg.pkuseg()
text = seg.cut(data)
# 保存分词结果
with open('../data/nlp/pkucut_corpus.txt', 'w', encoding='utf-8') as f:
    for word in text:
        f.write(word + " ")
text[:20]

['第918',
 '(',
 '1994',
 ')',
 '号',
 '决议',
 '1994年',
 '5月',
 '17日',
 '安全',
 '理事会',
 '第3377',
 '次',
 '会议',
 '通过',
 '安全',
 '理事会',
 '，',
 '重申',
 '其']

In [9]:
# 读取pkuseg分词结果
with open("../data/nlp/pkucut_corpus.txt", "r", encoding="utf-8") as f:
    pku_text = f.read().split()
pku_text[:20]

['第918',
 '(',
 '1994',
 ')',
 '号',
 '决议',
 '1994年',
 '5月',
 '17日',
 '安全',
 '理事会',
 '第3377',
 '次',
 '会议',
 '通过',
 '安全',
 '理事会',
 '，',
 '重申',
 '其']

### 词向量处理

In [11]:
# word2vec
from gensim.models import word2vec
#读取分词完的内容
sentences = word2vec.LineSentence('../data/nlp/jiebacut_corpus.txt')
#建立模型并保存
model = word2vec.Word2Vec(sentences, hs=1, min_count=0, window=10,
workers = 8, size = 48)
model.save("word2vec.model")

In [20]:
model.wv['灾害'], model.wv['灾害'].shape

(array([ 0.23652005,  1.9465765 , -1.1562902 ,  0.27276713,  1.5825741 ,
         0.39901152,  1.2242953 , -0.56943536, -2.7857995 ,  2.635947  ,
        -0.8032355 , -1.7854276 ,  2.1169271 ,  3.3969214 , -0.5720065 ,
        -3.4009426 , -0.0127127 ,  0.8965557 , -3.1056638 , -1.3232818 ,
         3.211486  , -0.20034076, -0.38924745, -0.62086755,  0.622817  ,
         0.24139549, -4.5820293 ,  2.6790357 , -1.2922032 ,  0.5360245 ,
        -0.50222313, -0.08984615,  1.426506  , -1.1321039 , -1.461951  ,
         2.7812805 , -0.4808427 ,  0.3624135 ,  0.0428284 , -0.70098644,
        -1.4316238 , -0.3896738 , -3.4013562 , -0.6208475 ,  0.10607259,
        -1.3858671 , -1.1950605 ,  3.2356174 ], dtype=float32),
 (48,))

In [15]:
model.wv.most_similar('灾害')

[('自然灾害', 0.7891435623168945),
 ('洪涝', 0.7375470995903015),
 ('紧急状况', 0.7248141765594482),
 ('灾难', 0.7082797288894653),
 ('预警系统', 0.7012507319450378),
 ('预防', 0.6894747614860535),
 ('危机', 0.6718776226043701),
 ('紧急情况', 0.669623613357544),
 ('旱灾', 0.6694502830505371),
 ('预警', 0.6675460338592529)]

In [47]:
# word2vec
from gensim.models import word2vec
#读取分词完的内容
pku_sentences = word2vec.LineSentence('../data/nlp/pkucut_corpus.txt')
#建立模型并保存
pku_model = word2vec.Word2Vec(pku_sentences, hs=1, min_count=0, window=15,
workers = 12, size =100) 
pku_model.save("pku_word2vec.model")

In [48]:
pku_model.wv.most_similar('灾害')

[('自然灾害', 0.6323012113571167),
 ('灾难', 0.583943784236908),
 ('备灾', 0.5665409564971924),
 ('减灾', 0.5581703186035156),
 ('资源', 0.5259444713592529),
 ('资源部', 0.5209581851959229),
 ('PERES', 0.5066176652908325),
 ('地震', 0.5033663511276245),
 ('灾情', 0.49321162700653076),
 ('风险', 0.48520219326019287)]

### 四个关联词汇

In [36]:
# a - b + c = d
result = pku_model.wv.most_similar(positive=['领导人', '国家'], negative=['资金'])
most_similar_key, similarity = result[0]
print(f"{most_similar_key}: {similarity:.4f}")

反对党: 0.7027


### 计算句子语义向量

In [73]:
def avgPool(model, wordlist):
    N = len(wordlist)
    Emb = np.zeros_like(model.wv["灾害"])
    for word in wordlist:
        if word == "":
            N -= 1
            continue
        Emb += model.wv[word]
    Emb /= N
    return Emb

In [41]:
sample_Sentense = ['重申', '其', '以往', '关于', '卢旺达', '局势', '的', '所有', '决议']
Emb = avgPool(model, sample_Sentense)
Emb

array([-0.10874759, -1.5761362 ,  0.9564036 ,  1.2508222 , -0.9867184 ,
        0.12698399, -1.0839667 ,  0.6916187 , -0.7233998 ,  1.7226443 ,
       -0.5590294 , -0.68959093,  0.14335287,  0.06274685, -0.29186594,
        0.4385254 ,  0.75533104,  0.62798214, -1.3407363 , -1.3550838 ,
       -0.16126677, -0.1541384 ,  0.33694676, -0.2889706 , -0.6018766 ,
        0.3219061 , -0.09348576, -0.03662695, -0.19886282,  0.98672336,
       -0.94969624, -0.34999803,  1.7753211 , -2.037934  , -0.36911476,
       -0.28571108,  0.45379665,  1.7084594 , -0.22545439, -0.20104669,
        0.6365651 ,  0.51377445,  1.1794941 , -0.16202115, -0.5112079 ,
       -0.3655927 , -1.6378613 ,  0.46409267], dtype=float32)

In [56]:
# 文档分句
import re
# 获取文档
with open("../data/nlp/corpus.zh", "r", encoding="utf-8") as f:
    doc = f.read()
    
seg = pkuseg.pkuseg(model_name='news')

# 分句
sentences = re.split('。|！|\!|\.|？|\?|,|，|;|；', doc)
cut_sentences = []

# 对每一句分词
for s in sentences:
    cut_sentences.append(seg.cut(s))
cut_sentences[0]

['第918(', '1994)号', '决议', '1994年5月17日', '安全理事会', '第3377次', '会议', '通过', '安全理事会']

In [68]:
# 去除标点符号
from tqdm import tqdm
remove_chars = '[·’!"\#$%&\'()＃！（）*+,-./:;<=>?\@，：?￥★、…．＞【】［］《》？“”‘’\[\\]^_`{|}~]+'
with open('../data/nlp/pkucut_corpus.txt', 'w', encoding='utf-8') as f:
    for i in tqdm(range(len(cut_sentences))):
        for j in range(len(cut_sentences[i])):
            cut_sentences[i][j] = re.sub(remove_chars, "", cut_sentences[i][j])
            f.write(cut_sentences[i][j] + " ")
cut_sentences[0]

100%|███████████████████████████████████████████████████████████| 1064417/1064417 [00:21<00:00, 50379.92it/s]


['第918', '1994号', '决议', '1994年5月17日', '安全理事会', '第3377次', '会议', '通过', '安全理事会']

In [69]:
#读取分词完的内容
pku_sentences = word2vec.LineSentence('../data/nlp/pkucut_corpus.txt')
#建立模型并保存
pku_model = word2vec.Word2Vec(pku_sentences, hs=1, min_count=0, window=15,
workers = 12, size =100) 
pku_model.save("pku_word2vec.model")

In [74]:
from numpy import dot
from numpy.linalg import norm

# 计算余弦相似度
def cal_cos_similarity(a, b):
    return (dot(a, b)/(norm(a)*norm(b)))
# 样本句子
sample = seg.cut("中华人民共和国中央人民政府今天成立了")
sample_v = avgPool(pku_model, sample)
print(sample)
# 找出分好的句子中，句向量与样本句余弦相似度最大的句子索引max_index
max_index = 0
max_similarity = 0
current_index = 0
for s in tqdm(cut_sentences):
#     print(s)
    current_similarity = cal_cos_similarity(avgPool(pku_model, s), sample_v)
    if current_similarity > max_similarity:
        max_index = current_index
        max_similarity = current_similarity
    current_index += 1
print(max_index)

['中华人民共和国', '中央', '人民政府', '今天', '成立', '了']


  if __name__ == '__main__':
100%|███████████████████████████████████████████████████████████| 1064417/1064417 [01:09<00:00, 15371.88it/s]

347028





In [75]:
# 输出余弦相似度最为相近的句子
cut_sentences[max_index]

['中央政府', '成立', '了', '全国就业委员会']