# 调库

In [1]:
import json
# 遍历文档用的
import os
from rake_nltk import Rake
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# 提取关键词用的
import yake
# 将中文符号转换成英文符号
import unicodedata

# 预处理

In [2]:
def json_print(text):
    # 格式化输出，缩进4个单位
    print(json.dumps(text, sort_keys = True, indent = 4))

In [3]:
def read_data_from_dir(dirname):
    # 文档内容集合
    text = []
    # 遍历文档
    for root, dirs, files in os.walk(dirname):
        for file in files:
            # print(file)
            # 获取文件名
            filename = os.path.splitext(file)[0]
            # 读取文件
            content = json.load(open(root + '/' + file, 'r', encoding = 'utf-8-sig'))
            # 文件名添加到文件中，方便后续生成中间件
            content['FileName'] = filename
            text.append(content)
        
    return text

In [22]:
# 读取新闻数据，json格式
text = read_data_from_dir('news_1')

In [23]:
# 获取词向量
# 该词向量文件形式为：词 空格 词向量，然后换行
# 从http://nlp.stanford.edu/data/glove.6B.zip获取GloVe
word_embeddings = {}
GLOVE_DIR = 'glove.6B.100d.txt'
with open(GLOVE_DIR, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        word_embeddings[word] = coefs

# 提取关键词

In [5]:
def yake_it(text):
    language = "en"
    max_ngram_size = 3
    deduplication_thresold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 10

    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    return keywords

In [6]:
# 将每一篇的关键短语都提取出来
def extract_key_phrases_from_doc(docs):
    doc_phrases, phrases_list = [], []
    for doc in docs:
        doc['Text'] = unicodedata.normalize('NFKC', doc['Text'])
        key_phrases_dict = yake_it(doc['Text'])
        key_phrases_list = []
        for tur in key_phrases_dict:
            key_phrases_list.append(tur[0])
            phrases_list.append(tur[0])
        doc_phrases.append(key_phrases_list)
    return doc_phrases, phrases_list

In [9]:
# 转换出（短语，编号）的字典
def list_to_dict(phrases):
    phrases = list(set(phrases))
    num_phrases = len(phrases)
    phrases_dict = {}
    for i in range(num_phrases):
        phrases_dict[phrases[i]] = i
    return phrases_dict

In [12]:
doc_phrases, phrases_list = extract_key_phrases_from_doc(text[:50])
phrases_dict = list_to_dict(phrases_list)

# 计算共现矩阵

In [33]:
# 计算共现
num_phrases = len(phrases_list)
co_occurance = [[0] * num_phrases for _ in range(num_phrases)]
for doc in doc_phrases:
    for i in doc:
        for j in doc:
            if i != j:
                co_occurance[phrases_dict[i]][phrases_dict[j]] += 1
                co_occurance[phrases_dict[j]][phrases_dict[i]] += 1

# 计算相似度矩阵

In [24]:
def phrase_glove(phrase):
    words = phrase.split()
    vector = np.array([0.0 for _ in range(len(word_embeddings['hello']))], dtype = 'float32')
    bias = 0
    for word in words:
        if word not in word_embeddings:
            bias += 1
            continue
        vector += word_embeddings[word]
    if bias == len(words):
        return vector
    return vector / (len(words) - bias)

In [45]:
# 余弦相似度
def CosineSimilarity(x, y):
    sqrt_x, sqrt_y = np.sqrt((x ** 2).sum()), np.sqrt((y ** 2).sum())
    if sqrt_x == 0 or sqrt_y == 0:
        return 0.0
    return (x * y).sum() / (sqrt_x * sqrt_y)

In [46]:
similarity = np.zeros((num_phrases, num_phrases))
for i in phrases_list:
    for j in phrases_list:
        if i != j:
            simi = CosineSimilarity(phrase_glove(i), phrase_glove(j))
            similarity[phrases_dict[i]][phrases_dict[j]] = simi
            similarity[phrases_dict[j]][phrases_dict[i]] = simi

# 计算最终相似度

In [49]:
# 最终相似度
sim = np.zeros((num_phrases, num_phrases))
# 计算共现的时候重复统计了
co = np.array(co_occurance)
co = co / 2
for i in phrases_list:
    for j in phrases_list:
        if i != j:
            simi = CosineSimilarity(co[phrases_dict[i]], co[phrases_dict[j]])
            sim[phrases_dict[i]][phrases_dict[j]] = simi
            sim[phrases_dict[j]][phrases_dict[i]] = simi

In [51]:
alpha = 0.2
sim = sim * alpha + similarity * (1 - alpha)

In [53]:
sim[0], similarity[0]

(array([ 0.        ,  0.24336648,  0.04202476,  0.00211336,  0.15895565,
         0.27523818,  0.31652367,  0.23055151, -0.03053458,  0.21129603,
         0.17362046,  0.2029398 ,  0.3345742 ,  0.07015957,  0.3426125 ,
         0.94090367,  0.07438333,  0.11778713,  0.34272827,  0.28457   ,
         0.26726246,  0.26264063,  0.16363585,  0.28741751,  0.29638   ,
         0.18000197,  0.10975375,  0.28848507,  0.50481849, -0.07842185,
         0.31643256,  0.18451941,  0.19753032,  0.21848681,  0.20729039,
         0.17187773,  0.11662841,  0.12688725,  0.40858231,  0.3127867 ,
         0.36048775,  0.37489405,  0.42576275,  0.32868516,  0.18664904,
         0.00360747,  0.44157858,  0.22257433,  0.30735629,  0.235708  ,
         0.08348819,  0.28527911,  0.29953525,  0.23330133,  0.30381281,
         0.19800456,  0.13729062,  0.07517336,  0.26457062,  0.26356158,
         0.26469834,  0.32673757,  0.33795412,  0.64879923,  0.34408746,
         0.32868342,  0.32827899,  0.19490544,  0.1