# 调库

In [1]:
import json
# 遍历文档用的
import os
import numpy as np
import pandas as pd
# 提取关键词用的
import yake
# 将中文符号转换成英文符号
import unicodedata
import warnings
warnings.filterwarnings("ignore")

# 预处理

In [5]:
# Load pre-trained GloVe word vectors into a {word: vector} dictionary.
# Each line of the file has the form "<word> <v1> <v2> ... <vN>".
# The file read below (glove.840B.300d.txt) is distributed at
# http://nlp.stanford.edu/data/glove.840B.300d.zip
word_embeddings = {}
GLOVE_DIR = 'glove.840B.300d.txt'
with open(GLOVE_DIR, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        try:
            word = values[0]
            coefs = np.asarray(values[1:], dtype = 'float32')
        except (IndexError, ValueError):
            # IndexError: blank line. ValueError: a field after the first is
            # not numeric (e.g. the token itself contains whitespace).
            # Such lines are skipped, as before, but other errors now surface.
            continue
        word_embeddings[word] = coefs

In [6]:
def json_print(text):
    """Print ``text`` as JSON with sorted keys and a 4-space indent."""
    formatted = json.dumps(text, indent = 4, sort_keys = True)
    print(formatted)

In [7]:
def read_data_from_dir(dirname):
    """Load every JSON document found under ``dirname`` (recursively).

    Each file is parsed as JSON (UTF-8, optional BOM) and must contain at
    least the keys ``'Text'`` and ``'Headline'``. For every document:

    * ``'FileName'`` is set to the file name without its extension, so later
      steps can refer back to the source file;
    * ``'Text'`` is NFKC-normalised and lower-cased;
    * ``'Headline'`` is NFKC-normalised and the ' – Manila Bulletin' suffix
      is removed.

    Args:
        dirname: Root directory to walk.

    Returns:
        A list of the parsed (and augmented) document dictionaries.
    """
    # Collected document contents
    text = []
    for root, dirs, files in os.walk(dirname):
        for file in files:
            filename = os.path.splitext(file)[0]
            # Use a context manager so the file handle is always closed.
            with open(os.path.join(root, file), 'r', encoding = 'utf-8-sig') as fp:
                content = json.load(fp)
            content['FileName'] = filename
            content['Text'] = unicodedata.normalize('NFKC', content['Text']).lower()
            headline = unicodedata.normalize('NFKC', content['Headline'])
            # str.replace returns a new string; the result must be stored.
            content['Headline'] = headline.replace(' – Manila Bulletin', '')
            text.append(content)

    return text

In [8]:
# Read the news data (JSON files) and put it into a DataFrame.
text = read_data_from_dir('news_1')
data = pd.DataFrame(text)
# The result of drop() is not assigned, so this only displays a view of the
# table without 'Type'; `data` itself is left unchanged.
data.drop('Type', axis=1)

Unnamed: 0,Time,Headline,Text,Section,Writers,URL,MainKeyWord,AdditionalKeyWord,Source,FileName
0,2018-03-22,China says to have ‘prudent’ oil exploration w...,By ReutersChina will prudently advance coopera...,,,https://mb.com.ph/2018/03/22/china-says-to-hav...,China,Philippines,MB,0056d0f89b5c4e3f439700cd9ad227a4
1,2020-06-10,PH to build more cellular sites on Pagasa Isla...,By Martin SadongdongAfter receiving several “w...,,,https://mb.com.ph/2020/06/10/ph-to-build-more-...,China,viet Nam,MB,008661b734849123aea3b047872b56c1
2,2020-03-30,March 30 coronavirus news,Our live coverage of the coronavirus pandemic ...,world,"[Amy Woodyatt, Julia Hollingsworth, Ben Westco...",https://www.cnn.com/world/live-news/coronaviru...,Hong Kong,Philippines,CNN,00969f304b601889e5e8c7ef6cc794e7
3,2018-06-17,Malaysia power shift hits China infrastructure...,By Agence France-PresseMalaysia was once a loy...,,,https://mb.com.ph/2018/06/17/malaysia-power-sh...,China,Malaysia,MB,00a66477d6b91987e00affd4ca3f7ff9
4,2020-06-08,"5 things to know for June 8: George Floyd, pol...",Feeling hopeless? Burnt out? You're not alone....,us,[AJ Willingham],https://www.cnn.com/2020/06/08/us/five-things-...,China,Malaysia,CNN,00b596b867e63662c66ebfbb09da2020
...,...,...,...,...,...,...,...,...,...,...
978,2020-05-04,May 4 coronavirus news,Our live coverage of the coronavirus pandemic ...,world,"[Ben Westcott, Adam Renton]",https://www.cnn.com/world/live-news/coronaviru...,China,Malaysia,CNN,fe8157d39ff8c8d3cf3e248159a85b0d
979,2019-01-10,Surgeries in Mexico linked to antibiotic-resis...,Nearly a dozen Americans who had surgery in Ti...,health,[Sandee LaMotte],https://www.cnn.com/2019/01/10/health/mexico-s...,Taiwan,Thailand,CNN,fef8577dfb7dd78f1e0d0d6c150c0fec
980,2020-06-15,Philippines journalist Maria Ressa found guilt...,Embattled Philippines journalist Maria Ressa w...,media,[James Griffiths],https://www.cnn.com/2020/06/14/asia/maria-ress...,Hong Kong,Southeast Asia,CNN,ff06c9d4421059c3bb27b330fc70d88d
981,2021-02-19,Jack Ma's Ant Group was the next big thing. No...,Jack Ma's Ant Group quickly became one of Chin...,tech,[Laura He],https://www.cnn.com/2021/02/19/tech/ant-group-...,Hong Kong,Singapore,CNN,ff5e31eed953acf427da23e7929ee72d


# 提取关键词

In [9]:
def yake_it(text):
    """Run a YAKE keyword extractor over ``text`` and return its result.

    The extractor is configured for English, n-grams of up to 3 words,
    'seqm' de-duplication with limit 0.9, window size 1 and the top 10
    keywords. The return value is whatever ``extract_keywords`` yields
    (callers in this file read element ``[0]`` of each entry as the phrase).
    """
    extractor = yake.KeywordExtractor(
        lan="en",
        n=3,
        dedupLim=0.9,
        dedupFunc='seqm',
        windowsSize=1,
        top=10,
        features=None,
    )
    return extractor.extract_keywords(text)

In [10]:
# Extract the key phrases of every document
def extract_key_phrases_from_doc(docs):
    """Collect key phrases per document and across all documents.

    Args:
        docs: Iterable of mappings that each provide a ``'Text'`` entry.

    Returns:
        ``(doc_phrases, phrases_list)`` where ``doc_phrases`` holds one list
        of phrases per document and ``phrases_list`` is the flat
        concatenation of all of them (duplicates kept).
    """
    doc_phrases, phrases_list = [], []
    for doc in docs:
        # Each extractor entry carries the phrase in position 0.
        current = [entry[0] for entry in yake_it(doc['Text'])]
        phrases_list.extend(current)
        doc_phrases.append(current)
    return doc_phrases, phrases_list

In [11]:
# Build the {phrase: id} dictionary
def list_to_dict(phrases):
    """Map every distinct phrase to a consecutive integer id.

    Ids are assigned in order of first appearance, so the mapping is
    reproducible between runs (iterating a ``set`` of strings is not).

    Args:
        phrases: Iterable of hashable phrases; duplicates are allowed.

    Returns:
        A dict mapping each distinct phrase to an id in ``0 .. n-1``.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return {phrase: idx for idx, phrase in enumerate(dict.fromkeys(phrases))}

In [12]:
# Extract key phrases from the first 50 documents, then give every distinct
# phrase an integer id.
doc_phrases, phrases_list = extract_key_phrases_from_doc(text[:50])
phrases_dict = list_to_dict(phrases_list)

In [13]:
# Number of distinct phrases vs. total extracted phrases (duplicates included).
len(phrases_dict), len(phrases_list)

(408, 500)

In [35]:
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
# Train a small Word2Vec model on the first document only.
raw_sentences = text[0]['Text']
# Strip everything except word characters and spaces. A raw string avoids
# the invalid-escape warning that '\w' triggers in a normal string literal.
raw_sentences = re.sub(r'[^\w ]', '', raw_sentences)
stop_words = set(stopwords.words('english'))
sentences = word_tokenize(raw_sentences)
# Drop English stop words.
filtered_sentence = [w for w in sentences if w not in stop_words]
print(filtered_sentence)
# NOTE(review): `size` is the keyword used by gensim releases before 4.0 — confirm the installed version.
model = word2vec.Word2Vec([filtered_sentence], min_count=0, size=300)

['By', 'ReutersChina', 'prudently', 'advance', 'cooperation', 'Philippines', 'joint', 'oil', 'gas', 'exploration', 'South', 'China', 'Sea', 'Chinas', 'top', 'diplomat', 'State', 'Councillor', 'Wang', 'Yi', 'said', 'Wednesday', 'meeting', 'Philippine', 'counterpartAny', 'potential', 'deals', 'Manila', 'Beijing', 'energy', 'exploration', 'disputed', 'waterway', 'agreed', 'company', 'Chinese', 'government', 'senior', 'Philippine', 'official', 'said', 'earlier', 'monthChina', 'claims', 'South', 'China', 'Sea', 'key', 'trade', 'route', 'home', 'areas', 'believed', 'hold', 'large', 'quantities', 'oil', 'natural', 'gas', 'Along', 'China', 'parts', 'South', 'China', 'Sea', 'subject', 'competing', 'claims', 'Brunei', 'Malaysia', 'Taiwan', 'Vietnam', 'PhilippinesThe', 'two', 'countries', 'February', 'agreed', 'set', 'special', 'panel', 'work', 'jointly', 'explore', 'offshore', 'oil', 'gas', 'areas', 'sides', 'claim', 'without', 'needing', 'address', 'touchy', 'issue', 'sovereigntySpeaking', 'rep

In [39]:
# Sanity check: show the vector stored for 'Taiwan'.
word_embeddings['Taiwan']

array([-1.3169e-01, -4.1897e-05,  2.1975e-01, -5.4363e-02,  3.4959e-01,
       -5.4818e-01, -1.8401e-01, -1.0084e-01, -3.6446e-01,  1.4349e+00,
       -3.2335e-01,  2.6324e-02, -2.8534e-01, -3.0921e-01,  1.8766e-01,
        4.1917e-01,  1.6906e-01,  8.3040e-01, -6.2310e-02,  1.3154e+00,
        2.3716e-01,  7.7824e-01, -1.6357e-01,  1.9785e-02, -2.2135e-01,
        9.2446e-02,  3.4851e-01, -2.5466e-01, -3.9799e-01, -4.6014e-01,
        7.7712e-02, -1.6014e-01, -2.1430e-01,  1.3029e-01, -3.6807e-01,
        1.4595e-01,  4.2635e-01, -1.2070e-01,  1.6188e-01, -3.2804e-01,
       -6.2205e-01, -3.3889e-01,  2.5716e-01,  5.3756e-01, -9.0782e-02,
       -2.4842e-01, -2.2408e-01, -5.8552e-01, -3.0836e-01, -4.4198e-01,
       -3.0276e-01, -5.5635e-02, -4.5307e-01, -1.6125e-01, -4.1672e-01,
        5.9773e-02,  2.3015e-01, -1.8052e-01,  3.4047e-01, -1.0978e-01,
       -5.3616e-01, -2.4845e-01,  9.2782e-02, -2.0760e-02, -1.0390e-02,
        1.3451e-02, -3.3654e-01,  9.5273e-01, -3.5308e-01, -3.00

# 计算共现矩阵

In [11]:
# Compute the phrase co-occurrence matrix: entry [a, b] counts how often
# phrases a and b were extracted from the same document.
num_phrases = len(phrases_dict)
# Allocate directly as a 2-D integer array (also correct when there are no phrases).
co_occurance = np.zeros((num_phrases, num_phrases), dtype=int)
for doc in doc_phrases:
    for i in doc:
        for j in doc:
            if i != j:
                # Both orderings (i, j) and (j, i) are visited and each visit
                # updates both cells, so every pair is counted twice per
                # document; the later `co / 2` step relies on this.
                co_occurance[phrases_dict[i], phrases_dict[j]] += 1
                co_occurance[phrases_dict[j], phrases_dict[i]] += 1
co_occurance.shape

(5497, 5497)

# 计算相似度矩阵

In [12]:
def phrase_glove(phrase, embeddings=None):
    """Return the mean embedding of the known words in ``phrase``.

    Args:
        phrase: Whitespace-separated words.
        embeddings: Optional ``{word: vector}`` mapping. Defaults to the
            module-level ``word_embeddings`` table.

    Returns:
        A float32 vector: the average of the vectors of all words found in
        the table, or an all-zero vector when none of the words is known
        (including the empty phrase).

    Raises:
        ValueError: If the embedding table is empty, because the vector
            dimension cannot be determined.
    """
    if embeddings is None:
        embeddings = word_embeddings
    if not embeddings:
        raise ValueError("embedding table is empty")
    # Take the dimension from any stored vector instead of relying on one
    # specific word being present in the table.
    dim = len(next(iter(embeddings.values())))
    vector = np.zeros(dim, dtype = 'float32')
    found = 0
    for word in phrase.split():
        coefs = embeddings.get(word)
        if coefs is None:
            # Out-of-vocabulary words do not contribute to the average.
            continue
        vector += coefs
        found += 1
    if found == 0:
        return vector
    return vector / found

In [13]:
# Cosine similarity
def CosineSimilarity(x, y):
    """Return the cosine of the angle between arrays ``x`` and ``y``.

    Returns 0.0 when either vector has zero length.
    """
    norm_x = np.sqrt(np.sum(x ** 2))
    norm_y = np.sqrt(np.sum(y ** 2))
    if norm_x != 0 and norm_y != 0:
        return np.sum(x * y) / (norm_x * norm_y)
    return 0.0

In [None]:
# Pairwise GloVe-based similarity between phrases (diagonal stays 0).
sim_glove = np.zeros((num_phrases, num_phrases))
# Embed every distinct phrase once instead of once per pair.
phrase_vectors = {phrase: phrase_glove(phrase) for phrase in phrases_dict}
unique_phrases = list(phrases_dict)
# The measure is symmetric, so each unordered pair is computed once and
# written to both cells.
for pos, i in enumerate(unique_phrases):
    for j in unique_phrases[pos + 1:]:
        simi = CosineSimilarity(phrase_vectors[i], phrase_vectors[j])
        sim_glove[phrases_dict[i]][phrases_dict[j]] = simi
        sim_glove[phrases_dict[j]][phrases_dict[i]] = simi
sim_glove.shape

# 计算最终相似度

In [None]:
def normalization(data):
    """Min-max scale ``data`` to the range [0, 1].

    Args:
        data: Array-like of numbers (must be non-empty).

    Returns:
        An array of the same shape with the minimum mapped to 0 and the
        maximum mapped to 1. When all values are equal the result is all
        zeros rather than NaN from a division by zero.
    """
    data = np.asarray(data)
    low = np.min(data)
    _range = np.max(data) - low
    if _range == 0:
        return np.zeros(data.shape, dtype=float)
    return (data - low) / _range

In [None]:
# Co-occurrence-based similarity: cosine similarity between the normalised
# co-occurrence rows of two phrases (diagonal stays 0).
sim_final = np.zeros((num_phrases, num_phrases))
# The co-occurrence counts were accumulated twice per pair, hence the halving.
co = np.array(co_occurance)
co = normalization(co/2)
unique_phrases = list(phrases_dict)
# The measure is symmetric, so each unordered pair of distinct phrases is
# computed once and written to both cells.
for pos, i in enumerate(unique_phrases):
    row_i = co[phrases_dict[i]]
    for j in unique_phrases[pos + 1:]:
        simi = CosineSimilarity(row_i, co[phrases_dict[j]])
        sim_final[phrases_dict[i]][phrases_dict[j]] = simi
        sim_final[phrases_dict[j]][phrases_dict[i]] = simi
sim_final.shape

In [None]:
# Blend the co-occurrence similarity with the GloVe similarity, weighted by alpha.
# NOTE(review): sim_final is overwritten, so re-running this cell blends again.
alpha = 0.5
sim_final = sim_final * alpha + sim_glove * (1 - alpha)

# SpectralClustering聚类

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Try 10..19 clusters and keep the labelling with the highest
# Calinski-Harabasz score.
# NOTE(review): sim_final is a similarity matrix but is passed here as a
# feature matrix (affinity='nearest_neighbors'); confirm this is intended.
n_clusters, ch_max = 0, 0.0
cluster = None

for num in range(10, 20):
    labels = SpectralClustering(n_clusters=num, affinity='nearest_neighbors', n_neighbors=5).fit_predict(sim_final)
    CH = metrics.calinski_harabasz_score(sim_final, labels)
    if CH > ch_max:
        ch_max = CH
        n_clusters = num
        # Keep the labels that were actually scored: the model is not seeded,
        # so fitting again could produce a different assignment.
        cluster = labels
# Show the chosen labelling, its cluster count and its (best) score.
cluster, n_clusters, ch_max

In [None]:
# One row per phrase (index = phrase text, column 'id' = its integer id),
# plus the cluster label of each phrase.
# NOTE(review): labels are attached by position — assumes the row order
# matches the ids used to index sim_final; confirm.
phrases = pd.DataFrame.from_dict(phrases_dict, orient='index', columns=['id'])
phrases['clusters'] = cluster
phrases

In [None]:
# Cluster label of the phrase 'hate' (fails if that phrase was not extracted).
phrases.loc['hate'].clusters

In [None]:
# Shape of the per-document phrase lists viewed as an array.
# NOTE(review): only rectangular if every document yielded the same number of phrases.
np.array(doc_phrases).shape

In [None]:
# For every cluster, collect the indices of the documents that contributed
# at least one phrase to it (each document at most once per cluster).
article_cluster = [[] for _ in range(n_clusters)]
# Iterate over the documents actually processed instead of a fixed count of 50.
for i, doc in enumerate(doc_phrases):
    for j in doc:
        members = article_cluster[phrases.loc[j].clusters]
        if i not in members:
            members.append(i)
article_cluster[0]

# sumy文本摘要

In [None]:
# 文本摘要
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer as Summarizer

# Total number of sentences to output in a summary
SENTENCES_COUNT = 10

def plainTextSummary(data,language):
    """
    Summarise plain text and print each selected sentence.

    Args:
        data: Text to summarise.
        language: Language name passed to the tokenizer, stemmer and
            stop-word list.

    Prints up to SENTENCES_COUNT sentences; returns nothing.
    """
    parser = PlaintextParser.from_string(data, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    selected = summarizer(parser.document, SENTENCES_COUNT)
    for sentence in selected:
        print(sentence)

# Join the headlines of cluster 0 into one text. The summariser takes a
# single string, so the text is passed as-is rather than wrapped in a list.
data_to_sumy = '. '.join(data.iloc[article_cluster[0]].Headline)

plainTextSummary(data_to_sumy,'english')
print('='*80)

In [None]:
# Headlines of the articles in the cluster at index 9.
data.iloc[article_cluster[9]].Headline

In [None]:
# All phrases whose cluster label is 0.
phrases.loc[phrases.clusters == 0]

# gensim文本摘要

In [None]:
# Headlines of cluster 0 joined into one '. '-separated string.
'. '.join(x for x in list(data.iloc[article_cluster[0]].Headline))

In [None]:
from gensim.summarization import summarize
# Summarise the joined headlines of cluster 0, requesting a list of
# sentences (split=True) limited by word_count=10.
result = summarize('. '.join(x for x in list(data.iloc[article_cluster[0]].Headline)), split=True, word_count=10)
result

In [None]:
import numpy as np
# Environment check: installed NumPy version.
np.__version__

In [None]:
import gensim
# Environment check: show the gensim package help.
help(gensim)

# Seq2Seq文本摘要

In [None]:
import pickle
import re
from pathlib import Path

import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize

from pre_trained.model import Model


def clean_str(sentence):
    """Replace every run of '#' and/or '.' characters in ``sentence`` with a single '#'."""
    return re.compile("[#.]+").sub("#", sentence)


def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield aligned ``(inputs, outputs)`` mini-batches for several epochs.

    Both sequences are converted to arrays and sliced with the same index
    range; the last batch of an epoch may be shorter than ``batch_size``.

    Args:
        inputs: Sequence of samples.
        outputs: Sequence of targets, sliced with the same indices.
        batch_size: Maximum number of samples per batch.
        num_epochs: Number of passes over the data.

    Yields:
        Tuples ``(input_batch, output_batch)`` of arrays.
    """
    inputs, outputs = np.array(inputs), np.array(outputs)
    total = len(inputs)

    # Ceiling division: number of batches needed to cover all samples.
    batches_per_epoch = (total - 1) // batch_size + 1
    for _ in range(num_epochs):
        for batch_num in range(batches_per_epoch):
            lo = batch_num * batch_size
            hi = min(lo + batch_size, total)
            yield inputs[lo:hi], outputs[lo:hi]


class SumGen:
    """Generate short summaries with the pre-trained seq2seq model."""

    def __init__(self, model_dir="."):
        """
        Load the saved model arguments and the vocabulary.

        Args:
            model_dir: Directory holding the model files; it must contain
                the files extracted from the ``pre_trained`` archive.
        """
        self.model_dir = model_dir
        # NOTE: pickle.load can execute arbitrary code from the file — only
        # point model_dir at trusted model files.
        with Path(model_dir, "pre_trained/args.pickle").open("rb") as f:
            self.args = pickle.load(f)

        print("Loading dictionary...")
        with Path(model_dir, "pre_trained/word_dict.pickle").open("rb") as f:
            self.word_dict = pickle.load(f)
        # Inverse vocabulary: id -> word.
        self.reversed_dict = dict(zip(self.word_dict.values(), self.word_dict.keys()))
        self.article_max_len = 50
        self.summary_max_len = 15

    def build_dataset(self, raw_articles: list) -> list:
        """Convert raw documents into fixed-length lists of word ids.

        Each document is stripped, cleaned with ``clean_str``, tokenised,
        mapped to ids (unknown words become the ``"<unk>"`` id), truncated
        to ``article_max_len`` and right-padded with the ``"<padding>"`` id.

        Args:
            raw_articles: A list containing all documents (strings).

        Returns:
            One list of ``article_max_len`` word ids per document.
        """
        article_list = [clean_str(x.strip()) for x in raw_articles]
        x = [word_tokenize(d) for d in article_list]
        x = [[self.word_dict.get(w, self.word_dict["<unk>"]) for w in d] for d in x]
        x = [d[:self.article_max_len] for d in x]
        x = [d + (self.article_max_len - len(d)) * [self.word_dict["<padding>"]] for d in x]

        return x

    def generate(self, valid_x: list) -> list:
        """Generate a summary for every sample.

        Args:
            valid_x: Dataset produced by ``build_dataset``.

        Returns:
            A list with the generated title of each sample.
        """
        results = []
        # Use the same padding id that build_dataset pads with, instead of
        # assuming it is 0.
        pad_id = self.word_dict["<padding>"]
        with tf.Session() as sess:
            # Load the model
            print("Loading saved model...")
            model = Model(self.reversed_dict, self.article_max_len, self.summary_max_len, self.args, forward_only=True)
            saver = tf.train.Saver(tf.global_variables())
            ckpt = tf.train.get_checkpoint_state(Path(self.model_dir, "pre_trained/saved_model/").as_posix())
            saver.restore(sess, ckpt.model_checkpoint_path)

            # Targets are not needed for inference; zeros act as placeholders.
            batches = batch_iter(valid_x, [0] * len(valid_x), self.args.batch_size, 1)

            print("Generating...")
            for batch_x, _ in batches:
                # Length of each sample without its padding.
                batch_x_len = [sum(1 for token in row if token != pad_id) for row in batch_x]

                valid_feed_dict = {
                    model.batch_size: len(batch_x),
                    model.X: batch_x,
                    model.X_len: batch_x_len,
                }

                prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
                prediction_output = [[self.reversed_dict[y] for y in x] for x in prediction[:, 0, :]]

                for line in prediction_output:
                    summary = []
                    for word in line:
                        # Stop at the end-of-sentence marker.
                        if word == "</s>":
                            break
                        # Skip words that were already emitted.
                        if word not in summary:
                            summary.append(word)
                    results.append(" ".join(summary))

        return results

In [None]:
# Sample input: one document made of three headlines.
dataset = ['February 4 coronavirus news. November 1 coronavirus news. July 5 coronavirus news.']
dataset

In [None]:
# Reset the default TensorFlow graph, then encode the sample and generate its summary.
tf.reset_default_graph()
sg = SumGen()
valid_x = sg.build_dataset(dataset)
sg.generate(valid_x)

In [None]:
import tensorflow as tf
# Environment check: installed TensorFlow version.
tf.__version__

# 热度指数

考虑时间差，同一类文章数，同一类文章中不同的来源数

In [None]:
# Display the news DataFrame.
data

In [None]:
# Store the per-cluster lists of article indices in a 1-D object array.
# Clusters usually differ in size, and np.array() on such ragged nested
# lists is rejected by recent NumPy releases, so the array is filled
# element by element.
_ragged = np.empty(len(article_cluster), dtype=object)
for _k, _members in enumerate(article_cluster):
    _ragged[_k] = list(_members)
article_cluster = _ragged
article_cluster

In [None]:
import datetime
# Reference "now"; later cells subtract article dates from it.
date = datetime.datetime.now()
date.day

In [None]:
# Scratch: parse a sample 'YYYY-MM-DD' date string.
new_date = '2019-01-10'
new_date = datetime.datetime.strptime(new_date, '%Y-%m-%d')
new_date

In [None]:
# Scratch: time elapsed between the sample date and now.
date - new_date

In [None]:
# Parse the 'Time' column ('YYYY-MM-DD' strings) into datetimes.
# Assigning the whole column writes to `data` itself; setting an attribute
# on `data.iloc[i]` may only modify a temporary copy of the row. This form
# can also be re-run safely on an already converted column.
data['Time'] = pd.to_datetime(data['Time'], format='%Y-%m-%d')

In [None]:
# Mean age in days (relative to `date`) of the articles in each cluster.
timestamp = []
for i in range(n_clusters):
    delta = date - data.iloc[article_cluster[i]].Time
    # Whole days elapsed for every article of this cluster, then averaged.
    time = np.array([delta.iloc[j].days for j in range(len(delta))]).mean()
    timestamp.append(time)
timestamp = np.array(timestamp)

In [None]:
# Display the mean article age of each cluster.
timestamp

In [None]:
# Print the number of articles in each cluster.
for i in range(n_clusters):
    print(len(article_cluster[i]))

In [None]:
# Number of articles whose Source is 'CNN'.
data.loc[data.Source == 'CNN'].shape[0]

In [None]:
# Number of distinct sources in the whole data set.
len(set(data.Source))

In [None]:
# Number of articles in cluster 0.
len(article_cluster[0])

In [None]:
def compute_score(x, y):
    """Return log(1 + x) / log(1 + y)."""
    numerator = np.log(1 + x)
    denominator = np.log(1 + y)
    return numerator / denominator

In [None]:
# Exponent applied to the cluster age in the time score.
alpha = 1.1
def time_score(i):
    """Return 2 / (1 + timestamp[i] ** alpha) for cluster ``i``."""
    denominator = 1 + np.power(timestamp[i], alpha)
    return 2 / denominator

In [None]:
# Size of the largest cluster (0 when there are no clusters).
max_n = max((len(article_cluster[i]) for i in range(n_clusters)), default=0)
max_n

In [None]:
def number_score(i):
    """Score cluster ``i`` by its article count relative to ``max_n`` (via compute_score)."""
    cluster_size = len(article_cluster[i])
    return compute_score(cluster_size, max_n)

In [None]:
def source_score_1(i):
    """Return 1 / log(1 + total distinct sources / distinct sources in cluster ``i``)."""
    total_sources = len(set(data.Source))
    cluster_sources = len(set(data.iloc[article_cluster[i]].Source))
    return 1 / np.log(1 + total_sources / cluster_sources)

In [None]:
def source_score_2(i):
    """Score cluster ``i`` by its distinct sources relative to all distinct sources (via compute_score)."""
    cluster_sources = len(set(data.iloc[article_cluster[i]].Source))
    total_sources = len(set(data.Source))
    return compute_score(cluster_sources, total_sources)

In [None]:
# Hotness score per cluster: weighted sum of the time, article-count and
# source-diversity scores, scaled by 100.
hotscore = np.zeros(n_clusters)
alpha_t, alpha_n, alpha_s = 0.4, 0.5, 0.1
for i in range(n_clusters):
    weighted = alpha_t * time_score(i) + alpha_n * number_score(i) + alpha_s * source_score_2(i)
    hotscore[i] = weighted * 100

In [None]:
# Display the hotness score of every cluster.
hotscore

In [None]:
# Display the chosen number of clusters.
n_clusters

In [None]:
# Scratch: time score of the cluster at index 10 (needs at least 11 clusters).
time_score(10)

In [None]:
# Scratch: ratio of all distinct sources to the distinct sources in the cluster at index 10.
len(set(data.Source)) / len(set(data.iloc[article_cluster[10]].Source))

In [None]:
# Scratch arithmetic: 1 / 2**1.1.
1 / np.power(2, 1.1)

In [None]:
# Scratch arithmetic.
1 / 10

In [None]:
# Scratch arithmetic: the compute_score formula evaluated for x=1, y=10.
np.log(1 + 1) / np.log(1 + 10)