## 文本摘要
* 加载数据
* 建立 TextRank 模型
* 测试

### 1 数据加载

In [1]:
from load_data import load_stories

# Relative path handed to load_stories as the data directory.
directory = 'data/cnn_stories_tokenized/'
# NOTE(review): 10000 is presumably an upper bound on the number of stories to
# load (the recorded run loaded 5000) — confirm against load_data.load_stories.
stories = load_stories(directory, 10000)
print('Loaded Stories %d' % len(stories))

100%|██████████| 5000/5000 [00:00<00:00, 17527.30it/s]

Loaded Stories 5000





In [2]:
# Show which fields the first loaded record carries.
stories[0].keys()

dict_keys(['story', 'highlights'])

In [3]:
# Inspect the record at index 2584; in the recorded output its 'story' is only '\n\n'.
# NOTE(review): the index is hard-coded, so it is only valid for the exact list
# that load_stories returned in this run.
a = stories[2584]
a

{'story': '\n\n',
 'highlights': ['Iggy says Miami is fundamentally spiritual and has a quicksilver quality',
  "It 's a very musical town , with some damn good rock players , he says",
  "`` Miami 's a sunny place for shady people , '' says the Godfather of Punk"]}

In [4]:
# Inspect the record at index 3274; in the recorded output its 'story' is only '\n\n'.
# NOTE(review): the index is hard-coded, so it is only valid for the exact list
# that load_stories returned in this run.
b = stories[3274]
b

{'story': '\n\n',
 'highlights': ['CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery',
  'Please submit your best shots of Barcelona , Spain for next week',
  'Visit CNN.com / Travel next Wednesday for a new gallery of snapshots']}

In [5]:
# Drop the two records held in `a` and `b`.
# The list is rebuilt instead of calling stories.remove(...) so that this cell
# can be re-run safely: a second stories.remove(a) would raise ValueError
# because the record is already gone.
stories = [s for s in stories if s is not a and s is not b]

In [6]:
# Number of stories remaining in the list.
len(stories)

4998

### 2 建立模型

In [7]:
class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the instance namespace at the mapping itself, so that
        # ``obj.key`` and ``obj["key"]`` share one storage.
        self.__dict__ = self

In [8]:
from nltk.tokenize import sent_tokenize ## 分句
from nltk.tokenize import word_tokenize ## 分词

import networkx as nx
import numpy as np

import numpy as np
import math

def get_similarity(word_list1, word_list2):
    """Default function for scoring the similarity of two sentences.

    The score is the number of distinct words the two sentences have in
    common, divided by ``log(len(word_list1)) + log(len(word_list2))``.

    Keyword arguments:
    word_list1, word_list2  --  the two sentences, each given as a list of words

    Returns 0. when the sentences share no word, or when the denominator is
    numerically zero (e.g. two one-word sentences).
    """
    # A set intersection gives the same count as scanning both lists with
    # list.count for every vocabulary word, but in linear time.
    co_occur_num = len(set(word_list1) & set(word_list2))
    if co_occur_num == 0:
        return 0.

    # Both lists are non-empty here (they share a word), so log() is defined.
    denominator = math.log(float(len(word_list1))) + math.log(float(len(word_list2)))
    if abs(denominator) < 1e-12:
        return 0.

    return co_occur_num / denominator
    
class TextRank4Sentence(object):
    """Extractive summarizer that ranks the sentences of a text with PageRank.

    Usage: call ``analyze(text)`` and then ``get_key_sentences(num)``.
    """

    def __init__(self, stop_words = None):
        """
        Keyword arguments:
        stop_words  --  collection of stop words (a set gives fast lookups);
                        None or empty means no stop-word filtering
        """
        self.seg = sent_tokenize             # sentence splitter
        self.stop_words = stop_words

        self.sentences = None
        self.words_no_filter = None          # 2-D list: one word list per sentence
        self.words_no_stop_words = None      # same layout, with stop words removed
        self.key_sentences = None            # ranking produced by the last analyze()

    def sort_sentences(self, sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
        """Sort sentences from most to least important.

        Keyword arguments:
        sentences         --  list of sentences
        words             --  2-D list; each sub-list holds the words of the
                              sentence at the same position in `sentences`
        sim_func          --  similarity of two sentences; takes two word lists
        pagerank_config   --  keyword arguments forwarded to nx.pagerank
                              (only read here, never modified)

        Returns a list of AttrDict(index, sentence, weight), highest weight
        first; an empty list when `words` is empty.
        """
        sentences_num = len(words)
        if sentences_num == 0:
            return []

        graph = np.zeros((sentences_num, sentences_num))
        # NOTE(review): y starts at x, so every sentence also receives a
        # self-loop weighted by its similarity to itself. Left unchanged
        # because removing it would alter the resulting ranking.
        for x in range(sentences_num):
            for y in range(x, sentences_num):
                similarity = sim_func(words[x], words[y])
                graph[x, y] = similarity
                graph[y, x] = similarity

        # nx.from_numpy_matrix was removed in networkx 3.0;
        # from_numpy_array builds the same weighted graph from an ndarray.
        nx_graph = nx.from_numpy_array(graph)
        scores = nx.pagerank(nx_graph, **pagerank_config)   # dict: node index -> score
        sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

        return [AttrDict(index=index, sentence=sentences[index], weight=score)
                for index, score in sorted_scores]

    def analyze(self, text, lower = False,
                source = 'no_filter',
                sim_func = get_similarity,
                pagerank_config = {'alpha': 0.85,}):
        """Split `text` into sentences and rank them; the result is stored in
        self.key_sentences.

        Keyword arguments:
        text                 --  the text to analyse, a string
        lower                --  whether to lower-case the text first; default False
        source               --  which word lists feed the sentence similarity:
                                 'no_filter' (default) uses words_no_filter,
                                 'no_stop_words' uses words_no_stop_words.
                                 Any other value also uses words_no_stop_words.
        sim_func             --  function computing the similarity of two sentences
        pagerank_config      --  keyword arguments forwarded to nx.pagerank
        """
        self.key_sentences = []

        if lower:
            text = text.lower()

        self.sentences = self.seg(text)
        self.words_no_filter = [word_tokenize(s) for s in self.sentences]

        if self.stop_words:
            # Membership is tested on the token exactly as produced by
            # word_tokenize, i.e. the comparison is case-sensitive.
            self.words_no_stop_words = [
                [w for w in words if w not in self.stop_words]
                for words in self.words_no_filter
            ]
        else:
            # No stop words configured: nothing to remove. Keeping one word
            # list per sentence (instead of an empty list) means that
            # source='no_stop_words' still yields a ranking.
            self.words_no_stop_words = [list(words) for words in self.words_no_filter]

        if source == 'no_filter':
            _source = self.words_no_filter
        else:
            _source = self.words_no_stop_words

        self.key_sentences = self.sort_sentences(sentences = self.sentences,
                                                 words     = _source,
                                                 sim_func  = sim_func,
                                                 pagerank_config = pagerank_config)

    def get_key_sentences(self, num = 1, sentence_min_len = 0):
        """Get the `num` most important sentences whose length (in characters)
        is at least `sentence_min_len`, for building a summary.

        Return:
        A list of ranking items (AttrDict with index, sentence, weight), most
        important first. Empty if analyze() has not been called yet.
        """
        result = []
        for item in self.key_sentences or []:
            if len(result) >= num:
                break
            if len(item['sentence']) >= sentence_min_len:
                result.append(item)
        return result
    

In [9]:
# Rank the sentences of the first story with the defaults
# (no stop-word set, source='no_filter').
tr4s = TextRank4Sentence()
tr4s.analyze(stories[0]["story"])

In [10]:
# Text of the top-ranked sentence from the most recent analyze() call.
tr4s.get_key_sentences()[0]["sentence"]

"`` Clash was an adult male living a prominent public life centered around the entertainment of toddlers , while at the same time he was , in secret , preying on teenage boys to satisfy his depraved sexual interests , '' the Stephens suit alleged ."

In [19]:
from sumeval.metrics.rouge import RougeCalculator
from tqdm import tqdm  # imported here so get_rouge does not depend on a later cell

# Shared ROUGE scorer, created with sumeval's stopwords option enabled.
rouge = RougeCalculator(stopwords=True)

def get_rouge(predict,answer):
    """Print the mean ROUGE-1, ROUGE-2 and ROUGE-L of `predict` against `answer`.

    Keyword arguments:
    predict  --  list of predicted summaries
    answer   --  list aligned with `predict`; answer[i] is passed to sumeval
                 as the references for predict[i]

    Raises ValueError when `predict` is empty, since there is nothing to
    average over.
    """
    total = len(predict)
    if total == 0:
        raise ValueError("get_rouge needs at least one prediction to average over")

    rouge_1 = rouge_2 = rouge_l = 0.0
    for i in tqdm(range(total)):
        rouge_1 += rouge.rouge_n(
            summary=predict[i],
            references=answer[i],
            n=1)
        rouge_2 += rouge.rouge_n(
            summary=predict[i],
            references=answer[i],
            n=2)
        rouge_l += rouge.rouge_l(
            summary=predict[i],
            references=answer[i])

    # One metric per line, same layout as before.
    print("ROUGE-1: {}\nROUGE-2: {}\nROUGE-L: {}".format(
        rouge_1 / total, rouge_2 / total, rouge_l / total))


In [12]:
from nltk.corpus import stopwords 
# NLTK's English stop-word list, stored as a set for fast membership tests.
stop_words = set( stopwords.words('english'))

# Re-bind tr4s to a ranker that filters stop words, then rank one story
# using the stop-word-free word lists.
tr4s = TextRank4Sentence(stop_words = stop_words)
tr4s.analyze(stories[3273]["story"],source = 'no_stop_words')

In [13]:
# Show the reference highlights of stories[3273] next to the top-ranked
# sentence from the most recent tr4s.analyze() call.
(stories[3273]["highlights"],tr4s.get_key_sentences()[0]["sentence"])


(['I like to be classic with a modern twist , says fashion designer',
  'The key to building a successful brand is separating creative and business sides',
  'Style is something different in everyone , Herrera says'],
 'On building a business ...\n\n-LRB- Fashion is -RRB- a very difficult business , as you know , because fashion is a business .')

### 3 测试

In [15]:
from tqdm import tqdm
# Two rankers: one that keeps stop words, one that filters them out.
tr4s_have = TextRank4Sentence()
tr4s_no = TextRank4Sentence(stop_words=stop_words)

# Parallel lists: top sentence per ranker, plus the reference highlights.
have_stop, no_stop_words, answer = [], [], []

for s in tqdm(stories):
    tr4s_have.analyze(s["story"])
    tr4s_no.analyze(s["story"], source='no_stop_words')

    # Skip the story unless both rankers produced at least one sentence.
    if not (tr4s_have.get_key_sentences() and tr4s_no.get_key_sentences()):
        continue

    answer.append(s["highlights"])
    have_stop.append(tr4s_have.get_key_sentences()[0]["sentence"])
    no_stop_words.append(tr4s_no.get_key_sentences()[0]["sentence"])



100%|██████████| 4998/4998 [14:54<00:00,  5.59it/s]


In [20]:
# ROUGE for the sentences picked by tr4s_have (no stop-word filtering).
get_rouge(have_stop,answer)


100%|██████████| 4996/4996 [00:12<00:00, 407.15it/s]

ROUGE-1: 0.10220309646469726
ROUGE-2: 0.025678254771702522
ROUGE-L: 0.09089767869257702





In [21]:
# ROUGE for the sentences picked by tr4s_no (stop words filtered out).
get_rouge(no_stop_words,answer)

100%|██████████| 4996/4996 [00:11<00:00, 435.58it/s]

ROUGE-1: 0.11711719721983674
ROUGE-2: 0.0301544862409943
ROUGE-L: 0.10366080105375121



