In [37]:
import numpy as np
import pandas as pd
import gc
import copy
import math
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict, Counter
from porterStemmer import PorterStemmer
from functools import reduce
from sklearn import cluster as skc
warnings.filterwarnings('ignore')
__author__ = 'willer'
#             <--------------------------------->
#             <                                 >
#             <       SWIFT PHILOSOPHY !!       >
#             <                                 >
#             <--------------------------------->

In [51]:
class UtilityFunction:
    stemmer = PorterStemmer()
    
    @classmethod
    def stemFunction(cls, sentences):
        newSentences = []
        for sentence in sentences:
            output = ''
            word = ''
            for c in sentence:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += cls.stemmer.stem(word, 0, len(word)-1)
                        word = ''
                    output += c.lower()
            if len(output) >= 5:
                newSentences.append(output)
        return newSentences
    
    @staticmethod
    def ROUGE_N(result, label, total, n=1):
        resultDictionary = Counter(result.split())
        labelDictionary  = Counter(label.split())
        totalDictionary  = Counter(total.split())
        confusionMatrix = [[0.0, 0.0], 
                           [0.0, 0.0]]
        for key, value in labelDictionary.items():
            confusionMatrix[0][0] += min(value , resultDictionary[key])
            confusionMatrix[1][1] += max(value - resultDictionary[key], 0)
            
        for key, value in resultDictionary.items():
            confusionMatrix[0][1] += max(value - labelDictionary[key],0)
            
        labelDictionary += resultDictionary
        for key, value in totalDictionary.items():
            confusionMatrix[1][0] += value - labelDictionary[key]
        
        return confusionMatrix
    
    @staticmethod
    def cosineSimilarity(vectorA, vectorB):
        return np.dot(vectorA, vectorB.transpose()) / (np.sqrt(np.sum(vectorA ** 2)) * np.sqrt(np.sum(vectorA ** 2)))
    
    @staticmethod
    def l2Distance(vectorA, vectorB):
        return np.sqrt(np.sum((vectorA - vectorB) ** 2))
    
    @staticmethod
    def cluster(sentences, clusterSize):
        X   = list(zip(*sentences))[1]
        centroid, label, _ = skc.k_means(X , clusterSize)
        res = []
        for c in centroid:
            res.append([c, []])
        for i,l in enumerate(label):
            res[l][1].append(sentences[i])
        return res
        
    @staticmethod
    def ROUGE_L():
        pass

In [82]:
class AutoSummary:
    
    def __init__(self, stopwordsFile, stemFunction):
        
        self.stemFunction = stemFunction
        self.stopwords    = set()
        self.words        = None
        self.delta        = 0.00001
        self.epsilon      = 0.0001
        self.threshold    = 0.2
        self.d            = 0.15
        self.size         = 665
        self.simThreshold = 0.7
        with open(stopwordsFile) as f:
            for line in f:
                self.stopwords |= set(line.split())
    
    def calculateSimilarMatrix(self, sentences, docs=None, getEmbedding=False) -> [[float]]:
        """
        sentences: [[str]]
        Calculate sentences self-similar matrix or doc-sentence similar matrix
        also embedding vector
        """
        if docs:
            tfSen  = pd.DataFrame(np.zeros((len(sentences),len(self.words))), columns=list(self.words))
            tfDoc  = pd.DataFrame(np.zeros((len(docs)     ,len(self.words))), columns=list(self.words))
            isfSen = pd.DataFrame(np.ones((1              ,len(self.words))), columns=list(self.words))
            isfDoc = pd.DataFrame(np.ones((1              ,len(self.words))), columns=list(self.words))

            for i,s in enumerate(sentences):
                dic = Counter(s.split())
                for k,v in dic.items():
                    if k in self.words:
                        tfSen .iloc[i][k] += v / len(s)
                        isfSen.iloc[0][k] += 1
                        
            for i,d in enumerate(docs):
                dic = Counter(d.split())
                for k,v in dic.items():
                    if k in self.words:
                        tfDoc .iloc[i][k] += v / len(d)
                        isfDoc.iloc[0][k] += 1
                
            isfSen = np.log(len(sentences) / isfSen) 
            isfDoc = np.log(len(docs)      / isfDoc)
            for i in range(len(sentences)):
                tfSen.iloc[i] = tfSen.iloc[i].mul(isfSen.iloc[0])
            for i in range(len(docs)):
                tfDoc.iloc[i] = tfDoc.iloc[i].mul(isfDoc.iloc[0])
            
            tfSen         = np.array(tfSen)
            tfDoc         = np.array(tfDoc)
            similarMatrix = np.dot(tfSen, tfDoc.transpose())
            innerSen      = np.sum(np.multiply(tfSen, tfSen), axis=1, keepdims=True)
            innerDoc      = np.sum(np.multiply(tfDoc, tfDoc), axis=1, keepdims=True)
            product       = np.sqrt(np.dot(innerSen, innerDoc.transpose()))
            similarMatrix = similarMatrix / product
            
        else:
            tf  = pd.DataFrame(np.zeros((len(sentences),len(self.words))), columns=list(self.words))
            isf = pd.DataFrame(np.ones((1,              len(self.words))), columns=list(self.words))

            for i,s in enumerate(sentences):
                dic = Counter(s.split())
                for k,v in dic.items():
                    if k in self.words:
                        tf .iloc[i][k] += v / len(s)
                        isf.iloc[0][k] += 1
                        
            isf = np.log(len(sentences) / isf) 
            for i in range(len(sentences)):
                tf.iloc[i] = tf.iloc[i].mul(isf.iloc[0])
            tf_isf_Matrix  = np.array(tf)
            if getEmbedding:
                return tf_isf_Matrix
            innerMatrix    = np.sum(np.multiply(tf_isf_Matrix, tf_isf_Matrix), axis=1, keepdims=True)
            innerMatrix    = np.sqrt(np.dot(innerMatrix, innerMatrix.transpose()))
            similarMatrix  = np.dot(tf_isf_Matrix, tf_isf_Matrix.transpose())
            similarMatrix /= innerMatrix
        
        gc.collect()
        return similarMatrix
    
    
    def sentencesSort(self, docList, method) -> [str]:
        """
        docList = [[str]]
        method in {'lexrank' , 'dochits', 'cluster', 'cosine', 'none'}
        """
        sentences = reduce(lambda x,y : x+y, docList)
        
        
        if method == 'lexrank':

            similarMatrix = self.calculateSimilarMatrix(sentences)
            degree        = np.zeros(len(sentences))
            for i in range(len(sentences)):
                for j in range(len(sentences)):
                    if similarMatrix[i][j] > self.threshold:
                        similarMatrix[i][j] = 1
                        degree[i]          += 1
                    else:
                        similarMatrix[i][j] = 0
                
            for i in range(len(sentences)):
                for j in range(len(sentences)):
                    similarMatrix[i][j]    /= degree[i]
                
            U = np.ones((len(sentences), len(sentences))) / len(sentences)
            similarMatrix = self.d * U + (1 - self.d) * similarMatrix
            last_p = p = np.ones(len(sentences)) / len(sentences)
            while True:
                p    = np.dot(similarMatrix.transpose(), p)
                loss = np.sum(np.abs(p - last_p))
                if loss < self.epsilon:
                    break
                last_p = p     
            sentences = list(zip(sentences, p))
            sentences.sort(key = lambda x: x[1], reverse=True)
            sentences = list(zip(*sentences))[0]
        
        elif method == 'dochits':
    
            docs   = [reduce(lambda x,y : x+y, doc) for doc in docList]
            L      = self.calculateSimilarMatrix(sentences, docs)
            last_A = A = np.ones((len(sentences), 1))
            last_H = H = np.ones((len(docs)     , 1))
            while True:
                A  = np.dot(L            , last_H)
                H  = np.dot(L.transpose(), last_A)
                A /= np.linalg.norm(A)
                H /= np.linalg.norm(H)
                loss_A = np.sum(last_A - A)
                loss_H = np.sum(last_H - H)
                if max(loss_A, loss_H) < self.delta:
                    break
                last_A = A
                last_H = H
            sentences = list(zip(sentences, A))
            sentences.sort(key = lambda x: x[1], reverse=True)
            sentences = list(zip(*sentences))[0]
            
        elif method == 'cluster':
            docSentence    = [reduce(lambda x,y : x+y, doc) for doc in docList]
            totalSentence  = reduce(lambda x,y : x+y, docSentence)
            countCluster   = math.floor(np.sqrt(len(sentences)))
            embedding      = self.calculateSimilarMatrix(sentences, getEmbedding=True)
            totalEmbedding = self.calculateSimilarMatrix(totalSentence, getEmbedding=True)[0]
            sentences      = list(zip(sentences, embedding))
            embeddingShape = totalEmbedding.shape[0]
            cluster        = UtilityFunction.cluster(sentences, countCluster)
            totalSize      = []
            countSize      = []
            orderSentences = []
            cluster.sort(key = lambda x: UtilityFunction.cosineSimilarity(x[0], totalEmbedding), reverse=True)
            for clu in cluster:
                clu[1].sort(key = lambda x: UtilityFunction.cosineSimilarity(x[1], clu[0]), reverse=True)
                totalSize.append(len(clu[1]))
                countSize.append(0)
            totalSum = sum(totalSize)
            count    = 0
            index    = 0
            while count < totalSum:
                if countSize[index] < totalSize[index]:
                    orderSentences.append(cluster[index][1][countSize[index]][0])
                    countSize[index] += 1
                    count += 1
                index += 1
                if index == countCluster:
                    index = 0
            sentences = orderSentences
            
        elif method == 'cosine':
            docSentence    = [reduce(lambda x,y : x+y, doc) for doc in docList]
            totalSentence  = reduce(lambda x,y : x+y, docSentence)
            embedding      = self.calculateSimilarMatrix(sentences, getEmbedding=True)
            totalEmbedding = self.calculateSimilarMatrix([totalSentence], getEmbedding=True)[0]
            sentences      = list(zip(sentences, embedding))
            sentences.sort(key = lambda x: UtilityFunction.cosineSimilarity(x[1], totalEmbedding), reverse=False)
            sentences      = list(zip(*sentences))[0]
            
        elif method == 'none':
            pass
        
        else:
            raise NameError(" === Method '{}' Not Found == ".format(method))
            
        gc.collect()
        return sentences
    
    
    def getSummary(self, fileList, method='lexrank') -> str:
        """
        Redundancy Control and Summary
        
        threshold: control similarity between sentences
        size: summary length
        """
        self.words = set()
        docList    = []
                                
        for file in fileList:
            docList.append(self.getFileText(file))
            
        for i in range(len(docList)):
            doc         = docList[i].split('.')
            docList[i]  = self.stemFunction(doc)
            for sentence in docList[i]:
                self.words |= set(sentence.split())
        self.doc    = [reduce(lambda x,y : x+y, doc) for doc in docList]
        self.docs   = reduce(lambda x,y : x+y, self.doc)
        self.words -= self.stopwords
        sentences   = self.sentencesSort(docList, method) 
        summary     = [sentences[0]]
        similarMatrix           = self.calculateSimilarMatrix(sentences)       
        remainingSummarySize    = self.size - len(sentences[0])
        lastSentencePosition    = 0
        currentSentencePosition = 1
        sentencesCount          = len(sentences) 
        while remainingSummarySize > 0 and currentSentencePosition < sentencesCount:
            if similarMatrix[lastSentencePosition][currentSentencePosition] < self.simThreshold:
                summary.append(copy.deepcopy(sentences[currentSentencePosition]
                                             [: min(len(sentences[currentSentencePosition]), remainingSummarySize)]))
                remainingSummarySize -= len(sentences[currentSentencePosition])
                lastSentencePosition = currentSentencePosition
            currentSentencePosition += 1
        gc.collect()
        return reduce(lambda x,y: x+y, summary)
        
  
    def getBaselineSummary(self, fileList) -> str:
        """
        Get first sentences in each document as baseline.
        """
        baselineSummary = []
        for file in fileList:
            baselineSummary += [self.getFileText(file).split('.')[0]]
        baselineSummary = UtilityFunction.stemFunction(baselineSummary)
        return reduce(lambda x,y: x+y, baselineSummary)[:self.size]
    
    def testPerformance(self, performanceFunction, result, filename) -> (float, float, float):
        """
        Confusion Matrix Form
        
        True Positive |  False Positive
        -------------------------------
        True Negative |  False Negative
        
        Return Recall Precision and F-1 Score
        """
        
        label = self.stemFunction([self.getLabelText(filename)])[0]
        confusionMatrix = performanceFunction(result, label, self.doc[0])
        recall    = confusionMatrix[0][0] / (confusionMatrix[0][0] + confusionMatrix[1][1])
        precision = confusionMatrix[0][0] / (confusionMatrix[0][0] + confusionMatrix[0][1])
        f1Score   = 2 * recall * precision / (recall + precision) 
        
        return recall, precision, f1Score
    
    def getLabelText(self, filename) -> str:
        """
        Get All Content
        """
        label = ""
        with open(filename) as f:
            for line in f:
                label += line[:-1]
        return label

    
    def getFileText(self, filename) -> str:
        """
        Get content bewteen <TEXT> and </TEXT>
        """
        with open(filename) as f:
            doc = ""
            addFlag = False
            for line in f:
                if line[:6] == '<TEXT>':
                    addFlag = True
                    continue
                elif line[:7] == '</TEXT>':
                    addFlag = False
                if addFlag:
                    doc += line[:-1]
        return doc
           
        
if __name__ == '__main__':
    
    PATH      = '/Users/willer/Downloads/dataset/DUC04/unpreprocess data/docs/'
    PATH_RES  = '/Users/willer/Downloads/dataset/DUC04/model/04model/'
    fileL     = ['d30001t/APW19981016.0240', 'd30001t/APW19981022.0269', 'd30001t/APW19981026.0220']
    
    # 将一整个topic的十个文件加载到fileList里面
    fileList  = [PATH + f for f in fileL]
    # 对应topic的专家摘要
    labelfile = PATH_RES + 'D30001.M.100.T.A'
    
    
    instance  = AutoSummary('stopwords.txt', UtilityFunction.stemFunction)
    baseline  = instance.getBaselineSummary(fileList)
    summary   = instance.getSummary(fileList, method='lexrank')
    r, p ,f   = instance.testPerformance(UtilityFunction.ROUGE_N, summary, labelfile)
    print("Summary:\n\r\n\r", summary)
    print("\n\r")
    print("Recall    : {:.6f}\nPrecision : {:.6f}\nF1-Score  : {:.6f}".format(r, p, f))
    

Summary:

 '' both ranariddh and sam rainsi have been outsid the countri sinc parliament wa ceremoni open on  hun sen's parti won 64 of the 122 seat in parliament in juli's nation elect, but not the two-third major necessari to form a govern on it  cite hun sen's threat to arrest opposit politician follow two alleg attempt on hi life, ranariddh and sam rainsi have said thei do not feel safe negoti insid the countri and ask the king to chair the summit at hi resid in  negoti to form the next govern have becom deadlock, and opposit parti leader princ norodom ranariddh and sam rainsi ar out of the countri follow threat of arrest from strongman hun '' the  govern and oppo


Recall    : 0.376238
Precision : 0.311475
F1-Score  : 0.340807


| Method | Recall | Precision | F1-Score|
|:-:|:-:|:-:|:-:|
|BASELINE| 0.2475  | 0.3472    | 0.2890  |
|cosine | 0.2178| 0.1897| 0.2027|
|dochits| 0.3861| 0.3277| 0.3545|
|lexrank| 0.3762| 0.3115| 0.3408|
|cluster| 0.3960| 0.3225| 0.3557|
|none   | 0.4257| 0.3644| 0.3926|