In [1]:
import numpy as np
from sklearn.preprocessing import normalize
import re
import sklearn
from sklearn import decomposition
import seaborn
from math import log

In [2]:
def pagerank(A, eps=0.0001, d=0.85, max_iter = 5000):
    """
    PageRank-style power iteration.

    Given a square transition / similarity matrix, returns one score per node
    (one per sentence when used for summarisation).

    :param A:         matrix (n, n)
                      The adjacency matrix of the graph on which to compute the
                      PageRank score. Only ``len(A)`` and ``A.dot(vector)`` are used.

    :param eps:       float, optional
                      Tolerance. The algorithm stops as soon as the mean absolute
                      change of the score vector between two iterations is at or
                      below this threshold.

    :param d:         float, optional
                      1 - probability of teleporting to a random node

    :param max_iter:  int, optional
                      Maximum number of iterations. When it is zero or negative no
                      iteration is performed and the uniform start vector is returned.

    :return:    An array (n,).
                The score of each node after the last iteration, rescaled to
                unit Euclidean norm. An empty array when the graph has no node.
    """
    n = len(A)
    if n == 0:
        # Nothing to rank; avoids dividing by zero below.
        return np.zeros(0)

    # P is the current score vector; it starts as the uniform distribution 1/n.
    P = np.ones(n) / n
    # Teleportation term: identical at every iteration, so computed once.
    teleport = np.ones(n) * (1 - d) / n
    # Stays infinite when the loop body never runs (max_iter <= 0).
    delta = float("inf")

    for _ in range(max_iter):
        # Markov chain transition
        new_P = teleport + d * A.dot(P)
        # NOTE(review): the vector is rescaled to unit Euclidean (L2) norm, so the
        # scores do not sum to 1 as in textbook PageRank - confirm this is intended.
        new_P = new_P / np.linalg.norm(new_P)
        # Compute mean absolute error (MAE) against the previous iterate
        delta = abs(new_P - P).sum() / len(new_P)
        P = new_P
        if delta <= eps:
            return P

    # The iteration budget ran out before the tolerance was reached.
    print("Convergence error : " + str(delta))
    return P

In [3]:
def tr_summarizer(matrix, corpus, weights=None, nb_words = 100, diag = "none", bias = 0):
    """
    Prepare the input matrix, then apply the PageRank algorithm to it.

    :param matrix:    matrix (n, m)
                      A matrix weighting the relation between sentences.
                      NOTE(review): the result is handed to ``pagerank``, which
                      computes ``A.dot(P)`` with ``len(P) == len(A)``, so the matrix
                      presumably has to be square (n == m) - confirm with callers.

    :param corpus:    list of string.
                      A single document. Not used by this function.

    :param weights:   sequence of float, optional
                      One weight per column; column ``j`` of the normalised matrix
                      is multiplied by ``weights[j]``. Ignored when ``None`` or when
                      its length differs from the number of rows of ``matrix``.

    :param nb_words:  The number of words for the summary. Not used by this function.

    :param diag:      str, optional
                      ``"before"`` zeroes the diagonal before the column
                      normalisation, ``"after"`` zeroes the diagonal of the
                      normalised (and weighted) matrix; any other value leaves the
                      diagonal untouched.

    :param bias:      float, optional
                      ``bias / n`` (n = number of rows) is added to every entry of
                      the matrix before running PageRank.

    :return:          The score vector returned by ``pagerank``.
    """
    # np.array copies its input, so the caller's matrix is never modified.
    matrix = np.array(matrix)

    if diag == "before":
        np.fill_diagonal(matrix, 0)

    # Scale every column so that its absolute values sum to 1.
    sim_matrix = normalize(matrix, norm='l1', axis=0)

    if weights is not None and len(weights) == matrix.shape[0]:
        # Right-multiplying by diag(weights) rescales column j by weights[j].
        sim_matrix = np.matmul(sim_matrix, np.diag(weights))

    if diag == "after":
        # Must act on sim_matrix: `matrix` is only read for its shape from here
        # on, so zeroing its diagonal at this point would have no effect.
        np.fill_diagonal(sim_matrix, 0)

    sim_matrix = sim_matrix + bias / matrix.shape[0]
    return pagerank(sim_matrix)

In [5]:
def lsa_summarizer(matrix, corpus, weights=None, nbcompfun = None, nb_words = 100, diag = "none", bias = 0):
    """
    Score each row of ``matrix`` with a truncated SVD (LSA).

    :param matrix:    matrix (n, m)
                      Matrix weighting the importance of a term for a sentence.
                      NOTE(review): ``weights`` is checked against the number of
                      rows but multiplied in along the columns, so a square matrix
                      is presumably expected when weights are used - confirm.

    :param corpus:    list of string.
                      A single document. Only its length is used.

    :param weights:   sequence of float, optional
                      One weight per column; column ``j`` of the normalised matrix
                      is multiplied by ``weights[j]``. Ignored when ``None`` or when
                      its length differs from the number of rows of ``matrix``.

    :param nbcompfun: callable, optional
                      Maps ``len(corpus)`` to the number of SVD components; the
                      result is truncated to an int and raised to at least 1.
                      Defaults to the natural logarithm.

    :param nb_words:  The number of words for the summary. Not used by this function.

    :param diag:      str, optional
                      ``"before"`` zeroes the diagonal before the column
                      normalisation, ``"after"`` zeroes the diagonal of the
                      normalised (and weighted) matrix; any other value leaves the
                      diagonal untouched.

    :param bias:      float, optional
                      ``bias / n`` (n = number of rows) is added to every entry of
                      the matrix before the decomposition.

    :return:          array (n,)
                      One score per row. When the number of components is not
                      smaller than ``len(corpus)`` the plain row sums of ``matrix``
                      are returned and ``diag``, ``weights`` and ``bias`` are ignored.

    :raises ValueError: with the default ``nbcompfun`` when ``corpus`` is empty
                        (logarithm of zero).
    """
    # np.array copies its input, so the caller's matrix is never modified.
    matrix = np.array(matrix)

    if nbcompfun is None:
        nbcompfun = log

    # Number of latent components to keep, never less than one.
    k = max(1, int(nbcompfun(len(corpus))))

    if k >= len(corpus):
        # Too few sentences for the requested rank: fall back to raw row sums.
        return np.sum(matrix, axis=1)

    if diag == "before":
        np.fill_diagonal(matrix, 0)

    # Scale every column so that its absolute values sum to 1.
    sim_matrix = normalize(matrix, norm='l1', axis=0)

    if weights is not None and len(weights) == matrix.shape[0]:
        # Right-multiplying by diag(weights) rescales column j by weights[j].
        sim_matrix = np.matmul(sim_matrix, np.diag(weights))

    if diag == "after":
        # Must act on sim_matrix: `matrix` is only read for its shape from here
        # on, so zeroing its diagonal at this point would have no effect.
        np.fill_diagonal(sim_matrix, 0)

    sim_matrix = sim_matrix + bias / matrix.shape[0]

    # Dimension reduction of sim_matrix using truncated SVD; the fixed seed
    # keeps the decomposition reproducible.
    tsvd = sklearn.decomposition.TruncatedSVD(n_components=k, random_state=1337, algorithm="arpack")

    # results has one row per row of sim_matrix and one column per component.
    results = tsvd.fit_transform(sim_matrix)
    # Scale each component by the square root of its singular value and drop the sign.
    scores = np.abs(np.matmul(results, np.diag(np.sqrt(tsvd.singular_values_))))
    # One score per row: the sum of its component magnitudes.
    # REMARK : Text Summarization Techniques: A Brief survey (paper) suggest sqrt(sum(scores_ij^2))
    maxscores = np.sum(scores, axis=1)

    return maxscores

In [None]:
def generic_summarizer(method, matrix, corpus, weighted, lsanbcompfun = lambda x : log(x), diag = "none", bias = 0) :
    """
    Dispatch to the summarizer selected by ``method``.

    :param method:        ``"tr"`` selects ``tr_summarizer``, ``"lsa"`` selects
                          ``lsa_summarizer``; any other value yields ``None``.
    :param matrix:        Forwarded unchanged to the selected summarizer.
    :param corpus:        Forwarded unchanged; also the input of ``get_weights``
                          when ``weighted`` is true.
    :param weighted:      When true, sentence weights from ``get_weights(corpus)``
                          are passed to the summarizer; otherwise no weights are.
    :param lsanbcompfun:  Component-count function for the LSA summarizer. A falsy
                          value is replaced by a function that always returns 1.
    :param diag:          Forwarded unchanged to the selected summarizer.
    :param bias:          Forwarded unchanged to the selected summarizer.

    :return:              The selected summarizer's result, or ``None`` for an
                          unrecognised ``method``.
    """
    if not lsanbcompfun:
        lsanbcompfun = lambda x : 1

    def sentence_weights():
        # Evaluated lazily so get_weights only runs for a recognised method.
        return get_weights(corpus) if weighted else None

    if method == "tr":
        return tr_summarizer(matrix, corpus, sentence_weights(), diag = diag, bias = bias)
    if method == "lsa":
        return lsa_summarizer(matrix, corpus, sentence_weights(),
                              nbcompfun = lsanbcompfun, diag = diag, bias = bias)
    return None

In [4]:
def get_weights(corpus):
    """
    Compute one weight per sentence from its length and its position.

    The raw weight of the sentence at 0-based position ``i`` is
    ``word_count / sqrt(n + 1 - i)``, where ``n = len(corpus)`` and
    ``word_count`` is the number of whitespace-separated tokens of the
    sentence. The raw weights are then rescaled so that they sum to ``n``.

    :param corpus:  list of string.
                    The sentences of a single document, in order.

    :return:        array (n,)
                    The rescaled weights. When the raw weights sum to zero
                    (every sentence is empty) uniform weights of 1 are returned
                    instead of NaN values.
    """
    nbsent = len(corpus)
    raw = np.array(
        [len(sent.split()) / ((nbsent + 1 - rank) ** 0.5)
         for rank, sent in enumerate(corpus)],
        dtype=float,
    )
    total = np.sum(raw)
    if total == 0:
        # Rescaling would divide by zero; uniform weights keep the same
        # sum-to-n convention without producing NaN.
        return np.ones(nbsent)
    return nbsent * raw / total