# Process Summaries

Defines the summary assembler and the multiprocessing call to the summarizer.

## Summary assembler

The summarizer returns one score per segment; the assembler turns these scores into the list of segment indices forming the summary.

In [None]:
def summary_assembler(results, corpus, nb_words, over_window = 0) :
    """
    Build the summary of a document from the scores given by the summarizer.

    Sentences are picked by decreasing score until the summary holds at
    least <nb_words> words. The sentence that crosses the limit is kept, so
    the summary can be longer than <nb_words>, and at least one sentence is
    picked whenever <results> is not empty.

    How words are counted depends on <over_window>:

    * over_window == 0: each picked sentence counts for its number of
      distinct words that are not French stop words.
    * over_window != 0: each picked sentence covers the word positions
      starting at ``sent_id * over_window`` for as many positions as it has
      words; a position covered by several sentences counts once.
      Stop words are not removed in this mode.
      NOTE(review): this looks designed for sentences that are overlapping
      windows sliding over the document by <over_window> words - confirm.

    :param results:     List of score per sentence in the document.
                        Higher score denote a higher importance of the sentence.
    :param corpus:      Sentences of the document, indexed by their position.
                        Each sentence is a string of whitespace-separated words.
    :param nb_words:    Minimal number of words in the final summary.
    :param over_window: 0 to count distinct non stop words, otherwise the
                        offset (in words) between the start of two
                        consecutive sentences.

    :return:   The list of picked sentences id, sorted by position in the document.
    """

    # Position of the sentences in the document, most important first.
    # The sort is stable: sentences with equal scores keep document order.
    ranked = sorted(enumerate(results), key=lambda pair: pair[1], reverse=True)
    list_sent_id = [position for position, _score in ranked]

    # List of id forming the summary.
    summary_sent_id = []

    if over_window:
        # Word positions of the document covered by the summary so far.
        covered_positions = set()
        for sent_id in list_sent_id:
            summary_sent_id.append(sent_id)
            first_position = sent_id * over_window
            sentence_length = len(corpus[sent_id].split())
            covered_positions.update(range(first_position, first_position + sentence_length))
            if len(covered_positions) >= nb_words:
                break
    else:
        # Only this branch needs the stop words, so they are loaded here.
        # `stopwords` is expected to be provided by the enclosing module
        # (presumably nltk.corpus.stopwords - confirm).
        stops = set(stopwords.words('french'))

        # Running total of distinct non stop words, summed per sentence.
        # Same count as joining every cleaned sentence and splitting again,
        # without rebuilding the whole summary at each step.
        summary_length = 0
        for sent_id in list_sent_id:
            summary_sent_id.append(sent_id)
            summary_length += len(set(corpus[sent_id].split()) - stops)
            if summary_length >= nb_words:
                break

    return sorted(summary_sent_id)

In [None]:
def join_and_cut_at(sentence_list, limit, stops):
    """
    Generate a text of bounded length while cleaning stop words.

    For each sentence, stop words are dropped and every remaining word is
    kept once, at its first position in the sentence. The cleaned sentences
    are concatenated in the given order and the result is cut after
    <limit> words.

    Words keep their reading order, so the text and the words surviving
    the cut are the same from one run to the next (iterating over a set of
    words would give an arbitrary order).

    :param sentence_list:    List of sentence to process, each a string of
                             whitespace-separated words.
    :param limit:            Maximal number of words in the generated text.
                             Used as a slice bound: None keeps every word.
    :param stops:            Set of stop words.

    :return:    The kept words joined by a single space.
    """
    kept_words = []
    for sentence in sentence_list:
        # Repeated words are removed per sentence only: a word may appear
        # again if a later sentence contains it.
        seen_in_sentence = set()
        for word in sentence.split():
            if word in stops or word in seen_in_sentence:
                continue
            seen_in_sentence.add(word)
            kept_words.append(word)
    return " ".join(kept_words[:limit])

### Multiprocessing the summarization

In [None]:
from multiprocessing import Pool

def wrapper(doc_key, summarizer, all_args) :
    """
    Score one document with the summarizer and assemble its summary.

    Module-level entry point handed to the worker pool, so that every
    argument of one document travels in a single task.

    :param doc_key:       Key of the document : document set id + document id.
                          Returned untouched, to match the summary with its document.
    :param summarizer:    Object exposing a summarize(sentences, bias) method
                          returning the scores given to summary_assembler.
    :param all_args:      Sequence of arguments for the document :
                              - first item : preprocessed sentences of the document,
                              - last item  : bias passed to the summarizer,
                              - every item but the last one is forwarded, in order,
                                to summary_assembler after the scores.

    :returns:    A tuple : (doc_key, summary_sent_id)
                     doc_key: docset id + doc id
                     summary_sent_id: list of sentences index of the document
                     forming the summary for the given document.
    """
    sentences = all_args[0]
    bias = all_args[-1]

    # The bias is only meant for the summarizer, the assembler gets the rest.
    assembler_args = all_args[:-1]

    scores = summarizer.summarize(sentences, bias)
    summary_sent_id = summary_assembler(scores, *assembler_args)

    return (doc_key, summary_sent_id)
 

def doc_summarizer(docs, summarizer, nb_words, over_window, docs_bias = None, nb_processes = 20):
    """
    Dispatch over a pool the summarization.
    Each task handles one document : its content is scored by the
    summarizer, then the summary is assembled from the scores (see wrapper).


    :param docs:           Tokenized corpus. Dictionary mapping a document key
                           to the content of the document, passed as is to the
                           summarizer and to the summary assembler.
    :param summarizer:     Summarizer process. See Summary_Processes directory.
    :param nb_words:       Length of the summary, ie, the nb of word forming it.
    :param over_window:    Forwarded to the summary assembler, where it selects
                           how the words of the summary are counted.
    :param docs_bias:      Dictionnay mapping to a document key its vocabulary bias
                           (ie, biased weight for words in the document).
                           None to run every document without bias.
    :param nb_processes:   Number of worker processes of the pool.
                           None lets multiprocessing choose from the available CPUs.

    :return:    A dictionnay of predicted summaries.
                Maps a document key (docset + doc id) to a list of
                sentences index of the document forming the summary.
    """
    # Nothing to summarize : do not pay for starting the workers.
    if not docs:
        return {}

    # One task per document, built before the pool starts so that a missing
    # bias fails early. Shape of a task : (doc_key, summarizer, all_args).
    tasks = [
        (
            doc_key,
            summarizer,
            (
                sents,  # sents[1] for other datasets
                nb_words,
                over_window,
                docs_bias[doc_key] if docs_bias is not None else None,
            ),
        )
        for doc_key, sents in docs.items()
    ]

    with Pool(nb_processes) as p:
        list_summary = p.starmap(wrapper, tasks)

    # list_summary holds (doc_key, summary_sent_id) pairs.
    return dict(list_summary)