In [None]:
import xml.sax as sax
from xml.sax import SAXParseException
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import unicodedata

In [None]:
class TextHandler(sax.ContentHandler):
    """
    Custom handler for parsing the documents of the DUC corpus.

    Collects, in ``_result``, the character data found while a ``TEXT``
    or ``LEADPARA`` element is open. Newlines are replaced by spaces.
    """

    # Elements whose character data is collected.
    _TEXT_TAGS = frozenset({"TEXT", "LEADPARA"})

    def __init__(self):
        # Initialise the base ContentHandler state as well.
        super().__init__()
        # True while the parser is inside one of the collected elements.
        self._activeParse = False
        # Concatenated text of the document; read by the caller after parsing.
        self._result = ""

    def startDocument(self):
        pass

    def startElement(self, name, attrs):
        if name in self._TEXT_TAGS:
            self._activeParse = True

    def endElement(self, name):
        if name in self._TEXT_TAGS:
            self._activeParse = False

    def characters(self, content):
        # The parser may deliver the text of one element in several chunks,
        # hence the accumulation.
        if self._activeParse:
            self._result += content.replace('\n', " ")

In [None]:
class SummaryHandler(sax.ContentHandler):
    """
    Custom handler for parsing the summaries of the DUC corpus.

    For every ``SUM`` element, stores a ``(DOCREF, text)`` tuple in
    ``_result``, where ``DOCREF`` is the element attribute of that name
    (surrounding spaces stripped) and ``text`` is the character data of the
    element with newlines replaced by spaces.
    """

    def __init__(self):
        # Initialise the base ContentHandler state as well.
        super().__init__()
        # True while the parser is inside a SUM element.
        self._activeParse = False
        # Text of the SUM element currently being read.
        self._charBuffer = ""
        # DOCREF attribute of the SUM element currently being read.
        self._active_doc = ""
        # List of (doc reference, summary text); read by the caller.
        self._result = []

    def startDocument(self):
        pass

    def startElement(self, name, attrs):
        if name == "SUM":
            self._activeParse = True
            # A SUM element without a DOCREF attribute raises KeyError here.
            self._active_doc = attrs["DOCREF"].strip(" ")
            self._charBuffer = ""

    def endElement(self, name):
        if name == "SUM":
            self._activeParse = False
            self._result.append((self._active_doc, self._charBuffer))

    def characters(self, content):
        # The parser may deliver the text of one element in several chunks,
        # hence the accumulation.
        if self._activeParse:
            self._charBuffer += content.replace('\n', " ")

In [None]:
def parse_file(file, handler):
    """
    Read and parse the given XML file with the SAX parser.
    Shields the caller from the parse errors raised by the parser.

    :param file:       Path to the file to parse (read as UTF-8).
    :param handler:    Handler for the xml sax parser.
                       Must implement one of the xml.sax.handler
                       base classes and expose its output in ``_result``.

    :return:    The ``_result`` value of the handler, or None when the
                file is not well-formed.

    :raises FileNotFoundError:    If ``file`` does not exist (not caught here).
    """
    try:
        # The context manager guarantees the file is closed on every path,
        # including when the parser raises.
        with open(file, encoding='utf-8') as stream:
            sax.parse(stream, handler)
    except SAXParseException:
        print(file, "contains some errors")
        return None
    return handler._result

In [None]:
def generic_cleaner(sentence):
    """
    Normalise a sentence to lower-case ASCII letters and digits.

    Diacritics are removed (NFKD decomposition, then non-ASCII code points
    are dropped), every character that is not a letter or a digit becomes a
    space, runs of whitespace collapse to one space and the result is
    lower-cased and stripped.
    """
    ascii_text = (
        unicodedata.normalize('NFKD', sentence)
        .encode('ASCII', 'ignore')
        .decode("utf-8")
    )
    alnum_only = re.sub(r"[^a-zA-Z0-9]", " ", ascii_text)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.lower().strip()

In [None]:
def tokenizer_cleaner(corpus):
    """
    Split every document into sentences with nltk and clean them.

    :param corpus:    Dictionary mapping a document key to its text.

    :return:    Dictionary mapping each key to ``(sentences, cleaned)``:
                the raw sentences from the tokenizer and the cleaned
                sentences that contain more than one word.
    """
    docs = {}
    for key, text in corpus.items():
        sentences = sent_tokenize(text, language='french')
        candidates = (generic_cleaner(raw) for raw in sentences)
        # NOTE(review): cleaned sentences of one word or less are dropped, so
        # `cleaned` can be shorter than `sentences` and their indices may not
        # line up - confirm callers do not rely on a one-to-one mapping.
        cleaned = [sen for sen in candidates if len(sen.split()) > 1]
        docs[key] = (sentences, cleaned)
    return docs

In [None]:
def brutal_tokenizer(corpus, n):
    """
    Build pseudo-sentences by cutting each cleaned document every n words.

    Only complete chunks of n words are kept; trailing words that do not
    fill a chunk are left out.

    :param corpus:    Dictionary mapping a document key to its text.
    :param n:         Number of words per chunk.

    :return:    Dictionary mapping each key to ``(sentences, cleaned)``,
                both entries being the same list of chunks.
    """
    docs = {}
    for key, text in corpus.items():
        toks = generic_cleaner(text).split(" ")
        nb_full_chunks = len(toks) // n
        sentences = []
        for start in range(0, nb_full_chunks * n, n):
            sentences.append(" ".join(toks[start:start + n]))
        # The chunks are already cleaned: the same list plays both roles.
        docs[key] = (sentences, sentences)
    return docs

In [None]:
def overlap_tokenizer(corpus, block_size, over_window):
    """
    Build pseudo-sentences as overlapping blocks of words.

    Block number x starts at word ``x * over_window`` of the cleaned
    document. Documents with fewer than ``block_size`` words yield no block.

    :param corpus:         Dictionary mapping a document key to its text.
    :param block_size:     Size parameter of a block, in words.
    :param over_window:    Step, in words, between the starts of two
                           consecutive blocks.

    :return:    Dictionary mapping each key to
                ``(sentences, cleaned, (block_size, over_window))``,
                ``sentences`` and ``cleaned`` being the same list of blocks.
    """
    docs = {}
    for key, text in corpus.items():
        toks = generic_cleaner(text).split(" ")
        sentences = []
        if len(toks) >= block_size:
            nb_blocks = (len(toks) - block_size) // over_window + 1
            for block_id in range(nb_blocks):
                start = block_id * over_window
                # NOTE(review): the slice spans up to block_size + 1 words,
                # not block_size - confirm the extra word is intended.
                sentences.append(" ".join(toks[start:start + block_size + 1]))
        docs[key] = (sentences, sentences, (block_size, over_window))
    return docs

In [None]:
def summary_assembler(results, corpus, nb_words, over_window=0):
    """
    Build the summary of a document with the result of the summarizer.

    Sentences are picked by decreasing score until the summary holds at
    least <nb_words> words. At least one sentence is picked whenever
    ``results`` is not empty.

    Word counting depends on ``over_window``:

    * falsy (default): for each picked sentence, the number of distinct
      words that are not French stop words;
    * truthy: sentences are taken to be overlapping blocks where block
      ``i`` starts at word ``i * over_window`` of the document; the count
      is the number of distinct word positions covered, so words shared by
      overlapping blocks are counted once.

    :param results:     List of score per sentence in the document.
                        Higher score denote a higher importance of the sentence.
    :param corpus:      Sequence of the sentences of the document, indexed
                        by sentence id (same order as ``results``).
    :param nb_words:    Minimum number of words in the final summary.
    :param over_window: Step, in words, between two consecutive blocks;
                        0 when sentences do not overlap.

    :return:   A sorted list of sentences id picked as the top sentences for the summary.
    """

    # Sentence ids (positions in the document) ordered by decreasing score.
    ranked_ids = [sent_id for sent_id, _ in
                  sorted(enumerate(results), key=lambda x: x[1], reverse=True)]

    # List of id forming the summary.
    summary_sent_id = []

    if over_window:
        # Word positions of the document already covered by the summary.
        covered = set()
        for sent_id in ranked_ids:
            summary_sent_id.append(sent_id)
            start = sent_id * over_window
            covered.update(range(start, start + len(corpus[sent_id].split())))
            if len(covered) >= nb_words:
                break
    else:
        # Stop words are only needed by this branch; loading them here
        # spares the overlap branch a useless corpus access.
        stops = set(stopwords.words('french'))
        # Running total, instead of re-counting the whole summary each time.
        nb_kept_words = 0
        for sent_id in ranked_ids:
            summary_sent_id.append(sent_id)
            nb_kept_words += len(set(corpus[sent_id].split()) - stops)
            if nb_kept_words >= nb_words:
                break
    return sorted(summary_sent_id)

In [None]:
def join_and_cut_at(sentence_list, limit, stops):
    """
    Build a text of at most <limit> words from a list of sentences.

    Each sentence contributes its distinct words that are not in ``stops``;
    the words of all sentences are chained and cut after ``limit`` words.

    :param sentence_list:    Sentences to join.
    :param limit:            Maximum number of words to keep.
    :param stops:            Set of stop words to remove.

    :return:    The kept words joined by single spaces.
    """
    kept_words = []
    for sentence in sentence_list:
        kept_words.extend(set(sentence.split()) - stops)
    return " ".join(kept_words[:limit])

In [None]:
from multiprocessing import Pool

def wrapper(doc_key, summarizer, all_args):
    """
    Summarize one document; entry point of the pool workers.

    Scores the sentences with the summarizer, then hands the scores and
    the given arguments over to summary_assembler.

    :param doc_key:       Key of the document : document set id + document id.
    :param summarizer:    Object exposing a ``summarize`` method, called
                          with the sentences of the document.
    :param all_args:      Positional arguments of summary_assembler that
                          follow the scores; the first one is the list of
                          preprocessed sentences of the document.

    :returns:    A tuple : (doc_key, summary_sent_id)
                     doc_key: docset id + doc id
                     summary_sent_id: list of sentences index of the document
                     forming the summary for the given document.

    """
    sentences = all_args[0]
    scores = summarizer.summarize(sentences)
    summary_sent_id = summary_assembler(scores, *all_args)
    return (doc_key, summary_sent_id)
 

def doc_summarizer(docs, summarizer, nb_words, over_window, processes=1):
    """
    Dispatch over a pool the summarization.
    Each worker runs for a document (doc_key) a summary of
    the document content (sents[1]) - array of cleaned sentences.


    :param docs:           Tokenized corpus. Dictionary mapping a document
                           key to a tuple whose second item is the list of
                           cleaned sentences of the document.
    :param summarizer:     Summarizer process. See Summary_Processes directory.
                           Must expose ``preprocess`` and ``summarize``.
    :param nb_words:       Length of the summary, ie, the nb of word forming it.
    :param over_window:    Forwarded to summary_assembler: step between two
                           overlapping blocks, or 0 for plain sentences.
    :param processes:      Number of worker processes of the pool
                           (default 1, the historical value).

    :return:    A dictionary of predicted summaries.
                Maps a document key (docset + doc id) to a list of
                sentences index of the document forming the summary.
    """

    ## INIT ##
    # The summarizer is first prepared on the sentences of the whole corpus.
    corpus = [sen for sents in docs.values() for sen in sents[1]]
    summarizer.preprocess(corpus)
    print("Preprocess done")

    # One job per document; see wrapper for the layout of the arguments.
    jobs = [(doc_key, summarizer, (sents[1], nb_words, over_window))
            for doc_key, sents in docs.items()]

    # More processes speed the computation up at the cost of more CPU.
    with Pool(processes) as p:
        list_summary = p.starmap(wrapper, jobs)

    return dict(list_summary)

In [None]:
def get_summaries(dirpath):
    """
    Retrieve summaries from the given directory.

    Looks at the immediate sub directories of the given directory. For
    each of them, opens, reads and parses a file mapping multiple textual
    summaries to their source document. This file is named "perdocs.correct".
    If the file doesn't exist, a message is printed on stdout
    with the path to the missing file.
    A directory that cannot be listed (e.g. it does not exist) yields an
    empty result.

    :Example:

    dirpath/
        docset_0/
            perdocs.correct
        docset_1/
            perdocs.correct
        ...
        docset_n/
            perdocs.correct

    :param dirpath:    Path to the directory of summaries.
                       The name of a sub directory is a document set id
                       followed by one letter identifying the summarizer
                       (ex: d061j -> document set d061, summarizer j).

    :return:    summary_corpus: Dictionary mapping a string to a dictionary.
                    The key is "<document set id>/<document id>", the value
                    maps the letter of a summarizer to its summary text.

    :Example:

    {'d061/AP880911-0016':
        {'b': 'summary text',
         'i': ''
        },
     'd061/AP880912-0095':
     ...
    }
    """

    # Only the first level of the tree is needed. The default keeps a
    # missing directory from leaking StopIteration to the caller.
    _, subdirnames, _ = next(os.walk(dirpath), (dirpath, [], []))

    summary_corpus = defaultdict(dict)

    for subdirname in subdirnames:
        index_path = os.path.join(dirpath, subdirname, "perdocs.correct")
        try:
            stream = parse_file(index_path, SummaryHandler())
        except FileNotFoundError:
            print("%s does not exist" % index_path)
            continue
        # parse_file returns None for a malformed file; nothing to add then.
        if not stream:
            continue
        # Last letter: summarizer; everything before it: document set.
        docset, annotator = subdirname[:-1], subdirname[-1]
        for doc_id, sum_text in stream:
            summary_corpus[docset + "/" + doc_id][annotator] = sum_text
    return summary_corpus

In [None]:
def summary_doc_linker(summaries, docs):
    """
    Associate the sentences of each reference summary with sentences of
    the summarized document.

    A TF-IDF vectorizer is fitted on all the cleaned sentences of ``docs``.
    For each summary, the value stored is ``np.argmax(M, axis=0)`` where M
    is ``cos_sim`` of the document vectors against the summary vectors
    (presumably, for each summary sentence, the index of the most similar
    document sentence - verify against the definition of ``cos_sim``).
    Documents absent from ``docs`` are reported on stdout and skipped.

    :param summaries:    Dictionary mapping document set id and document id to multiple
                         summaries (ex : output of get_summaries).

    :param docs:         Dictionary mapping document set id and document id to a
                         tuple whose second item is the list of cleaned sentences.

    :return:    Dictionary mapping a document key to a dictionary mapping
                a summary key to the computed array of indices.

    .. seealso:: get_summaries
    """
    tfidf_vectorizer = TfidfVectorizer()
    all_cleaned = [sen for sents in docs.values() for sen in sents[1]]
    tfidf_vectorizer.fit(all_cleaned)

    overall = defaultdict(lambda : {})

    for ref in summaries:
        try:
            doc_vectors = tfidf_vectorizer.transform(docs[ref][1])
            for anot in summaries[ref]:
                # Cleaned sentences of this reference summary.
                gold = tokenizer_cleaner({ref: summaries[ref][anot]})[ref][1]
                gold_vectors = tfidf_vectorizer.transform(gold)
                similarity = cos_sim(doc_vectors, gold_vectors)
                overall[ref][anot] = np.argmax(similarity, axis=0)
        except KeyError as e:
            print("File not parsed :", e)

    return overall

In [None]:
from IPython.core.display import display, HTML

from IPython.display import clear_output
def html_gen(docs, resumes):
    """
    Display, as HTML, each document with its extracted summaries highlighted.

    Only the documents present both in ``docs`` and in the first summary
    set are displayed. Every sentence is coloured according to the summary
    sets that contain its index: the first set drives the red channel, the
    second the green one and the third the blue one. Summary sets beyond
    the third are ignored.

    :param docs:       Dictionary mapping a document key to a tuple whose
                       first item is the list of sentences of the document.
    :param resumes:    Sequence of 1 to 3 dictionaries, each mapping a
                       document key to the indices of its summary sentences.
    """
    # Local import: the document text must be escaped before being put in
    # markup, otherwise a '<' or '&' in a sentence would break the rendering.
    from html import escape

    nb_resumes = len(resumes)
    span_template = ('<span style="color:rgb({},{},{})'
                     ';background-color:rgb(255,255,255)">{} </span>')

    for doc in set(docs.keys()).intersection(set(resumes[0].keys())):
        parts = ['<h2>' + escape(doc) + '</h2><p style="text-align:justify">']

        for i, sentence in enumerate(docs[doc][0]):
            # 160 when the sentence belongs to the summary, 0 otherwise.
            m1 = 160 * (i in resumes[0][doc])
            m2 = 160 * (i in resumes[1][doc]) if nb_resumes > 1 else 0
            m3 = 160 * (i in resumes[2][doc]) if nb_resumes > 2 else 0
            parts.append(span_template.format(m1, m2, m3, escape(sentence)))

        display(HTML("".join(parts) + "</p>"))