# Utilities for the DUC corpus

# TODO

- [ ] Use the vectorizer directly in the pipeline
- [ ] `summary_doc_linker`: pass the vectorizer as a parameter

## imports

In [None]:
import os
import random
import xml.sax as sax
from collections import defaultdict
from xml.sax import SAXParseException


%run datasets_utils/preprocessors.ipynb

## Global variables: paths

In [None]:
# Path to the data folder
DATA_PATH = "../data"

# Path to the dataset
DATASET_PATH = os.path.join(DATA_PATH, "DUC")

# Path to the documents
DOCS_DIR = os.path.join(DATASET_PATH, "docs")

# Path to the summaries
SUMMARIES_DIR = os.path.join(DATASET_PATH, "summaries")

# Path to the directory in which intermediate data will be stored
INTERMEDIATE_FILE_PATH = os.path.join(DATASET_PATH, "duc-intermediate/")

# Create the intermediate directory (and any missing parent) up front.
# exist_ok=True replaces the former "exists() then makedirs()" pair, which
# could fail if the directory appeared between the check and the creation.
os.makedirs(INTERMEDIATE_FILE_PATH, exist_ok=True)

## Generate corpus

### Read and parse the documents.

In [None]:
class TextHandler(sax.ContentHandler):
    """
    Custom Handler for parsing the documents of the duc corpus.

    Accumulates, in ``_result``, the character data found inside ``TEXT``
    and ``LEADPARA`` elements. Newlines are replaced by spaces; character
    data outside those elements is ignored.
    """

    # Names of the elements whose character data is kept.
    _CONTENT_TAGS = frozenset({"TEXT", "LEADPARA"})

    def __init__(self):
        # Let the base class initialise its own state before adding ours.
        super().__init__()
        # Number of currently open TEXT / LEADPARA elements. A counter (not
        # just a flag) keeps capturing the remainder of an outer element
        # after a nested one has been closed.
        self._depth = 0
        # True while the parser is inside at least one content element.
        self._activeParse = False
        # Text collected so far.
        self._result = ""

    def startDocument(self):
        pass

    def startElement(self, name, attrs):
        """Start capturing when a content element is opened."""
        if name in self._CONTENT_TAGS:
            self._depth += 1
            self._activeParse = True

    def endElement(self, name):
        """Stop capturing once the outermost content element is closed."""
        if name in self._CONTENT_TAGS:
            self._depth -= 1
            self._activeParse = self._depth > 0

    def characters(self, content):
        """Append character data to the result while capturing is active."""
        if self._activeParse:
            self._result += content.replace('\n', " ")

In [None]:
def parse_file(file, handler):
    """
    Read and parse the given XML file with a SAX handler.

    Shields the caller from parser errors: when the file is not
    well-formed, a message is printed on stdout and ``None`` is returned
    instead of raising.

    :param file:       Path to the file to parse (read as UTF-8).
    :param handler:    Handler for the xml sax parser.
                       Must implement one of the xml.sax.handler
                       base classes and expose a ``_result`` attribute,
                       which holds the return value.

    :return:    ``handler._result`` on success, ``None`` when the parser
                raises ``SAXParseException``.

    Only ``SAXParseException`` is caught here; any other error (for example
    ``FileNotFoundError`` raised by ``open``) propagates to the caller.
    """
    try:
        # The context manager guarantees the file handle is released,
        # whether parsing succeeds or fails.
        with open(file, encoding='utf-8') as stream:
            sax.parse(stream, handler)
    except SAXParseException:
        print(file, "contains some errors")
        return None
    return handler._result

### Read and parse the summaries

In [None]:
class SummaryHandler(sax.ContentHandler):
    """
    Custom Handler for parsing the summaries of the duc corpus.

    For every ``SUM`` element, stores in ``_result`` a tuple
    ``(doc_ref, text)`` where ``doc_ref`` is the element's ``DOCREF``
    attribute (surrounding spaces stripped) and ``text`` is the element's
    character data with newlines replaced by spaces.
    """

    def __init__(self):
        # Let the base class initialise its own state before adding ours.
        super().__init__()
        # True while the parser is inside a SUM element.
        self._activeParse = False
        # Text of the SUM element currently being read.
        self._charBuffer = ""
        # DOCREF of the SUM element currently being read.
        self._active_doc = ""
        # List of (doc_ref, summary_text) tuples, in document order.
        self._result = []

    def startDocument(self):
        pass

    def startElement(self, name, attrs):
        """Open a new summary: remember its DOCREF and reset the buffer."""
        if name == "SUM":
            self._activeParse = True
            # A SUM element without a DOCREF attribute raises KeyError here.
            self._active_doc = attrs["DOCREF"].strip(" ")
            self._charBuffer = ""

    def endElement(self, name):
        """Close the current summary and store it in the result list."""
        if name == "SUM":
            self._activeParse = False
            self._result.append((self._active_doc, self._charBuffer))

    def characters(self, content):
        """Append character data to the current summary while inside SUM."""
        if self._activeParse:
            self._charBuffer += content.replace('\n', " ")

In [None]:
def get_summaries(dirpath):
    """
    Retrieve the summaries stored under the given directory.

    Looks at every immediate sub-directory of ``dirpath`` and parses the
    file named "perdocs.correct" it contains; that file maps several
    textual summaries to their source documents. When the file does not
    exist, a message with the path of the missing file is printed on
    stdout and the sub-directory is skipped.

    :Example:

    dirpath/
        docset_0/
            perdocs.correct
        docset_1/
            perdocs.correct
        ...
        docset_n/
            perdocs.correct

    :param dirpath:    Path to the directory of summaries.
                       Sub-directories reference a topic / document set
                       (ex: d061 -> "Gilbert's Hurricane").

    :return:    summary_corpus: Dictionary mapping a string to a dictionary.
                    The outer key is the sub-directory name without its last
                    character, followed by "/" and the document id.
                    The inner key is the last character of the sub-directory
                    name (a letter referencing a summarizer); the inner value
                    is the summary text.

    :Example:

    {'d061j/AP880911-0016':
        {'b': 'summary text',
         'i': ''
        },
     'd061j/AP880912-0095':
     ...
    }
    """
    # Collect the summaries of the news files.
    # Only the first level of the tree is needed: the docset folders.
    _, docset_dirs, _ = next(os.walk(dirpath))

    summary_corpus = defaultdict(dict)

    for subdirname in docset_dirs:
        index_path = os.path.join(dirpath, subdirname, "perdocs.correct")
        try:
            entries = parse_file(index_path, SummaryHandler())
        except FileNotFoundError:
            print("%s does not exist" % index_path)
            continue
        if not entries:
            # Unparsable file (None) or file without any summary.
            continue
        # The last character of the folder name identifies the summarizer;
        # the rest identifies the document set.
        docset_id, summarizer = subdirname[:-1], subdirname[-1]
        for doc_id, sum_text in entries:
            summary_corpus[docset_id + "/" + doc_id][summarizer] = sum_text
    return summary_corpus

### Overall: link summaries to their source documents

In [None]:
def summary_doc_linker(summaries, docs) :
    """
    Associate each summary with positions in its source document.

    A TF-IDF vectorizer is fitted on every sentence of ``docs``. Then, for
    each reference of ``summaries`` and each of its summaries, the document
    sentences and the cleaned summary are vectorized, compared with
    ``cos_sim`` and reduced with ``np.argmax(..., axis=0)``.
    NOTE(review): this presumably yields, for each summary sentence, the
    index of the most similar document sentence -- confirm against the
    definitions of ``cos_sim`` and ``tokenizer_cleaner``.

    A ``KeyError`` raised while processing a reference (typically a
    reference absent from ``docs``) is reported on stdout and the reference
    is skipped.

    :param summaries:    Dictionary mapping document set id and document id to multiple
                         summaries (ex : output of get_summaries).

    :param docs:         Dictionary mapping document set id and document id to an
                         array of sentences.

    :return:    overall: Dictionary mapping a reference to a dictionary, itself
                         mapping a summary key to the ``np.argmax`` result.

    .. seealso:: get_summaries
    """
    # Link the news corpus with the news summary files.

    # Fit the vocabulary on the sentences of the whole corpus.
    all_sentences = []
    for sentences in docs.values():
        all_sentences.extend(sentences)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(all_sentences)

    overall = defaultdict(dict)

    for ref in summaries:
        try:
            doc_matrix = tfidf_vectorizer.transform(docs[ref])
            annotations = summaries[ref]
            for anot in annotations:
                cleaned = tokenizer_cleaner({ref: annotations[anot]})
                # First value returned by the cleaner, second component.
                gold = list(cleaned.values())[0][1]
                print("DEBUG", "SUMMARY DOC LINKER", "GOLD", gold)
                gold_matrix = tfidf_vectorizer.transform(gold)
                similarities = cos_sim(doc_matrix, gold_matrix)
                overall[ref][anot] = np.argmax(similarities, axis=0)
        except KeyError as e:
            print("File not parsed :", e)

    return overall

### Bring it all together

In [None]:
def generate_corpus_duc(language, sampling = 1 ):
    """
    Generate a corpus from the DUC dataset with documents and summaries.

    Every file found in the sub-folders of ``DOCS_DIR`` is parsed with
    ``TextHandler``; files directly inside ``DOCS_DIR`` are not read. Files
    that cannot be parsed or that yield no text are skipped. The summaries
    are then loaded from ``SUMMARIES_DIR`` with ``get_summaries``.

    :param language:    Currently unused by this function (the tokenization
                        step that would consume it is disabled).

    :param sampling:    float. Threshold. Must be between 0.0 and 1.0
                        For each document in the data set, a random number
                        is drawn (between 0 and 1). If smaller than the
                        threshold, the document is kept in the final corpus.
                        Else, it's discarded.

    :return:    doc_corpus: Dictionary mapping a string to a string.
                      Maps "<docset folder name>/<file name>"
                      (ex: 'd061j/AP880911-0016') to the raw text extracted
                      from the document.
                gold_sum_dict: Dictionary mapping a string to a dictionary
                      (output of get_summaries for SUMMARIES_DIR).
    """
    # Used for generating the corpus from duc
    ## Load the documents
    doc_corpus = {}
    walker = os.walk(DOCS_DIR)
    # Skip the top-level entry: only files in sub-folders are documents.
    next(walker)

    # Read all documents in the sub-folders of the original folder
    for docset, _, docnames in walker:
        # basename() is separator-agnostic, unlike splitting on "/".
        docset_name = os.path.basename(docset)
        for docname in docnames:
            parsed_doc = parse_file(os.path.join(docset, docname), TextHandler())
            # The random draw only happens for successfully parsed documents.
            if parsed_doc and random.random() < sampling:
                # Keys are always joined with "/" so that they match the
                # keys built by get_summaries on every platform.
                doc_corpus[docset_name + "/" + docname] = parsed_doc
    print("Loading done")

    # TODO: the steps below are currently disabled.
    #Sentence Tokenization of the corpus
    #tokenizer = Tokenizer(language, method, len_sen, over, min_doc_len=3)
    #tokenized_docs = run_pipeline(doc_corpus, _doc_wrapper, {"tokenizer": tokenizer})

    #Cleaning part
    #tokenized_docs = {k : tokenized_docs[k] for k in tokenized_docs if len(tokenized_docs[k])> 3}

    #Generating summaries
    gold_sum_dict = get_summaries(SUMMARIES_DIR)
    #overall = summary_doc_linker(summary_corpus, tokenized_docs)
    #gold_tokenized_summaries = {x : tokenizer_cleaner(summary_corpus[x]) for x in summary_corpus}

    return doc_corpus, gold_sum_dict#, overall