# DMOZ utils

# TODO

- [ ] Integrate the vectorizer directly into the pipeline

## Imports

In [None]:
import os
import random

%run datasets_utils/preprocessors.ipynb

## Global variables (paths, etc.)

In [None]:
# Path to data folder
DATA_PATH = "../data"

# Path to the dataset
DATASET_PATH = os.path.join(DATA_PATH, "dmoz")

# Path to the documents file.
# NOTE: despite the _DIR suffix this is a single TSV file; it is passed to open().
DOCS_DIR = os.path.join(DATASET_PATH, "dmoz-fr-content.tsv")

# Path to the summaries file.
# NOTE: despite the _DIR suffix this is also passed to open() as a file.
SUMMARIES_DIR = os.path.join(DATASET_PATH, "dmoz-fr-description")

# Path to directory in which intermediate data will be stored
INTERMEDIATE_FILE_PATH = os.path.join(DATASET_PATH, "dmoz-intermediate/")

# Create the intermediate directory (and any missing parents). exist_ok=True
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(INTERMEDIATE_FILE_PATH, exist_ok=True)

## Generate corpus

In [None]:
# Corpus generation from the DMOZ dataset files
def _iter_tsv_records(path):
    """
    Yield (key, data) pairs from a two-column, tab-separated text file.

    Only the trailing newline of each line is removed, so a final line
    without a newline keeps its last character. Blank lines are skipped.

    :param path:    Path of the file to read.

    :raises ValueError: If a non-blank line does not contain exactly one tab.
    """
    with open(path) as file:
        for line in file:
            line = line.rstrip("\n")
            if not line:
                continue
            key, data = line.split("\t")
            yield key, data


def generate_corpus(language, method, len_sen=10, over=4, sampling=1):
    """
    Generate a corpus from the DMOZ dataset with documents and summaries.

    Documents are read from DOCS_DIR and summaries from SUMMARIES_DIR; both
    are expected to hold one "key<TAB>text" record per line.

    :param language:    Language, forwarded to the Tokenizer.

    :param method:      String referencing a tokenize method, forwarded to
                        the Tokenizer. Expected values are 'nltk', 'brutal'
                        and 'overlap'. This parameter has no default.

    :param len_sen:     Number of words in a sentence.
                        Used by the 'brutal' and 'overlap' tokenizer.

    :param over:        Forwarded to the Tokenizer; used by the 'overlap'
                        tokenizer (presumably the overlap size between
                        consecutive sentences -- TODO confirm).

    :param sampling:    Threshold. Float. Must be between 0.0 and 1.0
                        For each document in the data set, a random number
                        is drawn (between 0 and 1). If smaller than the
                        threshold, the document is kept in the final corpus.
                        Else, it's discarded. Only documents are sampled;
                        all summaries are loaded.

    :return:    docs: Result of run_pipeline over the sampled documents
                      (per the original notes, a dictionary mapping a
                      document key to its parsed and tokenized document).
                golden_summaries: Dictionary mapping a document key to the
                      output of tokenizer_cleaner for its summary. Entries
                      with any empty tokenized summary are dropped.
                overall: Dictionary mapping every key present in both docs
                      and golden_summaries to an empty string.
    """
    # Load the documents, keeping each one with probability `sampling`.
    corpus = {}
    for key, data in _iter_tsv_records(DOCS_DIR):
        if random.random() < sampling:
            corpus[key] = data

    print("Loading done")

    # Sentence tokenizing part
    # NOTE(review): short documents used to be filtered out after this step
    # (len > 3); presumably min_doc_len=3 now covers that -- confirm.
    tokenizer = Tokenizer(language, method, len_sen, over, min_doc_len=3)
    docs = run_pipeline(corpus, _doc_wrapper, {"tokenizer": tokenizer})

    # Summaries generator
    # Stop-word removal is currently disabled: the stop-word set is empty.
    stops = set()
    summary_corpus = {}
    for key, data in _iter_tsv_records(SUMMARIES_DIR):
        # NOTE(review): going through a set removes duplicate words and does
        # not preserve the original word order of the summary.
        datac = " ".join(set(data.split()) - stops)
        summary_corpus[key] = {"m": datac}

    # Tokenize the summaries and keep only those with no empty entry.
    gold_tokenized_summaries = {}
    for key, summaries in summary_corpus.items():
        tokenized = tokenizer_cleaner(summaries)
        if all(len(tokenized[name]) > 0 for name in tokenized):
            gold_tokenized_summaries[key] = tokenized

    # Linking summaries and corpus
    shared_keys = set(docs.keys()).intersection(gold_tokenized_summaries.keys())
    overall = {key: "" for key in shared_keys}

    return docs, gold_tokenized_summaries, overall