# Utility functions to prepare the dmoz html dataset

## Imports

In [None]:
import os
import re
import pickle
import spacy
import matplotlib as mpl
import matplotlib.pyplot as plt
import unicodedata

from collections import defaultdict
from bs4 import BeautifulSoup

# Global variables

In [None]:
# Maximum number of worker processes handed to the multiprocessing Pool.
MAX_PROCESSES = 10  # 15 is OK when no other notebook is competing for the machine.

## Global path (datasets and intermediate data)

In [None]:
# Path to data folder
DATA_PATH = "./data"

# Path to the dataset
DATASET_PATH = os.path.join(DATA_PATH, "crawl-dmoz-fr-100000-html/")

# Path to directory in which intermediate data will be stored
INTERMEDIATE_FILE_PATH = os.path.join(DATA_PATH, "dmoz-html-intermediate/")

# exist_ok avoids the check-then-create race when several processes or
# notebooks run this cell at the same time.
os.makedirs(INTERMEDIATE_FILE_PATH, exist_ok=True)

In [None]:
def get_url(parsed_doc):
    """
    Return the canonical url of a parsed page, without its scheme.

    Looks up the first <link rel="canonical"> element and reads the href
    value out of the element's string form.

    :param parsed_doc:   Parsed document exposing a ``find(name, attrs)`` method.

    :return:   The href content following http:// or https://, or None when
               no canonical link is found or its href does not match.
    """
    canonical = parsed_doc.find("link", {"rel" : "canonical"})
    if canonical is None:
        return None
    match = re.search("href=\"https?://([^\"]+)\"", str(canonical))
    return match.group(1) if match else None

## Gold summary load utils

In [None]:
def load_gold_dmoz_html(bin_path=os.path.join(DATA_PATH, "data.p")):
    """
    Load the pickled gold summaries and drop the empty ones.

    :param bin_path:   Path to the pickle file holding a dictionary mapping
                       a url to its gold summary.

    :return:   Dictionary mapping a url to its gold summary, without the
               entries whose summary is '' or None.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally.
    with open(bin_path, 'rb') as gold_file:
        gold_sum_dict = pickle.load(gold_file)
    print("[DMOZ HTML][GOLD LOAD] Total of gold summaries loaded %d" % len(gold_sum_dict))
    # Remove entries (url keys) with empty description.
    gold_sum_dict = {url: gold for url, gold in gold_sum_dict.items() if gold != '' and gold is not None}
    print("[DMOZ HTML][GOLD LOAD] Total of non empty gold summaries loaded %d" % len(gold_sum_dict))
    return gold_sum_dict

## Document parsing

### Parse html pages

In [None]:
def parse_doc_dmoz_html(doc_folder=DATASET_PATH, sampling=1, part_id=None):
    """
    Parse the html dmoz documents. Dataset is split in parts, each holding multiple web pages.
    Reads each part and parses the pages assuming they are delimited by <html> ... </html> tags.

    :param doc_folder:   Path to the dataset directory holding html parts
                         (files whose name starts with part-<digits>).
    :param sampling:     Fraction of the parts to load. The part budget is
                         sampling * number of parts; parts are read, in
                         os.listdir order, while that budget is above 0.
    :param part_id:      Unused by this function; kept so existing callers
                         passing it keep working.

    :return:   Array of html web pages (strings).
    """
    part_names = [name for name in os.listdir(doc_folder) if re.match(r'part-[0-9]+', name)]
    html_list = []

    # Compute nb part to keep regarding the sampling param.
    # We assume parts are approximately of the same sizes.
    tot = len(part_names)
    perc = sampling * tot
    print("[DMOZ HTML][DOC PARSE] Loading %d / %d parts" % (perc, tot))

    for file_name in part_names:
        if perc <= 0:
            break

        # For each part, parse html pages
        filepath = os.path.join(doc_folder, file_name)
        print("[DMOZ HTML][DOC PARSE] Parsing %s" % filepath)
        # The context manager closes the part as soon as it has been read,
        # instead of leaking one file handle per part.
        with open(filepath, 'r', encoding='utf-8') as part_file:
            html_list += re.findall(r'<html[^>]*>.*?<\/html>', part_file.read(), re.DOTALL)

        perc -= 1

    print("[DMOZ HTML][DOC PARSE] Total of html page loaded %d" % len(html_list))
    return html_list

### Parse raw text from html and segments of interest

In [None]:
# Names of the tags that clean_soup removes from a parsed page.
stop_tags = ["script", "a"]
# Other candidates that were considered and left out:
#["div", "p", "body", "html", "table", "tr", "li", "ul", "td"]

# Names of the tags whose text get_interesting_segments collects; the same
# list gives get_html_bias its keys.
# NOTE(review): the original note here said "Maybe remove strong", but
# "strong" is not in the list below.
interesting_tags = ["h1", "title", "bold", "b", "i", "em", "mark"]

In [None]:
# Define the dictionary of weights
def get_html_bias():
    """
    Return a dictionary giving every tag of interest the same 0.8 weight.
    """
    return {tag: 0.8 for tag in interesting_tags}

In [None]:
def clean_soup(soup_doc):
    """
    Take every element whose tag name is listed in stop_tags out of the
    document, in place. Nothing is returned.
    """
    for tag_name in stop_tags:
        for element in soup_doc.find_all(tag_name):
            element.extract()

def get_interesting_segments(soup_doc):
    """
    Map each tag of interest to the stripped strings found inside the
    elements carrying that tag name.

    :return:   Dictionary with one key per entry of interesting_tags; the
               value is a (possibly empty) list of strings.
    """
    return {
        tag_name: [
            text
            for element in soup_doc.find_all(tag_name)
            for text in element.stripped_strings
        ]
        for tag_name in interesting_tags
    }

def extract_text_and_segments(soup_list):
    """
    Split every parsed page into its text content and its segments of
    interest, both keyed by the page's canonical url.

    Pages for which get_url finds no url are skipped. Each kept page is
    cleaned with clean_soup (which modifies it in place) before extraction.

    :param soup_list:   Iterable of parsed pages.

    :return:   docs: defaultdict mapping a url to the list of stripped strings of the page.
               segments: defaultdict mapping a url to the output of get_interesting_segments.
    """
    texts_by_url = defaultdict(dict)
    segments_by_url = defaultdict(dict)
    for page in soup_list:
        page_url = get_url(page)
        if page_url is not None:
            clean_soup(page)
            segments_by_url[page_url] = get_interesting_segments(page)
            texts_by_url[page_url] = list(page.stripped_strings)
    return texts_by_url, segments_by_url

## Spacy

In [None]:
# Last spaCy model returned by get_spacy_model (kept for code reading `sp`).
sp = None

# spaCy package name for each supported language.
_SPACY_MODEL_NAMES = {
    "french": "fr_core_news_sm",
    "english": "en_core_web_sm",
}

# Loaded models, keyed by language, so each one is loaded only once.
_SPACY_MODELS = {}


def get_spacy_model(language):
    """
    Return the spaCy model for the given language, loading it on first use.

    Models are cached per language: asking for "english" after "french" has
    been loaded returns the English model, not the cached French one.

    :param language:   "french" or "english".

    :return:   The loaded spaCy model, or None (after printing a message)
               when the language is not supported.
    """
    global sp
    model_name = _SPACY_MODEL_NAMES.get(language)
    if model_name is None:
        print("Unknown spacy language %s" % language)
        return None
    if language not in _SPACY_MODELS:
        _SPACY_MODELS[language] = spacy.load(model_name)
    sp = _SPACY_MODELS[language]
    return sp

# PipeBlock

* Generalize for all dataset a pipeline of processes.
* Easier to multiprocess.

=> Base "abstract" class

In [None]:
from abc import ABC, abstractmethod


class PipeBlock(ABC):
    """
    Abstract base class of a pipeline step.

    A concrete block must define both ``__init__`` (its configuration) and
    ``__call__`` (the transformation applied to one document). Inheriting
    from ABC is what makes the @abstractmethod markers effective: a subclass
    missing one of the two methods cannot be instantiated.
    """

    @abstractmethod
    def __init__(self):
        ...

    @abstractmethod
    def __call__(self, doc):
        """
        Transform one document and return the result.
        """
        ...

## Text normalisation

Define an interface in order to multiprocess the full pipeline.

### Tokenization

In [None]:
def generic_cleaner(sentence) :
    """
    Reduce a sentence to lower-case ASCII letters and digits separated by
    single spaces.

    The text is NFKD-normalized, characters that cannot be encoded in ASCII
    are dropped (this removes diacritics), every remaining character that is
    not a letter or a digit becomes a space, whitespace runs are collapsed,
    and the result is lower-cased and stripped.
    """
    decomposed = unicodedata.normalize('NFKD', sentence)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode("utf-8")
    alnum_only = re.sub(r"[^a-zA-Z0-9]"," ", ascii_only)
    single_spaced = re.sub(r"\s+"," ", alnum_only)
    return single_spaced.lower().strip()

def tokenizer_cleaner(doc, language='french') :
    """
    Split a document into sentences and clean each of them.

    Sentences are produced by sent_tokenize, cleaned with generic_cleaner,
    and kept only when the cleaned sentence holds more than one word.

    NOTE(review): sent_tokenize is not imported in the visible part of this
    file; presumably it is nltk.tokenize.sent_tokenize - confirm.

    :param doc:        The document, as handed to sent_tokenize.
    :param language:   Language name forwarded to sent_tokenize.

    :return:   List of cleaned sentences (strings).
    """
    cleaned_sentences = (
        generic_cleaner(raw_sentence)
        for raw_sentence in sent_tokenize(doc, language=language)
    )
    return [sentence for sentence in cleaned_sentences if len(sentence.split()) > 1]

def brutal_tokenizer(doc, n) :
    """
    Cut the cleaned document into consecutive "sentences" of exactly n words.

    Words left over after the last full group of n are not returned.

    :param doc:   The document (string), cleaned with generic_cleaner first.
    :param n:     Number of words per sentence.

    :return:   List of strings, each joining n words with single spaces.
    """
    words = generic_cleaner(doc).split(" ")
    full_chunks = len(words) // n
    return [
        " ".join(words[start:start + n])
        for start in range(0, full_chunks * n, n)
    ]

def overlap_tokenizer(doc, block_size, over_window) :
    """
    Cut the cleaned document into overlapping windows of words.

    Window k starts at word k * over_window. A document with fewer than
    block_size words yields no window at all.

    NOTE(review): each window slices block_size + 1 words, not block_size -
    confirm this is intended.

    :param doc:           The document (string), cleaned with generic_cleaner first.
    :param block_size:    Window size parameter (see the note above).
    :param over_window:   Number of words between two consecutive window starts.

    :return:   List of strings, one per window.
    """
    words = generic_cleaner(doc).split(" ")
    if len(words) < block_size:
        return []
    window_count = (len(words) - block_size) // over_window + 1
    window_starts = (index * over_window for index in range(window_count))
    return [
        " ".join(words[start:start + block_size + 1])
        for start in window_starts
    ]

def spacy_tokenizer(doc, language):
    """
    Clean every sentence of a document and run it through the spaCy model.

    :param doc:        Iterable of sentences (strings).
    :param language:   Language name forwarded to get_spacy_model.

    :return:   List holding, for each sentence, the object returned by the
               spaCy model; results of length 1 or less are left out.
    """
    nlp = get_spacy_model(language)
    parsed_sentences = (nlp(generic_cleaner(sentence)) for sentence in doc)
    return [parsed for parsed in parsed_sentences if len(parsed) > 1]

class Tokenizer():
    """
    Pipeline step turning a document into a list of sentences.

    :param method:     'spacy', 'nltk', 'brutal' or 'overlap'.
    :param language:   Language name handed to the 'spacy' and 'nltk' tokenizers.
    :param len_sen:    Words per sentence, used by 'brutal' and 'overlap'.
    :param over:       Step between two windows, used by 'overlap'.
    """
    def __init__(self, method='spacy', language="french", len_sen=10, over=4):
        self.method = method
        self.language = language
        self.len_sen = len_sen
        self.over = over

    def __call__(self, doc):
        """
        Tokenize one document.

        :param doc:   For 'spacy', an iterable of sentences (strings); for the
                      other methods, the document as a single string.

        :return:   The tokenized document, as returned by the selected tokenizer.

        :raises ValueError:   When self.method is not a known method.
        """
        #Sentence Tokenization of the corpus
        if self.method == 'nltk':
            # Forward the configured language instead of silently relying on
            # the tokenizer's 'french' default.
            return tokenizer_cleaner(doc, language=self.language)
        if self.method == 'brutal':
            return brutal_tokenizer(doc, self.len_sen)
        if self.method == 'overlap':
            return overlap_tokenizer(doc, self.len_sen, self.over)
        if self.method == 'spacy':
            return spacy_tokenizer(doc, self.language)
        # Fail with an explicit error rather than an UnboundLocalError on
        # the return statement.
        raise ValueError("Tokenizer method not accepted: %s" % self.method)

In [None]:
class RemoveStopWords(PipeBlock):
    """
    Pipeline step removing stop words from a tokenized document.

    :param method:     'spacy' or 'nltk'; selects where self.stop_words comes from.
    :param language:   Language name ("french" selects the French spaCy
                       stop words, anything else the English ones).

    :raises ValueError:   When method is not a known method.
    """
    def __init__(self, method, language):
        self.method = method
        self.language = language
        if method == 'spacy':
            # Load the model before touching spacy.lang.<xx>; presumably this
            # is what makes the language submodule available - TODO confirm.
            get_spacy_model(language)
            spacy_model = spacy.lang.fr if language == "french" else spacy.lang.en
            self.stop_words = spacy_model.stop_words.STOP_WORDS
        elif method == 'nltk':
            # NOTE(review): nltk is not imported in the visible part of this file.
            self.stop_words = nltk.corpus.stopwords.words(language)
        else:
            # Explicit error instead of the AttributeError the unset
            # self.method / self.stop_words used to trigger.
            raise ValueError("StopWords method not accepted: %s" % method)

    def __call__(self, doc):
        """
        Drop the stop words of every sentence.

        Stop words are detected through each token's ``is_stop`` flag;
        self.stop_words is not consulted here.

        :param doc:   Iterable of sentences, each an iterable of tokens
                      exposing ``is_stop`` and ``text``.

        :return:   List of strings, one per sentence, joining the kept
                   tokens with single spaces.
        """
        # Token.text replaces the former Token.string attribute, which is no
        # longer available in current spaCy releases.
        return [
            " ".join(token.text for token in sent if not token.is_stop)
            for sent in doc
        ]

In [None]:
class Lemmer(PipeBlock):
    """
    Pipeline step lemmatizing documents.

    :param method:     Lemmatization backend; only "spacy" is handled.
    :param language:   Language name forwarded to the backend.
    """
    def __init__(self, method='spacy', language="french"):
        self.method = method
        self.language = language

    def __call__(self, docs):
        """
        Lemmatize the given documents.

        NOTE(review): spacy_lemmer is not defined in the visible part of
        this file - confirm where it comes from.

        :param docs:   Documents, passed as is to spacy_lemmer.

        :return:   Whatever spacy_lemmer returns.

        :raises ValueError:   When self.method is not a known method.
        """
        if self.method == "spacy":
            return spacy_lemmer(docs, self.language)
        # Explicit error instead of an UnboundLocalError on the return.
        raise ValueError("Lemmer method not accepted: %s" % self.method)
            

## Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

class Tfidf(PipeBlock):
    """
    Pipeline step turning sentences into tf-idf vectors.

    The vectorizer is fitted once, at construction time, on the whole corpus.

    :param corpus:     Iterable of documents (strings) used to fit the vectorizer.
    :param language:   "french" selects the French spaCy stop words,
                       anything else the English ones.
    """
    def __init__(self, corpus, language):
        stop_w = spacy.lang.fr.stop_words.STOP_WORDS if language == "french" else spacy.lang.en.stop_words.STOP_WORDS
        # scikit-learn documents stop_words as 'english', a list or None,
        # whereas spaCy exposes a set: hand over a (sorted, hence
        # reproducible) list.
        self.vectorizer = TfidfVectorizer(stop_words=sorted(stop_w))
        self.vectorizer.fit(corpus)

    def __call__(self, doc):
        """
        Return the tf-idf representation of the sentences of one document.
        """
        return self.vectorizer.transform(doc)

    def get_vocabulary(self):
        """
        Return the fitted vocabulary (the vectorizer's ``vocabulary_``).
        """
        return self.vectorizer.vocabulary_

## Bias on vector representation

Define a class representing the bias for the corpus.

In [None]:
def _build_vocab_doc_bias(doc_url, tag_map, bias_weight, vocab):
    for tag, tokens in tag_map.items():
        if (len(tokens) == 0):
            continue
        tokens = " ".join(tokens).split()
        doc_vocab_bias = { vocab[word] : bias_weight[tag] for word in tokens if word in vocab.values()}
        return (doc_url, tag_map)

def build_vocab_bias(vocab, doc_bias_terms, bias_weight, vocab_bias_file = None):
    """
    Build, or reload from disk, the vocabulary bias of every document.

    When vocab_bias_file exists, its content is returned and nothing is
    computed. Otherwise the bias is computed with _build_vocab_doc_bias on
    every document and, when vocab_bias_file is given, saved there.

    :param vocab:            Dictionary mapping a word to its feature index.
    :param doc_bias_terms:   Dictionary mapping a document key to a dictionary
                             mapping a bias element (e.g. an html tag of
                             interest) to words - i.e. the segments.
    :param bias_weight:      Dictionary mapping a bias element to its value (weight).
    :param vocab_bias_file:  Optional path of the pickle file used as cache.

    :return:   Dictionary mapping a document key to its bias, whether it
               comes from the cache file or from a fresh computation.
    """
    if vocab_bias_file is not None and os.path.exists(vocab_bias_file):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced locally.
        with open(vocab_bias_file, 'rb') as cache_file:
            # dict() accepts both a dictionary and the list of pairs that
            # earlier runs may have stored.
            return dict(pickle.load(cache_file))

    vocab_bias = dict(multip_for(_build_vocab_doc_bias, doc_bias_terms, (bias_weight, vocab)))

    if vocab_bias_file is not None:
        with open(vocab_bias_file, 'wb') as cache_file:
            pickle.dump(vocab_bias, cache_file)

    return vocab_bias
    

def _apply_vocab_doc_bias(doc_url, doc, vocab_bias):
    """
    Given a vector representation of the document (each sentence is a vector of the vocab size), 
    adds to a word its bias weight.
    """
    for word_id, weigth in vocab_bias[doc_url].items():
        if doc[word_id] != 0:
            doc[word_id] += weight
    # Normalisation ??
    return (doc_url, doc)

def apply_vocab_bias(docs, vocab_bias):
    """
    Run _apply_vocab_doc_bias over every document, through multip_for.

    :param docs:         Dictionary mapping a document key to its representation.
    :param vocab_bias:   Bias dictionary handed to _apply_vocab_doc_bias for every document.

    :return:   Dictionary built from the (key, document) pairs returned by
               the workers.
    """
    biased_pairs = multip_for(_apply_vocab_doc_bias, docs, [vocab_bias])
    return dict(biased_pairs)

### Build overall dictionary

Keeps track of document and its summary.

In [None]:
def make_overall_dmoz_html(docs, gold_sum_dict):
    """
    Build the overall dictionary: one empty-string entry per url present
    both in the documents and in the gold summaries.

    Also prints the number of gold summaries, documents and shared urls.
    """
    shared_urls = set(docs.keys()).intersection(gold_sum_dict.keys())
    overall = dict.fromkeys(shared_urls, "")
    print(len(gold_sum_dict), len(docs), len(overall))
    # With 100% of the dataset loaded, every gold summary is expected to
    # have its document (len(gold_sum_dict) == len(overall)).
    return overall
        

## Multiprocessing

In [None]:
from multiprocessing import Pool

def _segments_wrapper(doc_url, segments, pipeline):
    for name, transform in pipeline.items():
        if name == "vectorizer":
            break
        for tag, seg in segments.items():
            segments[tag] = transform(seg)
    return (doc_url, segments)

def _doc_wrapper(doc_url, doc, pipeline):
    for name, transform in pipeline.items():
        doc = transform(doc)
    return (doc_url, doc)

def run_pipeline(corpus, wrapper, pipeline=None, verbose=0):
    """
    Run the pipeline over every document of the corpus, in a process pool
    of MAX_PROCESSES workers.

    :param corpus:   Dictionary mapping a doc_key to its content.
    :param wrapper:  Function called as wrapper(doc_key, content, pipeline)
                     and returning a (doc_key, result) pair; it adapts the
                     blocks to the shape of the data.
    :param pipeline: The blocks to apply, passed untouched to the wrapper.
    :param verbose:  Not used by this function.

    :return:   Dictionary built from the pairs returned by the wrapper.
    """
    jobs = [(doc_url, doc_sents, pipeline) for doc_url, doc_sents in corpus.items()]
    with Pool(MAX_PROCESSES) as workers:
        results = workers.starmap(wrapper, jobs)
    return dict(results)


def multip_for(func, it_dict, args):
    """
    Call func on every item of a dictionary, in a process pool of
    MAX_PROCESSES workers.

    :param func:     Function called as func(key, value, *args).
    :param it_dict:  Dictionary to loop over.
    :param args:     Extra positional arguments appended to every call.

    :return:   List of the values returned by func.
    """
    jobs = [(key, value, *args) for key, value in it_dict.items()]
    with Pool(MAX_PROCESSES) as workers:
        return workers.starmap(func, jobs)
    

### Pull it all together for the Resume_Interface

In [None]:
def generate_corpus_dmoz_html(method='spacy', len_sen=10, over=4, sampling=1):
    """
    Generate a corpus from the dmoz dataset with documents and summaries.

    Steps, in order: load the gold summaries, read the html parts, parse each
    page with BeautifulSoup, extract the text and the segments of interest,
    run the tokenizer / stop words / tf-idf pipeline over the segments and
    the documents, then build and apply the per-document vocabulary bias.

    NOTE(review): method, len_sen and over are accepted but never read by
    the body; the pipeline is always built from the spacy French blocks.

    :param method:      Name of a tokenize method ('spacy', 'nltk', 'brutal',
                        'overlap'). Currently unused.

    :param len_sen:     Number of words in a sentence, for the 'brutal' and
                        'overlap' tokenizers. Currently unused.

    :param over:        Parameter of the 'overlap' tokenizer. Currently unused.

    :param sampling:    Forwarded as is to parse_doc_dmoz_html.

    :return:    docs: Dictionary returned by apply_vocab_bias.
                gold_sum_dict: Dictionary returned by load_gold_dmoz_html.
                overall: Dictionary returned by make_overall_dmoz_html for
                      the extracted documents and gold_sum_dict.
    """
    # Gold summaries first, then the raw html pages.
    gold_sum_dict = load_gold_dmoz_html()
    pages = parse_doc_dmoz_html(sampling=sampling)

    # The soup parsing is kept in the main process: per the original
    # author's note, the parsed trees are too big to be serialized and sent
    # back from worker processes.
    print("[DMOZ HTML][SOUP PARSE] Parsing %d docs with soup." % len(pages))
    soups = [BeautifulSoup(page) for page in pages]
    print("[DMOZ HTML][SOUP PARSE] Done")

    # All the text of a page is extracted, along with its segments of interest.
    # TODO: See Readability.js to strengthen the quality / choice of the text extracted.
    docs, segments = extract_text_and_segments(soups)

    # Build overall
    print("[DMOZ HTML][OverALL] Starting")
    overall = make_overall_dmoz_html(docs, gold_sum_dict)
    print("[DMOZ HTML][OverALL] Done")

    # The tf-idf block is fitted on one string per document.
    corpus = [" ".join(sentences) for sentences in docs.values()]
    print("[DMOZ HTML][PREP] Preprocessing pipeline")

    # Step order matters: the dictionary is walked in insertion order.
    # (A lemmer step existed here and is currently disabled.)
    # PCA on vocabulary to reduce its size ?
    normalisation_pipeline = {
        "tokenizer": Tokenizer(method="spacy", language="french"),
        "stopwords": RemoveStopWords(method="spacy", language="french"),
        "vectorizer": Tfidf(corpus, language="french"),
    }

    # Segments go through the steps preceding the vectorizer.
    print("[DMOZ HTML][PREP] segments")
    segments = run_pipeline(segments, _segments_wrapper, normalisation_pipeline)

    # Documents go through the whole pipeline, vectorizer included.
    print("[DMOZ HTML][PREP] docs")
    docs = run_pipeline(docs, _doc_wrapper, normalisation_pipeline)
    print("[DMOZ HTML][PREP] Done")

    # Build the bias of every document from its segments, then apply it.
    print("[DMOZ HTML][Bias] Building vocab bias for each document")
    vocab = normalisation_pipeline["vectorizer"].get_vocabulary()
    bias_cache = os.path.join(INTERMEDIATE_FILE_PATH, "vocab_bias_part-00068")
    vocab_bias = build_vocab_bias(vocab, segments, get_html_bias(), bias_cache)
    print("[DMOZ HTML][Bias] Applying biais to documents")
    docs = apply_vocab_bias(docs, vocab_bias)
    print("[DMOZ HTML][Bias] Done")

    # Return value following other dataset generation.
    return docs, gold_sum_dict, overall

In [None]:
# Build the corpus; sampling=0.01 is forwarded to parse_doc_dmoz_html, which
# uses it as the fraction of dataset parts to load.
docs, gold_sum_dict, overall = generate_corpus_dmoz_html(sampling=0.01)

## Document analysis

In [None]:
def get_tag_frequency(html_list):
    """
    Count the opening tags found in a list of html pages.

    A tag is matched when it starts with "<" followed by a character other
    than ! < / - ? and holds no "/", ":" or newline before its closing ">".
    Closing tags and comments are therefore ignored, and so are tags whose
    attributes contain a "/" or a ":". Attributes are dropped, so that
    '<div class="a">' is counted as '<div>'.

    :param html_list:   Iterable of html pages (strings).

    :return:   Dictionary mapping a tag (e.g. '<div>') to its number of occurrences.
    """
    dic = {}

    for html in html_list:
        for balise in re.findall(r'<[^!\</\-\?][^>/\n:]*>', html, re.DOTALL):
            if " " in balise:
                # Keep the tag name only.
                balise = balise.split(" ")[0] + ">"
            dic[balise] = dic.get(balise, 0) + 1
    # Return the dictionary itself: calling it raised a TypeError.
    return dic

In [None]:
def plot_tag_proportions(dic):
    """
    Draw the tag frequencies as a pie chart, save it to pie.png and show it.

    Sets the global matplotlib font size to 9.0 as a side effect.

    :param dic:   Dictionary mapping a label (tag) to its count.
    """
    mpl.rcParams['font.size'] = 9.0

    # Materialize the dictionary views: the pie values are converted to an
    # array, which needs a real sequence.
    labels = list(dic.keys())
    counts = list(dic.values())

    #plt.figure(figsize=(30,20))
    plt.pie(counts, labels=labels, autopct='%1.1f%%')
    plt.savefig('pie.png')
    plt.show()