# Utility functions to prepare the dmoz html dataset

## Imports

In [None]:
import os
import re
import pickle
import spacy
import matplotlib as mpl
import matplotlib.pyplot as plt

from collections import defaultdict
from bs4 import BeautifulSoup

%run Process_Summary.ipynb

## Global path (datasets and intermediate data)

In [None]:
# Path to the dataset
DATASET_PATH="../data/crawl-dmoz-fr-100000-html/"

# Path to directory in which intermediate data will be stored
INTERMEDIATE_FILE_PATH="../data/dmoz-html-intermediate/"

# Create the intermediate directory (and any missing parents) if needed.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test when several processes start at the same time.
os.makedirs(INTERMEDIATE_FILE_PATH, exist_ok=True)

In [None]:
def get_url(parsed_doc):
    """
    Return the canonical url of a parsed page, without its scheme.

    :param parsed_doc:   Parsed html page exposing a ``find`` method
                         (a BeautifulSoup tree in this file).

    :return:   The part of the canonical link's href following
               "http://" or "https://", or None when the page has no
               <link rel="canonical"> tag or its href does not match.
    """
    canonical = parsed_doc.find("link", {"rel" : "canonical"})
    if canonical is None:
        return None

    # The href is read from the serialized tag rather than from its attributes.
    match = re.search("href=\"https?://([^\"]+)\"", str(canonical))
    if not match:
        return None
    return match.group(1)

## Gold summary load utils

In [None]:
def load_gold_dmoz_html(bin_path="../data/data.p"):
    """
    Load the gold summaries from a pickle file and drop the empty ones.

    :param bin_path:   Path to the pickled dictionary mapping urls to
                       gold summaries.

    :return:   Dictionary mapping each url to its gold summary, without
               the entries whose summary is '' or None.
    """
    # NOTE(security): pickle can execute arbitrary code while loading;
    # only use this on trusted files.
    # The context manager closes the file handle once loading is done.
    with open(bin_path, 'rb') as gold_file:
        gold_sum_dict = pickle.load(gold_file)

    print("[DMOZ HTML][GOLD LOAD] Total of gold summaries loaded %d" % len(gold_sum_dict))
    # Remove entries (url keys) with empty description.
    gold_sum_dict = {
        url: gold
        for url, gold in gold_sum_dict.items()
        if gold != '' and gold is not None
    }
    print("[DMOZ HTML][GOLD LOAD] Total of non empty gold summaries loaded %d" % len(gold_sum_dict))
    return gold_sum_dict

## Document parsing

### Parse html pages

In [None]:
def parse_doc_dmoz_html(doc_folder=DATASET_PATH, sampling=1, part_id=None):
    """
    Parse the html dmoz documents. Dataset is split in parts, each holding multiple web pages.
    Reads each part and parses the pages assuming they are delimited by <html> ... </html> tags.

    :param doc_folder:   Path to the dataset directory holding html parts
                         (files whose name starts with "part-<digits>").

    :param sampling:     Fraction of the parts to load. The number of parts
                         read is sampling * (number of parts), rounded up.
                         Parts are taken in the order returned by os.listdir.

    :param part_id:      Currently unused.

    :return:   Array of html web pages (strings).
    """

    files = [file for file in os.listdir(doc_folder) if re.match(r'part-[0-9]+', file)]
    html_list = []

    # Compute nb part to keep regarding the sampling param.
    # We assume parts are approximately of the same sizes.
    tot = len(files)
    perc = sampling * tot
    print("[DMOZ HTML][DOC PARSE] Loading %d / %d parts" % (perc, tot))

    for file_name in files:
        # perc counts down the remaining parts to load.
        if perc <= 0:
            break

        # For each part, parse html pages
        filepath = os.path.join(doc_folder, file_name)
        print("[DMOZ HTML][DOC PARSE] Parsing %s" % filepath)
        # Context manager so the part file is closed as soon as it is read.
        with open(filepath, 'r', encoding='utf-8') as part_file:
            html_list += re.findall(r'<html[^>]*>.*?<\/html>', part_file.read(), re.DOTALL)

        perc -= 1

    print("[DMOZ HTML][DOC PARSE] Total of html page loaded %d" % len(html_list))
    return html_list

### Parse raw text from html and segments of interest

In [None]:
# Tags removed from each page by clean_soup before text extraction.
# Other candidates, currently not removed:
# ["div", "p", "body", "html", "table", "tr", "li", "ul", "td"]
stop_tags = [
    "script",
    "a",
]

# Tags whose text is collected as segments of interest.
# Original note: maybe remove strong
interesting_tags = [
    "h1",
    "title",
    "bold",
    "b",
    "i",
    "em",
    "mark",
    "small",
]

# Left empty here; not read by any code in this part of the file.
it_weights = list()

In [None]:
# Define the dictionary of weights
def get_html_bias():
    """
    Build the dictionary of weights for the tags of interest.

    :return:   Dictionary mapping every tag of ``interesting_tags``
               to the same weight, 0.008.
    """
    return dict.fromkeys(interesting_tags, 0.008)

In [None]:
def clean_soup(soup_doc):
    """
    Strip from the parsed page, in place, every tag listed in ``stop_tags``.

    :param soup_doc:   Parsed html page (modified in place).
    """
    for tag_name in stop_tags:
        for node in soup_doc.find_all(tag_name):
            node.extract()

def get_interesting_segments(soup_doc):
    """
    Build a dictionary mapping tags of interest to text segments.

    :param soup_doc:   Parsed html page.

    :return:   Dictionary with one key per tag of ``interesting_tags``,
               mapped to the list of stripped strings found inside the
               occurrences of that tag (empty list when there is none).
    """
    doc_segments = {}
    for tag_name in interesting_tags:
        texts = []
        for node in soup_doc.find_all(tag_name):
            texts.extend(node.stripped_strings)
        doc_segments[tag_name] = texts
    return doc_segments

def extract_text_and_segments(soup_list):
    """
    Extract the text content of each page and its segments of interest.

    Pages without a canonical url are skipped. Each remaining page is
    cleaned in place (stop tags removed) before extraction; all other
    html information is discarded after this step.

    :param soup_list:   Iterable of parsed html pages.

    :return:   docs: maps a page url to the list of its stripped strings.
               segments: maps a page url to its tag -> text segments dictionary.
    """
    segments = defaultdict(dict)
    docs = defaultdict(dict)
    for page in soup_list:
        page_url = get_url(page)
        if page_url is not None:
            clean_soup(page)
            segments[page_url] = get_interesting_segments(page)
            docs[page_url] = list(page.stripped_strings)
    return docs, segments

### Preprocess the text and segments

* lemmer
* tokenizer

In [None]:
class Stemmer():
    """
    Placeholder for a stemming preprocessor: nothing is implemented yet.

    Calling an instance takes no argument and returns None.
    """
    def __init__(self):
        return None

    def __call__(self):
        return None
    
class RemoveStopWords():
    """
    Preprocessor removing stop words from the sentences of each document.

    :param method:     'spacy' to rely on spaCy tokens; any other value
                       falls back to the nltk stop word list.
    :param language:   Language name. For spaCy, "french" selects the
                       French stop words and anything else the English ones;
                       for nltk it is passed to ``stopwords.words``.
    """
    def __init__(self, method, language):
        self.method = method
        if method == 'spacy':
            # The returned model is not used here; the call is kept as in the
            # original code (presumably it loads the model and the matching
            # spacy.lang submodule - TODO confirm against get_spacy_model).
            get_spacy_model(language)
            spacy_model = spacy.lang.fr if language == "french" else spacy.lang.en
            self.stop_words = spacy_model.stop_words.STOP_WORDS
        else:
            self.stop_words = nltk.corpus.stopwords.words(language)
        assert(self.stop_words is not None)

    def __call__(self, docs):
        """
        Remove the stop words from every sentence of every document.

        :param docs:   Dictionary mapping a key to an iterable of sentences.

        :return:   Dictionary mapping each key to the list of filtered
                   sentences, each one being a string.
        """
        if self.method == "spacy":
            return self.spacy_stop_w(docs)
        else:
            return self.nltk_stop_w(docs)

    def spacy_stop_w(self, docs):
        """
        Drop the tokens flagged ``is_stop`` and join the remaining ones
        with a space. Sentences must be iterables of spaCy-like tokens.
        """
        docs_res = defaultdict(dict)
        for key, sents in docs.items():
            # Token.string was removed in spaCy v3; text_with_ws is the
            # value it used to return, so the output is unchanged.
            docs_res[key] = [
                " ".join(w.text_with_ws for w in sent if not w.is_stop)
                for sent in sents
            ]
        return docs_res

    def nltk_stop_w(self, docs):
        """
        Split each sentence (a string) on whitespace, drop the words found
        in ``self.stop_words`` and join the remaining ones with a space.
        """
        # Set built once for constant-time membership tests.
        stop_words = set(self.stop_words)
        docs_res = defaultdict(dict)
        for key, sents in docs.items():
            docs_res[key] = [
                " ".join(w for w in sent.split() if w not in stop_words)
                for sent in sents
            ]
        return docs_res
            
    
class Lemmer():
    """
    Preprocessor lemmatizing documents.

    :param method:     Lemmatization backend. Only 'spacy' is supported.
    :param language:   Language name passed to the backend.
    """
    def __init__(self, method='spacy', language="french"):
        self.method = method
        self.language = language

    def __call__(self, docs):
        """
        Lemmatize documents.

        :param docs:   Documents, passed as is to ``spacy_lemmer``.

        :return:   Whatever ``spacy_lemmer(docs, language)`` returns.

        :raises ValueError:   If the method is not supported. The original
                              code printed the message and then failed on an
                              unbound local variable.
        """
        if self.method != "spacy":
            raise ValueError("Lemmer method not accepted: %s" % self.method)
        return spacy_lemmer(docs, self.language)
            


class Tokenizer():
    """
    Preprocessor splitting documents into sentences.

    :param method:     Tokenize method: 'nltk', 'brutal', 'overlap' or 'spacy'.
    :param language:   Language name, used by the 'spacy' method.
    :param len_sen:    Passed to the 'brutal' and 'overlap' tokenizers.
    :param over:       Passed to the 'overlap' tokenizer.
    """
    def __init__(self, method='spacy', language="french", len_sen=10, over=4):
        self.method = method
        self.language = language
        self.len_sen = len_sen
        self.over = over

    def __call__(self, docs):
        """
        Tokenize documents.

        :param docs:   Documents, passed as is to the selected tokenizer.

        :return:   Whatever the selected tokenizer returns.

        :raises ValueError:   If the method is not supported. The original
                              code printed the message and then failed on an
                              unbound local variable.
        """
        # Sentence tokenization of the corpus. An if/elif chain is kept so
        # that only the selected tokenizer function has to be defined.
        if self.method == 'nltk':
            return tokenizer_cleaner(docs)
        if self.method == 'brutal':
            return brutal_tokenizer(docs, self.len_sen)
        if self.method == 'overlap':
            return overlap_tokenizer(docs, self.len_sen, self.over)
        if self.method == 'spacy':
            return spacy_tokenizer(docs, self.language)
        raise ValueError("Tokenizer method not accepted: %s" % self.method)

def preprocess_text_and_segments(docs, segments, preprocessors):
    """
    Apply each preprocessor, in order, to the documents and to the segments.

    :param docs:            Documents; replaced by the preprocessor output
                            at each step.
    :param segments:        Dictionary mapping a url to its segments; each
                            value is replaced in place by the preprocessor
                            output.
    :param preprocessors:   Iterable of callables taking one argument.

    :return:   The preprocessed docs and the (same) segments dictionary.
    """
    for step in preprocessors:
        docs = step(docs)
        for page_url, page_segments in segments.items():
            # Re-assigning an existing key is safe while iterating.
            segments[page_url] = step(page_segments)
    return docs, segments

### Build overall dictionary

Keeps track of each document and its summary.

In [None]:
def make_overall_dmoz_html(docs, gold_sum_dict):
    """
    Keep the documents whose url is found in the gold summary dictionary.

    :param docs:            Dictionary keyed by document url.
    :param gold_sum_dict:   Dictionary keyed by gold summary url.

    :return:   Dictionary mapping every url present in both inputs to "".
    """
    shared_urls = set(docs.keys()).intersection(gold_sum_dict.keys())
    overall = dict.fromkeys(shared_urls, "")
    # Number of gold summaries, of documents, and of urls kept.
    print(len(gold_sum_dict.keys()), len(docs.keys()), len(overall.keys()))
    # True if 100% of the dataset is used
    # assert(len(gold_sum_dict.keys()) == len(overall.keys()))
    return overall
        

### Pull it all together for the Resume_Interface

In [None]:
def generate_corpus_dmoz_html(method='spacy', len_sen=10, over=4, sampling=1):
    """
    Generate a corpus from the dmoz dataset with documents and summaries.

    :param method:      String referencing a tokenize method:
                        'nltk', 'brutal', 'overlap' or 'spacy'.
                        Default is spacy. Also selects the stop word
                        removal backend (spaCy for 'spacy', nltk otherwise).

    :param len_sen:     Number of words in a sentence.
                        Used by the 'brutal' and 'overlap' tokenizer.

    :param over:        Passed to the 'overlap' tokenizer
                        (presumably the overlap size - TODO confirm).

    :param sampling:    Float between 0.0 and 1.0. Fraction of the dataset
                        parts to load, passed to parse_doc_dmoz_html.
                        It was previously ignored in favour of a hard-coded
                        value of 0.01; pass sampling=0.01 to get the former
                        behavior.

    :return:    docs: Dictionary mapping a page url to its tokenized text.
                gold_sum_dict: Dictionary mapping a url to its gold summary.
                overall: Dictionary whose keys are the urls having both a
                      document and a gold summary.
                segments: Dictionary mapping a page url to its
                      tag -> text segments dictionary.
    """

    # Load gold summaries
    gold_sum_dict = load_gold_dmoz_html()

    # Load html dmoz documents, honoring the caller's sampling fraction.
    html_list = parse_doc_dmoz_html(sampling=sampling)

    # Parse html dmoz documents into soup trees.
    # NOTE(review): no parser is given to BeautifulSoup, so the one it picks
    # depends on what is installed; left unchanged to keep the parsed output.
    print("[DMOZ HTML][SOUP PARSE] Parsing %d docs with soup." % len(html_list))
    soup_list = [BeautifulSoup(html_doc) for html_doc in html_list]
    print("[DMOZ HTML][SOUP PARSE] Done")

    print("[DMOZ HTML][SEGMENTS] Starting")
    docs, segments = extract_text_and_segments(soup_list)
    print("[DMOZ HTML][SEGMENTS] Done")

    print("[DMOZ HTML][Normalizer] Starting")
    preprocessors = [
        Tokenizer(method=method, len_sen=len_sen, over=over, language="french"),
        RemoveStopWords(method, language="french"),
        # Lemmer(method="spacy", language="french"),
    ]

    docs, segments = preprocess_text_and_segments(docs, segments, preprocessors)
    print("[DMOZ HTML][Normalizer] Done")

    print("[DMOZ HTML][Overall] Starting")
    overall = make_overall_dmoz_html(docs, gold_sum_dict)
    print("[DMOZ HTML][Overall] Done")

    return docs, gold_sum_dict, overall, segments

## Document analysis

In [None]:
def get_tag_frequency(html_list):
    """
    Count the occurrences of each opening html tag in a list of pages.

    Tags are found with a regular expression that skips closing tags,
    comments, doctype and processing instructions, as well as any tag
    containing '/', ':' or a newline. Attributes are dropped, so that
    '<body class="x">' is counted as '<body>'.

    :param html_list:   Iterable of html pages (strings).

    :return:   Dictionary mapping a tag (e.g. '<p>') to its number of
               occurrences.
    """
    dic = {}

    for html in html_list:
        for balise in re.findall(r'<[^!\</\-\?][^>/\n:]*>', html, re.DOTALL):
            # Keep only the tag name when the tag carries attributes.
            if " " in balise:
                balise = balise.split(" ")[0] + ">"
            dic[balise] = dic.get(balise, 0) + 1

    # Return the dictionary itself: the original `dic()` tried to call it
    # and raised a TypeError.
    return dic

In [None]:
def plot_tag_proportions(dic):
    """
    Draw a pie chart of the tag proportions, save it to 'pie.png' in the
    current directory, and show it.

    :param dic:   Dictionary mapping a tag to its number of occurrences.
    """
    mpl.rcParams['font.size'] = 9.0

    #plt.figure(figsize=(30,20))
    # Dictionary views are not array-like: turn them into lists before
    # handing them to matplotlib.
    plt.pie(list(dic.values()), labels=list(dic.keys()), autopct='%1.1f%%')
    plt.savefig('pie.png')
    plt.show()