# utility fonctions for parsing the dmoz html dataset

## Imports

In [23]:
import os
import re
import random
import pickle
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

## Global path (datasets and intermediate data)

In [24]:
# Path to the dataset
DATASET_PATH="data/crawl-dmoz-fr-100000-html/"

# Path to directory in which intermediate data will be stored
INTERMEDIATE_FILE_PATH="data/dmoz-html-intermediate/"

if not os.path.exists(INTERMEDIATE_FILE_PATH):
    os.makedirs(INTERMEDIATE_FILE_PATH)

In [25]:
def get_url(parsed_doc):
    s = parsed_doc.find("link", {"rel" : "canonical"})
    url = None
    if s is not None:
        regex = re.search("href=\"https?://([^\"]+)\"", str(s))
        if regex:
            url = regex.group(1)
#           urls.append(url)
    return url

## Gold summary load utils

In [26]:
def load_gold_dmoz_html(bin_path="data/data.p"):
    gold_sum_dict = pickle.load(open(bin_path, 'rb'))
    # Remove entries (url keys) with empty description.
    print("[DMOZ HTML][GOLD LOAD] Total of gold summaries loaded %d" % len(gold_sum_dict.keys()))
    gold_sum_dict = { url: gold for url, gold in gold_sum_dict.items() if gold != '' and gold is not None}
    print("[DMOZ HTML][GOLD LOAD] Total of non empty gold summaries loaded %d" % len(gold_sum_dict.keys()))
    return gold_sum_dict

## Document parsing

In [40]:
def parse_part_dmoz_html(filepath):
    """
    FIXME
    """
    file = open(filepath, 'r', encoding='utf-8')
    html_part_list = re.findall(r'<html[^>]*>.*?<\/html>', file.read(), re.DOTALL)
    return html_part_list

stop_tags = ["div", "p", "body", "html", "table", "tr", "li", "ul", "td"]

interesting_tags = [""]

def clean_soup(soup_doc):
    soup_doc.replace_all()
    


def pars_doc_dmoz_html(doc_folder=DATASET_PATH):
    """
    Parse the dmoz documents. Parses is done in 2 steps:
        (1) Each part holds multiple web pages. First parse each pages from the part.
        (2) Parse the html page with BeautifulSoup parser.
        
    The results of those 2 parsing are stored in intermediate files.
    If the file already exists for the part, no parsing are executed.
    
    :param doc_folder:   Path to the dataset directory holding html parts.
    
    :return:   String. Path to the parsed documents.
    """

    files = [file for file in os.listdir(doc_folder) if bool(re.match(r'part-[0-9]+', file))]

    # Checks that directory to store soup files exists.
    intermediate_soup_path = os.path.join(INTERMEDIATE_FILE_PATH, 'soup')
    
    if not os.path.exists(intermediate_soup_path):
        os.makedirs(intermediate_soup_path)
    
    for file_name in files:
        
        # If the parsed part already exists, skip the parsing.
        soup_part_path = os.path.join(intermediate_soup_path, file_name)
        if os.path.exists(soup_part_path):
            continue
            
        # For each part, parse html pages
        print("[DMOZ HTML][DOC PARSE] Parsing %s" % filepath)
        filepath = os.path.join(doc_folder, file_name)
        html_part_list = parse_part_dmoz_html(filepath)

        # Parse html dmoz documents and dump the parsed tree to bin file
        print("[DMOZ HTML][SOUP PARSE] Parsing %s" % filepath)
        soup_part_list = [BeautifulSoup(html_doc) for html_doc in html_part_list]
        # pickle.dump(soup_part_list, open(soup_part_path, "wb"))
        
        # Clean useless tags
        soup_part_list = [clean_soup(soup_doc) for soup_doc in soup_part_list]

    # print("[DMOZ HTML][DOC LOAD] Total of html page loaded %d" % len(html_list))
    return intermediate_soup_path

In [28]:
def load_doc_dmoz_soup(soup_folder, sampling=1):
    """
    FIXME
    """
    files = [file for file in os.listdir(soup_folder) if bool(re.match(r'part-[0-9]+', file))]
    soup_list = []
    
    # Compute nb part to keep regarding the sampling param.
    # We assume parts are approximately of the same sizes.
    tot = len(files)
    perc = sampling * len(files)
    print("[DMOZ HTML][SOUP LOAD] Loading %d / %d parts" % (perc, tot))
    
    for file_name in files:   
        if perc <= 0:
            break
            
        # For each part, load the parsed html
        filepath = os.path.join(soup_folder, file_name)
        soup_list += pickle.load(open(filepath, "rb"))
        
        perc -= 1
            
    return soup_list

In [29]:
def make_overall_dmoz_html(soup_list, gold_sum_dict):
    """
    Keep document for which an url is found in the gold summary dictionay.
    """
    docs = { get_url(soup_doc) : soup_doc for soup_doc in soup_list}
    overall = {x : "" for x in set(docs.keys()).intersection(gold_sum_dict.keys())}
    print(len(gold_sum_dict.keys()), len(docs.keys()), len(overall.keys()))
    # True if 100% of the dataset is used
    # assert(len(gold_sum_dict.keys()) == len(overall.keys()))
    return docs, overall
    # assert(len(html_list) == soup_list)
        

In [30]:
def build_text_doc(soup_doc, w):
    print(soup_doc.get_text())
    print(soup_doc.stripped_strings)
    # List of string. Each string represent text contained within tag(s)
    #text_doc = []
    # tag(s) related to the text.
    #tag_doc = []
    # HTML tree traversal


def apply_weighting_doc_dmoz_html(docs, w):
    """
    Annotate the html document with html tag weight matrix.
    :param doc_list:
    :param w:          Dictionary. Maps a html tag to its weight.
    """
    #docs = {url: build_text_doc(doc) for url, doc in docs.items()}
    for soup_doc in soup_list:
        text_doc = build_text_doc(soup_doc, w)
        break

In [31]:
def tokenize_dmoz_html_corpus(docs, method='nltk', len_sen=10, over=4):
    """
    Tokenize documents.
    """
    #Sentence Tokenization of the corpus
    if method == 'nltk' :
        tokenized_docs = tokenizer_cleaner(docs)
    elif method == 'brutal' :
        tokenized_docs = brutal_tokenizer(docs, len_sen)
    elif method == 'overlap' :
        tokenized_docs = overlap_tokenizer(docs, len_sen, over)
    else :
        print("Method not accepted: %s" % method)
    return tokenized_docs

In [32]:
def generate_corpus_dmoz_html(method='nltk', len_sen=10, over=4, sampling=1):
    """
    Generate a corpus from the dmzo dataset with documents and summaries.
    
    :param method:      String referencing a tokenize method.
                        'nltk'    ->
                        'brutal'  ->
                        'overlap' ->
                        Default is nltk.
                        
    :param len_sen:     Number of words in a sentence.
                        Used by the 'brutal' and 'overlap' tokenizer.
                        
    :param over:        ??? Someting used by the 'overlap' tokenizer.
    
    :param sampling:    Threshold. Float. Must be between 0.0 and 1.0
                        For each document in the data set, a random number
                        is drawn (between 0 and 1). If smaller than the
                        threshold, the document is kept in the final corpus.
                        Else, it's discarded.
                        
    :return:    docs: Dictionary mapping string to a string.
                      Maps a docset + docid to a parsed and tokenized document.
                gold_summaries: Dictionary mapping a string to a dictionary.
                      Maps a docset + docid to multiple parsed and tokenized summaries.
                overall: Dictionary
    """

In [33]:
# Load gold summaries
gold_sum_dict = load_gold_dmoz_html()

[DMOZ HTML][GOLD LOAD] Total of gold summaries loaded 26107
[DMOZ HTML][GOLD LOAD] Total of non empty gold summaries loaded 18272


In [38]:
# Load html dmoz documents
soup_dir = parse_doc_dmoz_html()

[DMOZ HTML][DOC LOAD] Parsing data/crawl-dmoz-fr-100000-html/part-00000


RecursionError: maximum recursion depth exceeded

In [None]:
soup_list = load_doc_dmoz_soup(sampling=0.1)

In [None]:
docs, overall = make_overall_dmoz_html(soup_list, gold_sum_dict)

In [None]:
apply_weighting_doc_dmoz_html(docs)

## Document analysis

In [None]:
def get_tag_frequency(html_list):
    dic = {}

    for html in html_list:
        test = re.findall(r'<[^!\</\-\?][^>/\n:]*>', html, re.DOTALL)
        for balise in test:
            if " " in balise:
                balise = balise.split(" ")[0] + ">"

            if not balise in dic:
                dic[balise] = 1
            else:
                dic[balise] += 1
    return dic()

In [None]:
def plot_tag_proportions(dic):
    mpl.rcParams['font.size'] = 9.0

    #plt.figure(figsize=(30,20))
    plt.pie(dic.values(), labels=dic.keys(), autopct='%1.1f%%')
    plt.savefig('pie.png')
    plt.show()