In [1]:
%run ../utils/dataset_building.py

In [None]:
# Set this flag to True to extract the sections from the 2017 dump, i.e. to reproduce the
# results of the 2017 paper by computing the category section counts yourself instead of
# using the provided ones. The flag is read by the 2017 extraction cell further down.
reproducion_2017 = False

In [2]:
import subprocess
import html
import re

In [12]:
# Load the ids of the articles to keep. get_all_articles is not defined in this notebook;
# presumably it comes from ../utils/dataset_building.py loaded via %run -- TODO confirm.
# The result is later used for len() and for `article_id in all_articles` membership tests,
# so it should be a set (or dict) for the extraction loop to stay fast -- verify.
all_articles=get_all_articles("../data/article_categories_filtered.tsv")

	Getting all article ids...


In [3]:
# Pre-compiled patterns shared by clean_text() and extract_section_content().
#
# extract_section_content() joins all lines of a section into ONE string before applying
# most of these patterns. A greedy ".+" would therefore run from the first opening marker to
# the LAST closing marker of the section and delete the text in between, so every pattern
# that brackets a span is non-greedy or explicitly bounded.

# top-level section header, e.g. "<h2>History</h2>"
regex_sections = re.compile(r"<h2>.+?</h2>")
# any header tag with its title, e.g. "<h3>Early life</h3>" (one header per match)
regex_subsections = re.compile(r"<h[1-6]>.+?</h[1-6]>")
# closing list / definition tags whose preceding character is not a "."
regex_li_without_point = re.compile(r"([^.])</li>")
regex_dd_without_point = re.compile(r"([^.])</dd>")
# a single html tag
regex_html_tags = re.compile(r"<.+?>")
regex_multiple_spaces = re.compile(r"\s{2,}")
# a template including ALL of its braces, e.g. "{{flagicon|Canada}}"; the braces are matched
# with "+" so that no dangling "}" is left behind
regex_wikipedia_template = re.compile(r"\{+.+?\}+")
# a piped link "[[target|label]]"; group 1 is the label (text after the last "|").
# Neither part may contain "[[" or "]]", so a match can never span two separate links.
regex_remaining_links_replace = re.compile(r"\[\[(?:(?!\[\[|\]\]).)+\|((?:(?!\[\[|\]\]|\|).)+)\]\]")
# source: https://stackoverflow.com/a/3809435
regex_http_links = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/=]*)")
regex_remaining_square_brackets = re.compile(r"\[\[|\]\]")
# an image "[[File:...]]"; links nested in the caption ("[[...]]") are consumed as a unit so
# the match ends at the image's own closing "]]" and not at a later, unrelated one
regex_remaining_images = re.compile(r"\[\[File:(?:\[\[.*?\]\]|.)+?\]\]")

def clean_text(text):
    """Strip leftover markup from a piece of extracted article text.

    The substitutions are applied strictly in this order:
      1. html tags (regex_html_tags) are removed
      2. wikipedia templates (regex_wikipedia_template) are removed
      3. piped wiki links (regex_remaining_links_replace) are replaced by their captured label
      4. http(s) links (regex_http_links) are removed
      5. remaining "[[" / "]]" markers are removed
      6. literal "&nbsp;" entities become a space
      7. runs of two or more whitespace characters collapse to a single space

    :param text: the raw text to clean
    :return: the cleaned text (not stripped at either end)
    """
    # (pattern, replacement) pairs; the order matters because later patterns rely on the
    # earlier ones having already removed their markup
    ordered_substitutions = (
        (regex_html_tags, ""),
        (regex_wikipedia_template, ""),
        (regex_remaining_links_replace, r"\1"),
        (regex_http_links, ""),
        (regex_remaining_square_brackets, ""),
    )

    result = text
    for pattern, replacement in ordered_substitutions:
        result = pattern.sub(replacement, result)

    # strangely, some literal "&nbsp;" survive html.unescape upstream, so they are turned
    # into plain spaces here
    result = result.replace("&nbsp;", " ")

    # lines that held nothing but an html tag (e.g. <ul>) leave several spaces in a row
    return regex_multiple_spaces.sub(" ", result)

def extract_section_content(text):
    """Split an article into its top-level (h2) sections and clean each of them.

    :param text: html text of one article as produced by wikiextractor
    :return: list of {'section': <cleaned title>, 'content': <cleaned body>} dicts in document
             order; sections whose cleaned title is empty are left out
    """
    headers = regex_sections.findall(text)
    # the chunk in front of the first header is the lead paragraph, which belongs to no
    # section, hence the [1:]
    bodies = regex_sections.split(text)[1:]

    extracted = []
    for header, body in zip(headers, bodies):
        title = header.replace("<h2>", "").replace("</h2>", "")
        title = clean_text(title).strip()
        if not title:
            continue

        # In wikiextractor's output the header line ("<h2>Title</h2>") is followed by a line
        # repeating the title with a trailing "."; after the split, the first two lines of
        # the body are the remainder of the header line and that repeated title, so both
        # are dropped. wikiextractor also puts sentences on separate lines; the lines are
        # joined again because sentence splitting is done later on.
        flat_body = " ".join(body.splitlines()[2:])
        # subsection headers are repeated as plain text on the following line
        # ("<h3>Subsection</h3>" then "Subsection"), so the tagged copy is removed
        flat_body = regex_subsections.sub("", flat_body)
        # make sure every list / definition item ends with a "."
        flat_body = regex_li_without_point.sub(r"\1.</li>", flat_body)
        flat_body = regex_dd_without_point.sub(r"\1.</dd>", flat_body)
        # drop leftover image markup before the generic clean-up
        flat_body = regex_remaining_images.sub("", flat_body)

        extracted.append({'section': title, 'content': clean_text(flat_body)})

    return extracted

In [5]:
# save article section contents
# Streams the dump through wikiextractor and writes, for every article whose id is in
# all_articles, one JSON line {'article_id': ..., 'section_contents': [...]}.
# wikiextractor options: https://github.com/attardi/wikiextractor/blob/master/README.md
# NOTE(review): 'HTML_SAFE' is passed literally as the value of --html-safe; it looks like the
# placeholder from the help text -- confirm it is the intended value.
wikiextractor_cmd = ['wikiextractor', "../data/enwiki-latest-pages-articles.xml.bz2", '--templates', '../data/wikiextractor_templates', '-o', '-', '-q','--html','--html-safe','HTML_SAFE','--json','--processes','1']

# The output is opened with "w" (it used to be "a+"): the whole dump is re-processed on every
# run, so appending would duplicate every record each time the cell is executed again.
# The progress bar and the subprocess are context managers so that both are closed (and the
# child process is waited for) even if an exception interrupts the loop.
with open("../data/article_section_contents.json", "w") as f, \
        tqdm(total = len(all_articles)) as pbar, \
        subprocess.Popen(wikiextractor_cmd, stdout = subprocess.PIPE) as extractor:
    # each line read from the pipe is parsed as one JSON document
    for line in extractor.stdout:
        json_line=json.loads(line)
        article_id=int(json_line['id'])
        # presumably all_articles is a set; with a list this test would be linear per
        # article -- TODO confirm
        if article_id in all_articles:
            article_text=html.unescape(json_line['text'])
            section_contents=extract_section_content(article_text)
            f.write(json.dumps({'article_id':article_id,'section_contents':section_contents})+"\n")
            pbar.update(1)

# a failed extraction must not pass silently as a (possibly truncated) output file
if extractor.returncode != 0:
    raise subprocess.CalledProcessError(extractor.returncode, wikiextractor_cmd)

  0%|          | 0/3643496 [00:00<?, ?it/s]

In [17]:
# save article sections
# Reduces every record of article_section_contents.json to its list of section titles and
# writes it as one JSON line {'article_id': ..., 'sections': [...]}.
# - The input is opened first, so a missing input file does not touch the output file.
# - The output is opened with "w" (it used to be "a+"): the input is read in full on every
#   run, so appending would duplicate all records when the cell is executed again.
# - The progress bar is a context manager so that it is always closed.
with open("../data/article_section_contents.json", "r") as f_in, \
        open("../data/article_sections.json", "w") as f_out, \
        tqdm(total = len(all_articles)) as pbar:
    for line in f_in:
        json_line=json.loads(line)
        article_id=json_line['article_id']
        sections=[x['section'] for x in json_line['section_contents']]
        f_out.write(json.dumps({'article_id':article_id,'sections':sections})+"\n")
        pbar.update(1)

  0%|          | 0/3643496 [00:00<?, ?it/s]

In [5]:
# save article sections from 2017 dump for epfl reproduction
if reproducion_2017:
    all_articles_2017=get_all_articles("../data/epfl_paper/article_categories_sept17.tsv")

    pbar = tqdm(total = len(all_articles_2017))
    with open("../data/article_sections_2017.json", "a+") as f:
        # https://github.com/attardi/wikiextractor/blob/master/README.md
        for line in subprocess.Popen(['wikiextractor', "../data/enwiki-20170820-pages-articles.xml.bz2", '--templates', '../data/wikiextractor_templates_2017', '-o', '-', '-q','--html','--html-safe','HTML_SAFE','--json','--processes','1'],  
                                      stdout = subprocess.PIPE).stdout:
            json_line=json.loads(line)
            article_id=int(json_line['id'])
            if article_id in all_articles_2017:
                article_text=json_line['text']
                article_text=html.unescape(article_text)
                section_contents=extract_section_content(article_text)
                f.write(json.dumps({'article_id':article_id,'sections':[x['section'] for x in section_contents]})+"\n")
                pbar.update(1)

  0%|          | 0/5132186 [00:00<?, ?it/s]