In [1]:
%run ../utils/dataset_building.py

In [4]:
# Length of whatever get_all_articles returns for the categories TSV; the cells below use it
# only as the `total` of their tqdm progress bars.
# NOTE(review): get_all_articles is not defined in the visible cells - presumably it is brought
# into the namespace by the %run of dataset_building.py above; confirm.
nb_articles=len(get_all_articles("../data/article_categories_no_unknown_types.tsv"))

	Getting all article ids...


In [39]:
import html
# Some characters are still in their escaped HTML form (e.g. "&amp;"); we build a mapping
# table "escaped entity -> character" that can then be applied with str.replace.
def get_escaped_html_mapping_table(path="../data/article_section_sentences.json"):
    """Collect every named HTML entity found in the sentences file and map it to its character.

    The file is read line by line; each line is parsed as a JSON object whose
    'section_sentences' entry is a list of dicts carrying a 'sentence' string.

    Parameters
    ----------
    path : str
        JSON-lines file to scan. Defaults to the path the notebook has always used.

    Returns
    -------
    dict
        Maps each entity of the form ``&name;`` seen in the file to ``html.unescape(entity)``.
        Entities that html.unescape leaves unchanged (i.e. unknown names) are omitted.
    """
    # Only named entities are matched: '#' is not a word character, so numeric
    # references such as "&#8211;" are not collected.
    escaped_html = re.compile(r'&\w+;')
    escaped_html_chars = set()
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f_in:
        for line in tqdm(f_in, total=nb_articles):
            for section_sentence in json.loads(line)['section_sentences']:
                escaped_html_chars.update(escaped_html.findall(section_sentence['sentence']))

    escaped_html_mapping_table = {}
    for char in escaped_html_chars:
        unescaped_char = html.unescape(char)
        # html.unescape returns unknown entities unchanged; those are not worth a replace pass
        if unescaped_char != char:
            escaped_html_mapping_table[char] = unescaped_char

    return escaped_html_mapping_table

In [40]:
# Build the "escaped entity -> character" table once. The call iterates over every line of the
# sentences file, and remove_accents later reads the result as a module-level global.
escaped_html_mapping_table=get_escaped_html_mapping_table()

  0%|          | 0/2048191 [00:00<?, ?it/s]

In [51]:
import unicodedata
import string
import re

# https://stackoverflow.com/a/41516221
# Translate "fancy" quotation marks and dashes to ASCII quotation marks and dashes before the
# non-ASCII filter runs, in order to avoid that e.g. "1823–1967" becomes "1823 1967".
transl_table = {ord(fancy): ord(plain) for fancy, plain in zip(u"‘’´“”«»–—−–-", u"'''\"\"\"\"-----")}

regex_multiple_spaces = re.compile(r"\s{2,}")
# e.g. "( , )"
regex_comma_between_parentheses = re.compile(r"\(\s*,\s*\)")
# e.g. "( , Changwu Weiyuanhui )" => "( Changwu Weiyuanhui )"
# (the leading comma is orphaned because the non-ASCII characters before it were removed).
# The group is non-greedy so that every "( , ...)" of a sentence is repaired, not only the first.
regex_text_after_comma_between_parentheses = re.compile(r"\(\s*,(.+?)\)")
# Some sections had only one "=" around them, but sections should have at least 2 "=",
# therefore those are mistakes because all correct sections were transformed into <hX> html tags by wikiextractor.
regex_wrong_sections = re.compile(r"\=[\w\s]+\=")

printable = set(string.printable)


# https://stackoverflow.com/a/517974
# The idea is to replace diacritics from European languages with their ASCII form (e.g. "à" => "a"),
# without using a module like unidecode which would translate e.g. Arabic or Japanese kanji to
# non-existent words in English which are not even phonetically the same as in the original language.
# Therefore all letters which cannot be easily converted into ASCII are removed.
def remove_accents(input_str, html_mapping_table=None):
    """Return an ASCII-only, whitespace-normalised version of ``input_str``.

    Steps, in order:
      1. replace escaped HTML entities using ``html_mapping_table``;
      2. map typographic quotes/dashes to ASCII via ``transl_table``;
      3. NFKD-normalise and drop combining marks, so "é" becomes "e";
      4. replace every remaining character outside ``string.printable`` with a space;
      5. remove the artefacts left behind by the previous steps (single-"=" section
         titles, orphaned commas and empty parentheses, doubled spaces, space before
         a comma or full stop).

    Parameters
    ----------
    input_str : object
        Text to clean; it is converted with ``str()`` first.
    html_mapping_table : dict, optional
        Maps escaped entities (e.g. "&eacute;") to their character. When omitted, the
        notebook-level ``escaped_html_mapping_table`` is used, as before.

    Returns
    -------
    str
        The cleaned sentence.
    """
    if html_mapping_table is None:
        # fall back to the table built in an earlier cell of the notebook
        html_mapping_table = escaped_html_mapping_table

    output_str = str(input_str)
    for escaped_char, unescaped_char in html_mapping_table.items():
        output_str = output_str.replace(escaped_char, unescaped_char)
    # Translate the *unescaped* text: translating input_str here would throw away the
    # entity replacement done just above.
    output_str = output_str.translate(transl_table)

    nfkd_form = unicodedata.normalize('NFKD', output_str)
    # Combining marks are dropped (not turned into spaces), otherwise "Málaga" would be
    # split into "Ma laga"; any other non-printable-ASCII character becomes a space.
    output_str = "".join(
        c if c in printable else " "
        for c in nfkd_form
        if not unicodedata.combining(c)
    )

    # some dashes were not correctly unescaped, therefore we need to replace the escaped version with a normal dash
    output_str = output_str.replace("&mdash;", "-")
    output_str = regex_wrong_sections.sub(" ", output_str)
    output_str = regex_comma_between_parentheses.sub(" ", output_str)
    output_str = regex_text_after_comma_between_parentheses.sub(r"(\1)", output_str)
    output_str = regex_multiple_spaces.sub(" ", output_str)
    output_str = output_str.replace("( )", "").replace("()", "").replace("( ", "(").replace(" )", ")")
    output_str = output_str.replace(" , ", ", ").replace('" "', '').replace(" .", ".")
    return output_str

In [52]:
def check_if_sentence_contain_table(sentence):
    """Tell whether ``sentence`` carries residue of a badly extracted wiki table.

    wikiextractor did not manage to remove all tables from the articles, so some
    sentences contain leftover table markup. Such sentences are detected here so
    that the caller can discard them, e.g.

        style="text-align:center;" colspan=2| Career !
        9 || || 14 ||.377 ||.250 ||.677 || 2.9 || 0.6|| 0.2 ||0.1 || 4.0

    Returns True as soon as one of the known markup fragments occurs in the
    sentence, False otherwise.
    """
    table_markers = ("colspan", "style=", "rowspan", "style =", "||", "<!-", 'scope="')
    return any(marker in sentence for marker in table_markers)

In [53]:

# Clean every sentence of every article and write the surviving ones as one JSON object per line.
# The input is opened first so that a missing input file cannot truncate an existing output.
# The output is opened in "w" mode: with "a+" every re-run of this cell appended a second copy
# of the whole dataset to the file.
with open("../data/article_section_sentences.json", "r", encoding="utf-8") as f_in, \
        open("../data/article_section_sentences_ascii.json", "w", encoding="utf-8") as f_out:
    for line in tqdm(f_in, total=nb_articles):
        json_line = json.loads(line)
        section_sentences_ascii = []

        for section_sentence in json_line['section_sentences']:
            sentence_ascii = remove_accents(section_sentence['sentence'])
            # skip table residue and sentences that do not contain a single letter after cleaning
            if check_if_sentence_contain_table(sentence_ascii) or not any(c.isalpha() for c in sentence_ascii):
                continue
            section_sentences_ascii.append({'section': section_sentence['section'], 'sentence': sentence_ascii})

        # articles whose sentences were all discarded are not written at all
        if section_sentences_ascii:
            f_out.write(json.dumps({'article_id': json_line['article_id'], 'section_sentences': section_sentences_ascii}) + "\n")

  0%|          | 0/2048191 [00:00<?, ?it/s]