In [1]:
import bz2
from collections import defaultdict, Counter
import os
import xml.etree.ElementTree as etree

import mwparserfromhell

from kg.entity_linking import utils

In [2]:
wiki_dir = '/Users/tmorrill002/Documents/datasets/wikipedia/'

In [3]:
# wiki_dir = '/Users/tmorrill002/Documents/datasets/wikipedia/20210401/'
file = 'enwiki-20210401-pages-articles-multistream1.xml-p1p41242.bz2'

file_path = os.path.join(wiki_dir, file)

**Overview of the parsing process:**
1) etree.iterparse incrementally builds up the XML tree one line at a time
2) the root node stores children nodes underneath it, which correspond to wikipedia pages
3) the 'start' event corresponds to an opening tag (e.g. \<page\>) and the 'end' event corresponds to a closing tag (e.g. \</page\>)
4) the elem.text method gathers all text between the start and end of the element, which is built up incrementally as more lines are parsed
5) the root is cleared once a page is parsed so that there is only one child node under the root at a time (keeps memory footprint low)

In [4]:
def get_context(file_path):
    # open the compressed file
    fp = bz2.BZ2File(file_path, 'r')

    # handles first element in the iteration, which is later cleared to avoid persisting everything in memory
    # get an iterable
    context = etree.iterparse(fp, events=("start", "end"))

    # turn it into an iterator
    context = iter(context)

    # get the root element
    event, root = next(context)
    return context, root

In [5]:
def get_page(context, root):
    extracted_content = {}
    for event, elem in context:
        # clean tags: '{http://www.mediawiki.org/xml/export-0.10/}mediawiki' -> 'mediawiki'
        tag = utils.strip_tag_name(elem.tag)
        
        # capture information when the close tag is read (e.g. <\page>)
        if event == 'end':
            # get text content that has been accumulated
            if tag == 'title':
                extracted_content['title'] = elem.text
            elif tag == 'text':
                extracted_content['text'] = elem.text
            elif tag == 'id':
                extracted_content['id'] = int(elem.text)
            elif tag == 'redirect':
                extracted_content['redirect'] = elem.attrib['title']
            elif tag == 'ns':
                extracted_content['ns'] = int(elem.text)
            # read one complete page
            elif tag == 'page':
                # keep memory footprint low, clear all the children nodes
                root.clear()
                
                return extracted_content
    
    # if nothing left to iterate on, return None
    return None

In [151]:
context, root = get_context(file_path)

In [152]:
page_count = 0
page = True
while page:
    page = get_page(context, root)
    page_count += 1

In [153]:
page_count

27435

In [143]:
# this 
for child in root:
    print({x.tag for x in root.findall(child.tag+"/*")})

**Link extraction specification:**
1. Dictionary mapping surface forms (i.e. anchor text) to entities
    - Key should be anchor text, value should be dictionary, where keys are entities and values are mention counts
1. Article titles should always make it into the dictionary
    - this is important when no one links to a page
    - may want to do some additional cleanup on these (e.g. remove parentheses)
1. Backburner: inverse this dictionary and map entities to surface forms, if needed

Refer to [this post](https://ai.googleblog.com/2012/05/from-words-to-concepts-and-back.html) for more details

Issues to solve for:
1. Extracted text doesn't start and end with 2 open/close brackets
1. More than 1 pipe

In [145]:
def get_entity_anchor_pairs(page):
    """Get pairs of entities and anchors from a page."""
    # create the wiki article
    # grateful for this library!!
    wiki_page = mwparserfromhell.parse(page['text'])

    # find the wikilinks
    wikilinks = [x for x in wiki.filter_wikilinks()]

    entity_anchor_pairs = []
    issues = []
    for link in wikilinks:
        # links look like this: '[[Political movement|movement]]'
        # the first part is the entity (link to another wiki page)
        # the second is the anchor text (i.e. surface form)
        
        if link[:2] != '[[' or link[-2:] != ']]':
            # issue if doens't start with '[[' and end with ']]'
            issues.append(link)
            continue

        trimmed_link = link[2:-2]
        link_parts = trimmed_link.split('|')
        if len(link_parts) > 2:
            # issue if more than 1 pipe
            issues.append(link)
            continue
        
        if len(link_parts) == 1:
            # possible that the link is related to a category e.g. 'Category:Anti-capitalism'
            if link_parts[0].startswith('Category:'):
                # TODO: determine if we want to handle categories differently
                # strip category tag from the surface form
                clean_category = link_parts[0].split('Category:')[-1]
                # e.g. entity, anchor = 'Category:Anti-capitalism', 'Anti-capitalism'
                entity, anchor = link_parts[0], clean_category
            else:
                # i.e. anchor text is the same as the entity name
                entity, anchor = link_parts[0], link_parts[0]
        elif len(link_parts) == 2:
            # expected format
            entity, anchor = link_parts[0], link_parts[1]

        entity_anchor_pairs.append((entity, anchor))
    
    return entity_anchor_pairs

In [146]:
def extract_links(page):
    # anchor_text -> {entity: count, another_entity: count}
    anchor_to_entities = defaultdict(Counter)
    
    # add redirects to dictionary (probably needs some string cleanup, mostly camel case)
    if 'redirect' in page:
        # e.g. page['title'] = 'AfghanistanHistory'
        # page['redirect'] = 'History of Afghanistan'
        anchor_text = page['title']
        entity = page['redirect']
        anchor_to_entities[anchor_text][entity] += 1
        
        # no other links on page, simply return
        return anchor_to_entities
    
    # add the title to the dictionary
    # useful if no other page links to this page
    title = page['title']
    anchor_to_entities[title][title] += 1
    
    # get links and update anchor, entity occurrence counts
    # result will look like:
    # 'Carl Levy': Counter({'Carl Levy (political scientist)': 1})
    # 'capitalism': Counter({'Anarchism and capitalism': 2, 'capitalism': 1})
    entity_anchor_pairs = get_entity_anchor_pairs(page)
    for entity, anchor in entity_anchor_pairs:
        anchor_to_entities[anchor][entity] += 1
    
    return anchor_to_entities

In [147]:
anchor_to_entities_dict = extract_links(page)

In [149]:
anchor_to_entities['capitalism'].most_common()

[('Anarchism and capitalism', 2), ('capitalism', 1)]