In [None]:
import os
import json
from tqdm.auto import tqdm

In [None]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects ``(title, text)`` pairs from Wiki XML.

    Character data inside ``<title>``, ``<text>`` and ``<timestamp>`` elements
    is captured. Every closing ``</page>`` tag appends one ``(title, text)``
    tuple to ``self._pages``; a page that lacks one of the two elements
    contributes an empty string for it.
    """

    # Elements whose character data is captured.
    _TRACKED_TAGS = ('title', 'text', 'timestamp')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element currently being captured
        self._values = {}          # tag name -> captured text for the current page
        self._current_tag = None   # tracked tag that is currently open, if any
        self._pages = []           # collected (title, text) tuples

    def characters(self, content):
        """Collect a chunk of character data while a tracked element is open."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag: start a fresh page record or a fresh capture buffer."""
        if name == 'page':
            # Forget the previous page so a page without <title>/<text>
            # cannot inherit stale values from the one before it.
            self._values = {}
        elif name in self._TRACKED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag: store the captured text, or emit the finished page."""
        if name == self._current_tag:
            # The parser may deliver one run of text in several chunks (for
            # example around entities such as &amp;), so the chunks must be
            # concatenated without a separator to reproduce the original text.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing: text between elements belongs to no tracked tag.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._pages.append(
                (self._values.get('title', ''), self._values.get('text', ''))
            )

In [None]:
# Create the SAX parser, then attach a fresh page-collecting handler to it.
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)

# Nothing has been fed to the parser yet, so this displays an empty list.
handler._pages

[]

In [None]:
# Feed the dump to the parser one line at a time; the file is read lazily and
# the `with` block closes it even if parsing raises.
# The encoding is given explicitly so decoding does not depend on the locale.
with open("/path/to/wiki/pages-articles.xml", encoding="utf-8") as dump_file:
    for line in tqdm(dump_file):
        parser.feed(line)

In [6]:
# Number of (title, text) tuples collected so far -- one per closing </page> tag.
len(handler._pages)

2115307

In [7]:
# Index page text by title; if a title occurs more than once, the last page wins.
title2page = {page_title: page_text for page_title, page_text in tqdm(handler._pages)}


  0%|          | 0/2115307 [00:00<?, ?it/s]

In [None]:
# Map the title of every redirect page to the title named in its first [[...]] link.
redirects = dict()
for title, page in tqdm(title2page.items(), total=len(title2page)):
    # Only pages containing the lowercase "#redirect" marker whose title has
    # no ":" are considered.
    # NOTE(review): the marker match is case-sensitive and the ":" test
    # presumably filters namespaced titles -- confirm both are intended.
    if "#redirect" not in page or ":" in title:
        continue
    link_parts = page.split("[[", 2)
    if len(link_parts) < 2:
        # Marker present but no "[[" link to read a target from: skip the
        # page instead of failing with IndexError.
        continue
    source = title.strip()
    redirects[source] = link_parts[1].split("]]")[0].strip()
    # Read the entry back under the stripped key it was stored under; using
    # the raw title raised KeyError for titles with surrounding whitespace.
    print(title, "->", redirects[source])

In [9]:
# Build the short_candidate -> answer mapping from the fully annotated
# training questions whose answer contains "wiki/", then merge the redirects.
# The file is opened in a `with` block so the handle is closed, and the
# encoding is explicit so decoding does not depend on the locale.
with open("../za-data/zac2022_train_merged_final.json", encoding="utf-8") as train_file:
    train = json.load(train_file)
entity_dict = dict()
for x in train['data']:
    if x["category"] == "FULL_ANNOTATION" and "wiki/" in x["answer"]:
        entity_dict[x["short_candidate"]] = x["answer"]
# Rewrite every redirect target as "wiki/<target>" with spaces turned into "_".
# NOTE: this updates `redirects` in place, so re-running this cell without
# first re-running the cell that builds `redirects` prefixes "wiki/" again.
for key, val in redirects.items():
    redirects[key] = f"wiki/{val}".replace(" ","_")
# Redirect entries take precedence over training entries with the same key.
entity_dict.update(redirects)

In [10]:
# Write the merged mapping to disk; the `with` block guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open("../processed/entities.json", "wt", encoding="utf-8") as out_file:
    json.dump(entity_dict, out_file)