In [2]:
import xml.sax
import subprocess
import mwparserfromhell

In [8]:
# ContentHandler subclass that watches for the opening and closing tags of the
# title, text and timestamp elements and collects the characters enclosed within
# them in a buffer; the content is then saved to a dict with the tag name as key

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects (title, text) pairs from wiki XML.

    Character data found inside ``title``, ``text`` and ``timestamp``
    elements is buffered while the element is open and stored in
    ``self._values`` under the tag name when the element closes.  Each time
    a ``page`` element closes, a ``(title, text)`` tuple is appended to
    ``self._pages``.
    """

    # Elements whose character data is captured.
    _CAPTURED_TAGS = ('title', 'text', 'timestamp')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element currently being captured
        self._values = {}          # tag name -> captured text for the current page
        self._current_tag = None   # name of the captured element that is open, if any
        self._pages = []           # (title, text) tuples, one per closed <page>

    def characters(self, content):
        """Buffer character data while a captured element is open."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start a fresh buffer when a captured element opens."""
        if name in self._CAPTURED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the buffered text on a closing tag; emit a page on ``</page>``."""
        if name == self._current_tag:
            # The parser may deliver one text node as several chunks, so the
            # chunks are concatenated as-is; joining them with a separator
            # would insert characters that are not in the document.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so that data between elements is not buffered.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            # A missing title/text yields an empty string instead of a KeyError.
            self._pages.append((self._values.get('title', ''),
                                self._values.get('text', '')))
            # Start the next page clean so it cannot inherit stale values.
            self._values = {}

In [12]:
data_path = r"/home/mmartinelli/project/corpora/wikidumps/itwiki-20210720-pages-articles-multistream.xml.bz2"
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# NOTE(review): `lst` is not used in this cell; kept in case another cell reads it.
lst = []
# Stream the compressed dump through `bzcat` and feed it to the parser line by line.
# Both the input file and the child process are managed by `with`, so the file
# handle is closed and the process is reaped even when the loop stops early.
with open(data_path, 'rb') as compressed_dump:
    with subprocess.Popen(['bzcat'], stdin=compressed_dump,
                          stdout=subprocess.PIPE) as bzcat:
        for line in bzcat.stdout:
            parser.feed(line)
            # Stop once more than 20000 pages have been collected
            if len(handler._pages) > 20000:
                # The rest of the dump is not needed; stop decompressing it.
                bzcat.terminate()
                break

In [16]:
# Keep every parsed page whose text contains at least one of the category strings below

categories = ["Categoria:Antipasti", "Categoria:Contorni", "Categoria:Dolci", "Categoria:Involtini", 
              "Categoria:Piatti unici", "Categoria:Primi piatti", "Categoria:Secondi piatti"]
# Each page is a (title, text) tuple; the categories are searched for in the text.
wikified_dishes = [
    page
    for page in handler._pages
    if any(category in page[1] for category in categories)
]
print(len(wikified_dishes))

        

3
