In [1]:
import xml.sax
import subprocess
import mwparserfromhell
import re
import os

In [2]:
# SAX content handler that watches for the opening and closing title, id, text and timestamp tags
# and adds the characters enclosed within them to a buffer;
# the buffered content is saved to a dict with the tag name as key

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects one (title, id, text) tuple per <page>.

    Character data of the elements listed in ``_CAPTURED_TAGS`` is buffered
    while the element is open and stored in ``self._values`` under the tag
    name when it closes. On every closing ``</page>`` the collected values are
    appended to ``self._pages`` as a ``(title, id, text)`` tuple; a value that
    was not seen inside that page is recorded as an empty string.
    """

    # Elements whose character data is captured.
    # 'timestamp' is captured but never copied into ``_pages`` (do we need it?).
    _CAPTURED_TAGS = ('title', 'id', 'text', 'timestamp')

    # Order of the fields in each tuple appended to ``_pages``.
    _PAGE_FIELDS = ('title', 'id', 'text')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element currently being captured
        self._values = {}          # tag name -> text, for the page being read
        self._current_tag = None   # name of the element being captured, if any
        self._pages = []           # finished (title, id, text) tuples
        self._counter = 0          # not used inside this class; kept for callers
        self._flag = True          # True until the first <id> of a page is stored

    def characters(self, content):
        """Buffer character data while a captured element is open."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when one of the captured elements opens."""
        if name in self._CAPTURED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the captured text on close; emit a page tuple on ``</page>``."""
        if name == self._current_tag:
            # The parser may hand over one text node in several chunks, so the
            # pieces are concatenated as-is; joining them with a separator
            # would inject characters that are not in the document.
            value = ''.join(self._buffer)
            if name != 'id':
                self._values[name] = value
            elif self._flag:
                # Only the first <id> inside a page is kept; later <id>
                # elements of the same page are ignored.
                self._values[name] = value
                self._flag = False
            # Stop capturing so text between elements is not buffered.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._pages.append(
                tuple(self._values.get(field, '') for field in self._PAGE_FIELDS)
            )
            # Start the next page from a clean slate so it cannot inherit
            # values from this one.
            self._values = {}
            self._flag = True

In [3]:
data_path = r"/home/mmartinelli/project/corpora/wikidumps/enwiki-20210720-pages-articles-multistream.xml.bz2"

# Set to an integer to stop once more than that many pages have been
# collected (useful for a quick trial run); None reads the whole dump.
max_pages = None

# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Number of lines fed to the parser so far.
counter = 0

# Decompress with bzcat and feed its output to the parser line by line.
# The context manager closes the pipe and waits for the child process, so
# neither a file handle nor a bzcat process is left behind.
with subprocess.Popen(['bzcat', data_path], stdout=subprocess.PIPE) as bzcat:
    for counter, line in enumerate(bzcat.stdout, start=1):
        parser.feed(line)

        if max_pages is not None and len(handler._pages) > max_pages:
            break

        if counter % 10000 == 0:
            print("Current loop:", counter)
    else:
        # Reached only when the loop was not left through `break`, i.e. the
        # complete stream was consumed.
        if bzcat.wait() != 0:
            raise RuntimeError(
                "bzcat exited with status {} while reading {}".format(
                    bzcat.returncode, data_path))
        # Tell the parser that no more data follows so it can report a
        # truncated or otherwise ill-formed document.
        parser.close()


In [6]:
# Select the pages whose raw wikitext (x[2]) mentions one of the categories,
# keep their titles (x[0]), strip the wiki markup from their text, and build
# one "title\ntext" string per selected page.
# `texts` and `wikified_dishes` have the same length and the same order.

categories = ["Category:Italian cuisine", "Category:Cuisine of Abruzzo", 
              "Category:Cuisine of Apulia", "Category:Cuisine of Basilicata", "Category:Cuisine of Calabria",
              "Category:Cuisine of Campania", "Category:Cuisine of Emilia-Romagna", "Category:Cuisine of Lazio", 
              "Category:Cuisine of Liguria", "Category:Cuisine of Lombardy", "Category:Cuisine of Marche",
              "Category:Cuisine of Molise", "Category:Cuisine of Piedmont", "Category:Cuisine of Sardinia",
              "Category:Cuisine of Sicily", "Category:Cuisine of South Tyrol", "Category:Cuisine of Tuscany",
              "Category:Cuisine of Umbria", "Category:Cuisine of Veneto",
              "Category:Cuisine of Aosta Valley", "Category:Dairy dishes", "Category:Egg dishes", 
              "Category:Flower dishes", "Category:Fruit dishes", "Category:Ginger dishes", "Category:Grain dishes", 
              "Category:Meat dishes", "Category:Mushroom dishes", "Category:Noodle dishes", "Category:Nut dishes", 
              "Category:Pasta dishes", "Category:Tofu dishes", "Category:Tuber dishes", "Category:Vegetable dishes"]

# Plain substring test against the raw wikitext of every parsed page.
texts = [page for page in handler._pages
         if any(cat in page[2] for cat in categories)]
titles = [page[0] for page in texts]

# Clean each selected page exactly once. Building the list up front also
# means `wikified_dishes` exists (as an empty list) when nothing matched.
wikified_dishes = []
for title, _page_id, wikitext in texts:
    body = mwparserfromhell.parse(wikitext).strip_code().strip()
    # Drop everything from the "See also" heading onwards.
    # NOTE(review): the pattern only matches when "==" markers are still
    # present after strip_code() and the spacing is exactly as written -
    # confirm that it ever fires.
    body = re.sub(r"(== See also == | ==See also== )\n *(.)*", "", body, flags=re.DOTALL)
    # Remove any leftover <...> tags.
    body = re.sub(r"<[^>]+>", "", body)
    # NOTE(review): this deletes spaces in pairs, so two spaces between two
    # words join the words together - confirm this is intended rather than
    # collapsing the run to a single space.
    body = re.sub(r"(  )*", "", body)
    wikified_dishes.append("\n".join((title, body)))

print("number of wikified dishes: {}".format(len(wikified_dishes)))

number of wikified dishes: 29


In [13]:
# Write one file per selected page: the page id is the file name and
# "title\ntext" is the content.

path = '/home/mmartinelli/persistent/en-wiki-food'
os.makedirs(path, exist_ok=True)

# `wikified_dishes` was built from `texts` (the category-filtered pages), so
# the ids must come from `texts` as well. Pairing with `handler._pages` would
# label each article with the id of an unrelated page from the start of the
# dump.
assert len(texts) == len(wikified_dishes), "filtered pages and cleaned articles are out of sync"

for page, article in zip(texts, wikified_dishes):
    page_id = page[1]
    file = '{}.txt'.format(page_id)
    # Explicit UTF-8 so non-ASCII article text does not depend on the locale.
    with open(os.path.join(path, file), 'w', encoding='utf-8') as f:
        f.write(article)

#print(os.listdir(path))