In [78]:
import xml.sax
import subprocess
import mwparserfromhell
import re
import os

In [79]:
# Function where ContentHandler looks for opening and closing tags title and text 
# and adds characters enclosed within them to the buffer
# content saved to a dict with tag as key

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'id', 'text', 'timestamp'):         #do we need timestamp?
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['id'], self._values['text']))

In [80]:
data_path = r"/home/mmartinelli/project/corpora/wikidumps/itwiki-20210720-pages-articles-multistream.xml.bz2"
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

lst = []
# Iterating through compressed file
for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    
    parser.feed(line)
    # Stop after nth article reached
    if len(handler._pages) > 20000:
        break

In [81]:
# Append all articles that have the strings defined in categories in wikified_dishes list 
# the list has tuples with [0] being the title and [1] being the text

wikified_dishes = []
categories = ["Categoria:Antipasti", "Categoria:Contorni", "Categoria:Dolci", "Categoria:Involtini", 
              "Categoria:Piatti unici", "Categoria:Primi piatti", "Categoria:Secondi piatti"]
for x in handler._pages:
    if any(cat in x[2] for cat in categories):
        wikified_dishes.append(x)
    else:
        pass
print(len(wikified_dishes))
        

3


In [82]:
# Create title list and append only titles (element 0 of tuples in wikified_dishes)

title_lst = [el[0] for el in wikified_dishes]

# Create id list and append only ids (el 1 of tuples in wikified_dishes)

id_lst = [el[1] for el in wikified_dishes]

# Create text list and append only texts (element 2 of tuples in wikified_dishes)

text_lst = [el[2] for el in wikified_dishes]

In [83]:
# Parse text list

text_lst = [mwparserfromhell.parse(text) for text in text_lst]

# Clean texts (although no apparent change from mwparserfromhell.parse())

text_lst = [text.strip_code().strip() for text in text_lst]

In [84]:
# Clean texts from 'Note' until the end and other undeleted tags with regex

clean_text_lst = [re.sub(r"(== Note == | ==Note== )\n *(.)*", "", el, flags=re.DOTALL) for el in text_lst]
clean_text_lst = [re.sub(r"( < ref > | < /ref > )", "", el) for el in clean_text_lst]
clean_text_lst = [re.sub(r"<[^>]+>", "", el) for el in clean_text_lst]

In [85]:
# Join id and text

id_text = ['\n'.join(x) for x in zip(id_lst, clean_text_lst)]

# Join title and text

wiki = ['\n'.join(x) for x in zip(title_lst, id_text)]

In [86]:
# DELETE FROM HERE

#path = '/home/jupyter-margherita/corpora/wiki-it-food'
#file_name = "test.txt"
#completeName = os.path.join(save_path, file_name)
#print(completeName)

# DELETE UNTIL HERE

/home/jupyter-margherita/corpora/wiki-it-food/test.txt


In [87]:
path = '/home/jupyter-margherita/corpora/wiki-it-food'

for ids, article in zip(wikified_dishes, wiki):
    ids = ids[1]
    article =article
    file = '{}.txt'.format(ids)
    with open(os.path.join(path, file), 'w') as f:
        f.write('{}'.format(article))

print(os.listdir(save_path))

['453768.txt', '767487.txt', '37401.txt']
