In [1]:
import json

data_path = '/home/ubuntu/.keras/datasets/'

import os

files = [data_path + f for f in os.listdir(data_path) if 'xml-p' in f]
len(files)

55

In [2]:
import xml.sax
import mwparserfromhell

In [23]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Parse through XML data using SAX"""
    def __init__(self, inspect):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._books = []
        self._article_count = 0
        self._non_matches = []
        self._inspect = inspect
        
    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the page is a book
            book = process_article(**self._values, template = 'Infobox book', inspect = self._inspect)
            # Append to the list of books
            if book:
                self._books.append(book)

In [76]:
def process_article(title, text, timestamp, template = 'Infobox book', inspect = False):
    """Process a wikipedia article looking for template"""
    
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    
    # Search through templates for the template
    matches = wikicode.filter_templates(matches = template)
    
    # Filter out errant matches
    matches = [x for x in matches if x.name.strip_code().strip().lower() == template.lower()]
    
    if len(matches) >= 1:
        
        if 'Synopsis' in wikicode or 'Plot' in wikicode or 'Plot summary' in wikicode:
        
            # template_name = matches[0].name.strip_code().strip()

            # Extract information from infobox
            properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                          for param in matches[0].params
                          if param.value.strip_code().strip()}

            # Extract internal wikilinks
            wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]

            # Extract external links
            exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]

            # Find approximate length of article
            text_length = len(wikicode.strip_code().strip())

            if inspect:
                return (title, properties, wikilinks, exlinks, timestamp, text_length, text, wikicode)

            return (title, properties, wikilinks, exlinks, timestamp, text_length)

In [77]:
data_path = files[0]

In [78]:
import subprocess 

# Want to inspect wikicode
handler = WikiXmlHandler(True)
parser = xml.sax.make_parser()

parser.setContentHandler(handler)

lines = []

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    if len(handler._books) > 3:
        break

In [75]:
'Synopsis' in wikicode

True

In [90]:
wikicode.get_ancestors()

TypeError: get_ancestors() missing 1 required positional argument: 'obj'

In [82]:
handler._books[0][-2]

'{{Infobox book \n | name          = Summa Iniuria: Ein Pitaval der Justizirrtümer \n | image         =  \n | author        = Hans M. Sutermeister \n | country       = [[Switzerland]] \n | language      = German \n | subject       = [[Miscarriage of justice]] \n | genre         = non fiction \n | release_date  = 1976 \n | media_type    = Print (Paperback) \n | pages         = 810 \n | isbn          = 3226000969 \n }} \n \'\'\'\'\'Summa Iniuria: Ein Pitaval der Justizirrtümer\'\'\'\'\' (\'\'\'\'\'Summa Iniuria: A Pitaval of Miscarriages of Justice\'\'\'\'\') is a collection of \'\'[[Cause célèbre|causes célèbres]]\'\' by the Swiss author Hans M. Sutermeister.  It is considered “one of the most detailed documentations about [[miscarriage of justice|miscarriages of justice]] in the German language”. < ref > Gilliéron, G. \'\'Strafbefehlsverfahren und plea bargaining als Quelle von Fehlurteilen.\'\' Zurich: Schulthess, 2010, p. 15. {{ISBN|978-3-7255-6021-9}} < /ref >   It is inspired by [[

In [81]:
handler._books[1][-2]

'{{Multiple issues| \n {{notability|Books|date=April 2012}} \n {{all plot|date=July 2012}} \n }} \n \n {{Infobox book \n |author = [[Jane Lindskold]] \n |publisher = Tom Doherty Associates}} \n \'\'\'\'\'The Buried Pyramid\'\'\'\'\' is a book written in 2004 by [[Jane Lindskold]] and published by [[Tom Doherty Associates]]. \n \n ==Plot== \n A British soldier named Captain Neville Hawthorne is order by his commanding officer, Colonel Reginald Sedgewick to escort [[Albert, Prince Consort|Prince Albert]]\'s cousin, Alphonse  " Herr "  Libermann, a German archaeologist. Which Alphone tells Neville that he\'s searching for the Buried Pyramid, the Tomb of Neferankhotep, who may also have been [[Moses]] the Lawgiver and that a lady gave him the journal of an explorer named Chad Spice. \n \n A soldier named Sergeant Edward  " Eddie "  Bryce joins them on their search for the Buried Pyramid along with Alphonse\'s assistant, Derek, and three camel wranglers named Ali, his son, Ishmael, and his 

In [79]:
handler._books[2][-2]

"{{Infobox book \n | name              = Fear the Darkness \n | image             =  \n | caption           =  \n | alt               =  \n | author            = [[Mitchel Scanlon]] \n | cover_artist      =  \n | series            = ''[[Judge Anderson]]'' \n | subject           = [[Mega-City_One#History|Set in the year]] 2128 \n | published         = February 2006 \n | publisher         = [[Black Flame]] \n | pages             = 256 \n | isbn              = 1844163261 \n | preceded_by       = None \n | followed_by       = [[Red Shadows (Judge Anderson novel)|Red Shadows]] \n }} \n '''''Fear the Darkness''''' is an original [[science fiction]] novel written by Mitchel Scanlon and based on the British [[Comic book|comic strip]] ''[[Judge Anderson|Anderson:Psi Division]]'' (a spin-off from ''[[Judge Dredd]]'') in ''[[2000 AD (comics)|2000 AD]]''. It is Scanlon's first ''Anderson'' novel. \n \n ==Synopsis== \n Psychic Cassandra Anderson investigates a series of mysterious deaths of prisone

In [80]:
handler._books[3][-2]

"{{Infobox book \n | name              = Red Shadows \n | image             =  \n | caption           =  \n | alt               =  \n | author            = [[Mitchel Scanlon]] \n | cover_artist      =  \n | series            = ''[[Judge Anderson]]'' \n | subject           = [[Mega-City_One#History|Set in the year]] 2128 \n | published         = May 2006 \n | publisher         = [[Black Flame]] \n | pages             = 252 \n | isbn              = 1844163776 \n | preceded_by       = [[Fear the Darkness]] \n | followed_by       = [[Sins of the Father (Judge Anderson novel)|Sins of the Father]] \n }} \n '''''Red Shadows''''' is an original [[science fiction]] novel written by Mitchel Scanlon and based on the British [[Comic book|comic strip]] ''[[Judge Anderson|Anderson:Psi Division]]'' (a spin-off from ''[[Judge Dredd]]'') in ''[[2000 AD (comics)|2000 AD]]''. It is Scanlon's second ''Anderson'' novel. \n \n ==Synopsis== \n Anderson pursues a vicious serial killer, only to become his next

In [56]:
wikicode = handler._books[3][-1]

In [63]:
import re

re.findall(pattern = '==Synopsis[.*]==', string = str(wikicode))

[]

In [54]:
str(wikicode)

'{{Multiple issues| \n {{notability|Books|date=April 2012}} \n {{all plot|date=July 2012}} \n }} \n \n {{Infobox book \n |author = [[Jane Lindskold]] \n |publisher = Tom Doherty Associates}} \n \'\'\'\'\'The Buried Pyramid\'\'\'\'\' is a book written in 2004 by [[Jane Lindskold]] and published by [[Tom Doherty Associates]]. \n \n ==Plot== \n A British soldier named Captain Neville Hawthorne is order by his commanding officer, Colonel Reginald Sedgewick to escort [[Albert, Prince Consort|Prince Albert]]\'s cousin, Alphonse  " Herr "  Libermann, a German archaeologist. Which Alphone tells Neville that he\'s searching for the Buried Pyramid, the Tomb of Neferankhotep, who may also have been [[Moses]] the Lawgiver and that a lady gave him the journal of an explorer named Chad Spice. \n \n A soldier named Sergeant Edward  " Eddie "  Bryce joins them on their search for the Buried Pyramid along with Alphonse\'s assistant, Derek, and three camel wranglers named Ali, his son, Ishmael, and his 

In [50]:
wikicode.filter_text(matches = '==Plot==')

['. \n \n ==Plot== \n A British soldier named Captain Neville Hawthorne is order by his commanding officer, Colonel Reginald Sedgewick to escort ']

In [29]:
handler._inspect

True

In [None]:
lines[:100]

In [None]:
line.decode('utf-8')

In [None]:
!gunzip --help