In [3]:
import requests
from bs4 import BeautifulSoup
import os
import time
from keras.utils import get_file
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax

import subprocess
import mwparserfromhell
import json

Using TensorFlow backend.


In [4]:
index = requests.get('https://dumps.wikimedia.org/enwiki/').text

In [5]:
soup_index = BeautifulSoup(index, 'html.parser')

In [7]:
soup_index.contents

[<html>
 <head><title>Index of /enwiki/</title></head>
 <body bgcolor="white">
 <h1>Index of /enwiki/</h1><hr/><pre><a href="../">../</a>
 <a href="20180601/">20180601/</a>                                          21-Jul-2018 01:33                   -
 <a href="20180620/">20180620/</a>                                          02-Aug-2018 01:28                   -
 <a href="20180701/">20180701/</a>                                          22-Aug-2018 01:25                   -
 <a href="20180720/">20180720/</a>                                          02-Sep-2018 01:27                   -
 <a href="20180801/">20180801/</a>                                          11-Aug-2018 08:29                   -
 <a href="20180820/">20180820/</a>                                          23-Aug-2018 15:32                   -
 <a href="20180901/">20180901/</a>                                          12-Sep-2018 12:50                   -
 <a href="latest/">latest/</a>                                  

In [8]:
dumps = [a['href'] for a in soup_index.find_all('a') 
             if a.has_attr('href') and a.text[:-1].isdigit()]
dumps

['20180601/',
 '20180620/',
 '20180701/',
 '20180720/',
 '20180801/',
 '20180820/',
 '20180901/']

In [9]:
for dump_url in sorted(dumps, reverse=True):
    print(dump_url)
    dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    pages_xml = [a['href'] for a in soup_dump.find_all('a') 
                 if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]
    if pages_xml:
        break
    time.sleep(0.8)

20180901/


In [12]:
soup_dump.text

"\n\n\n\nenwiki dump progress on 20180901\n\n\n                .siteinfo {\n                        text-align: center;\n                }\n                li {\n                        list-style-type: none;\n                        padding: 0.5em 1.5em 0.5em 1.5em;\n                        background: #fff;\n                        margin-bottom: 1em;\n                }\n                li li {\n                        background-color: white;\n                        box-shadow: none;\n                        border-top: none;\n                        padding: 0px;\n                        margin-bottom: 0em;\n                }\n                li ul {\n                        margin-top: 4px;\n                        margin-bottom: 8px;\n                        box-shadow: none;\n                        border-top: none;\n                        padding: 0.5em 0px 0px;\n                }\n                .detail {\n                        font-weight: normal;\n                     

In [13]:
soup_dump.contents

['html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"',
 '\n',
 <html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
 <title>enwiki dump progress on 20180901</title>
 <link href="/dumps.css" rel="stylesheet" type="text/css"/>
 <style type="text/css">
                 .siteinfo {
                         text-align: center;
                 }
                 li {
                         list-style-type: none;
                         padding: 0.5em 1.5em 0.5em 1.5em;
                         background: #fff;
                         margin-bottom: 1em;
                 }
                 li li {
                         background-color: white;
                         box-shadow: none;
                         border-top: none;
                         padding: 0px;
                         margin-bottom: 0em;
                 }
       

In [14]:
pages_xml[0]

'/enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2'

In [15]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
url = url = 'https://dumps.wikimedia.org/' + pages_xml[0] 
path = get_file(wikipedia_dump, url)
path

Downloading data from https://dumps.wikimedia.org//enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2


'/home/ubuntu/.keras/datasets/enwiki-20180901-pages-articles.xml.bz2'

In [102]:
import re

def process_article(title, text):
    rotten = [(re.findall('\d\d?\d?%', p), re.findall('\d\.\d\/\d+|$', p), p.lower().find('rotten tomatoes')) for p in text.split('\n\n')]
    rating = next(((perc[0], rating[0]) for perc, rating, idx in rotten if len(perc) == 1 and idx > -1), (None, None))
    wikicode = mwparserfromhell.parse(text)
    # return wikicode
    film = next((template for template in wikicode.filter_templates() 
                 if template.name.strip().lower() in ['infobox film']), None)
    if film:
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in film.params
                      if param.value.strip_code().strip()
                     }
        links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        return (title, properties, links) + rating

In [75]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._movies = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            movie = process_article(**self._values)
            if movie:
                self._movies.append(movie)

In [76]:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)
for i, line in enumerate(subprocess.Popen(['bzcat'], stdin=open(path), stdout=subprocess.PIPE).stdout):
    x = line
    try:
        parser.feed(line)
    except StopIteration:
        break
    if handler._movies:
        break

In [78]:
handler._movies

[('Actrius',
  {'name': 'Actresses',
   'image': 'Actrius film poster.jpg',
   'caption': 'Catalan language film poster',
   'film name': '(Catalan: Actrius)',
   'director': 'Ventura Pons',
   'producer': 'Ventura Pons',
   'writer': 'Josep Maria Benet i Jornet',
   'screenplay': 'Ventura Pons',
   'narrator': '< !-- or:',
   'narrators': '-- >',
   'music': 'Carles Cases',
   'cinematography': 'Tomàs Pladevall',
   'editing': 'Pere Abadal',
   'distributor': 'Buena Vista International',
   'runtime': '100 minutes',
   'country': 'Spain',
   'language': 'Catalan',
   'gross': '< !--(please use condensed and rounded values, e.g.  " £11.6 million "  not  " £11,586,221 " )-- >'},
  ['Catalan language',
   'Ventura Pons',
   'Josep Maria Benet i Jornet',
   'Núria Espert',
   'Rosa Maria Sardà',
   'Anna Lizaran',
   'Mercè Pons',
   'Canal+',
   'Generalitat de Catalunya',
   'Televisión Española',
   'Buena Vista International',
   'Spain',
   'Catalan language',
   'Catalan language',


In [101]:
process.filter_wikilinks()[0]

'[[Catalan language|Catalan]]'

In [98]:
process.filter_wikilinks()[0].title

'Catalan language'

In [99]:
x = process.filter_wikilinks()[0]

In [100]:
x.text

'Catalan'

In [94]:
process.filter_wikilinks()[1].title

'Ventura Pons'

In [80]:
process = process_article(**handler._values)
process.filter_templates()

['{{Use dmy dates|date=June 2013}}',
 '{{Infobox film \n | name           = Actresses \n | image          = Actrius film poster.jpg \n | alt            =  \n | caption        = Catalan language film poster \n | film name      = ([[Catalan language|Catalan]]: \'\'\'\'\'Actrius\'\'\'\'\') \n | director       = [[Ventura Pons]] \n | producer       = Ventura Pons \n | writer         = [[Josep Maria Benet i Jornet]] \n | screenplay     = Ventura Pons \n | story          =  \n | based on       = {{based on|(stage play) \'\'E.R.\'\'|Josep Maria Benet i Jornet}} \n | starring       = {{ubl|[[Núria Espert]]|[[Rosa Maria Sardà]]|[[Anna Lizaran]]|[[Mercè Pons]]}} \n | narrator       =  < !-- or: |narrators = -- > \n | music          = Carles Cases \n | cinematography = Tomàs Pladevall \n | editing        = Pere Abadal \n | production companies = {{ubl|[[Canal+|Canal+ España]]|Els Films de la Rambla S.A.|[[Generalitat de Catalunya|Generalitat de Catalunya - Departament de Cultura]]|[[Televisión Es

In [83]:
process.filter_templates()[1].name.strip().lower()

'infobox film'

In [85]:
template = process.filter_templates()[1]

In [90]:
template.params[1].name.strip_code().strip()

'image'

In [91]:
template.params[1].value.strip_code().strip()

'Actrius film poster.jpg'

In [60]:
handler._values

{'title': 'Actrius',
 'text': '{{Use dmy dates|date=June 2013}} \n {{Infobox film \n | name           = Actresses \n | image          = Actrius film poster.jpg \n | alt            =  \n | caption        = Catalan language film poster \n | film name      = ([[Catalan language|Catalan]]: \'\'\'\'\'Actrius\'\'\'\'\') \n | director       = [[Ventura Pons]] \n | producer       = Ventura Pons \n | writer         = [[Josep Maria Benet i Jornet]] \n | screenplay     = Ventura Pons \n | story          =  \n | based on       = {{based on|(stage play) \'\'E.R.\'\'|Josep Maria Benet i Jornet}} \n | starring       = {{ubl|[[Núria Espert]]|[[Rosa Maria Sardà]]|[[Anna Lizaran]]|[[Mercè Pons]]}} \n | narrator       =  < !-- or: |narrators = -- > \n | music          = Carles Cases \n | cinematography = Tomàs Pladevall \n | editing        = Pere Abadal \n | production companies = {{ubl|[[Canal+|Canal+ España]]|Els Films de la Rambla S.A.|[[Generalitat de Catalunya|Generalitat de Catalunya - Departament 

In [44]:
handler._curent_tag

'title'

In [105]:
parser.feed(line)

SAXParseException: <unknown>:34227622:9: parsing finished

In [106]:
handler._curent_tag

'text'

In [107]:
len(handler._movies)

1647

In [108]:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)

In [None]:
from timeit import default_timer as timer

start = timer()
reported_length = 0

for i, line in enumerate(subprocess.Popen(['bzcat'], stdin=open(path), stdout=subprocess.PIPE).stdout):
    try:
        parser.feed(line)
    except StopIteration:
        break    
    
    if ((len(handler._movies) + 1) % 10 == 0) and (len(handler._movies) + 1 != reported_length):
        
        print(f'Found {len(handler._movies) + 1} so far. {round(timer() - start)} seconds elapsed.')
        
        # Prevent multiple reportings of the same length
        reported_length = len(handler._movies) + 1

Found 10 so far. 0 seconds elapsed.
Found 20 so far. 121 seconds elapsed.
Found 30 so far. 233 seconds elapsed.
Found 40 so far. 315 seconds elapsed.
Found 50 so far. 341 seconds elapsed.
Found 60 so far. 508 seconds elapsed.
Found 70 so far. 642 seconds elapsed.
Found 80 so far. 811 seconds elapsed.
Found 90 so far. 912 seconds elapsed.
Found 100 so far. 937 seconds elapsed.
Found 110 so far. 956 seconds elapsed.
Found 120 so far. 1025 seconds elapsed.
Found 130 so far. 1158 seconds elapsed.
Found 140 so far. 1210 seconds elapsed.
Found 150 so far. 1222 seconds elapsed.
Found 160 so far. 1256 seconds elapsed.
Found 170 so far. 1279 seconds elapsed.
Found 180 so far. 1366 seconds elapsed.
Found 190 so far. 1416 seconds elapsed.
Found 200 so far. 1469 seconds elapsed.
Found 210 so far. 1540 seconds elapsed.
Found 220 so far. 1604 seconds elapsed.
Found 230 so far. 1622 seconds elapsed.
Found 240 so far. 1626 seconds elapsed.
Found 250 so far. 1627 seconds elapsed.
Found 260 so far. 1631

Found 2040 so far. 6330 seconds elapsed.
Found 2050 so far. 6373 seconds elapsed.
Found 2060 so far. 6398 seconds elapsed.
Found 2070 so far. 6434 seconds elapsed.
Found 2080 so far. 6454 seconds elapsed.
Found 2090 so far. 6490 seconds elapsed.
Found 2100 so far. 6530 seconds elapsed.
Found 2110 so far. 6554 seconds elapsed.
Found 2120 so far. 6587 seconds elapsed.
Found 2130 so far. 6639 seconds elapsed.
Found 2140 so far. 6659 seconds elapsed.
Found 2150 so far. 6692 seconds elapsed.
Found 2160 so far. 6716 seconds elapsed.
Found 2170 so far. 6736 seconds elapsed.
Found 2180 so far. 6762 seconds elapsed.
Found 2190 so far. 6778 seconds elapsed.
Found 2200 so far. 6799 seconds elapsed.
Found 2210 so far. 6816 seconds elapsed.
Found 2220 so far. 6849 seconds elapsed.
Found 2230 so far. 6864 seconds elapsed.
Found 2240 so far. 6891 seconds elapsed.
Found 2250 so far. 6920 seconds elapsed.
Found 2260 so far. 6922 seconds elapsed.
Found 2270 so far. 6939 seconds elapsed.
Found 2280 so fa

Found 4040 so far. 10257 seconds elapsed.
Found 4050 so far. 10263 seconds elapsed.
Found 4060 so far. 10280 seconds elapsed.
Found 4070 so far. 10292 seconds elapsed.
Found 4080 so far. 10316 seconds elapsed.
Found 4090 so far. 10337 seconds elapsed.
Found 4100 so far. 10343 seconds elapsed.
Found 4110 so far. 10366 seconds elapsed.
Found 4120 so far. 10391 seconds elapsed.
Found 4130 so far. 10417 seconds elapsed.
Found 4140 so far. 10427 seconds elapsed.
Found 4150 so far. 10442 seconds elapsed.
Found 4160 so far. 10456 seconds elapsed.
Found 4170 so far. 10471 seconds elapsed.
Found 4180 so far. 10486 seconds elapsed.
Found 4190 so far. 10500 seconds elapsed.
Found 4200 so far. 10522 seconds elapsed.
Found 4210 so far. 10546 seconds elapsed.
Found 4220 so far. 10554 seconds elapsed.
Found 4230 so far. 10569 seconds elapsed.
Found 4240 so far. 10578 seconds elapsed.
Found 4250 so far. 10581 seconds elapsed.
Found 4260 so far. 10591 seconds elapsed.
Found 4270 so far. 10614 seconds e

Found 6000 so far. 13030 seconds elapsed.
Found 6010 so far. 13047 seconds elapsed.
Found 6020 so far. 13063 seconds elapsed.
Found 6030 so far. 13073 seconds elapsed.
Found 6040 so far. 13086 seconds elapsed.
Found 6050 so far. 13104 seconds elapsed.
Found 6060 so far. 13129 seconds elapsed.
Found 6070 so far. 13143 seconds elapsed.
Found 6080 so far. 13159 seconds elapsed.
Found 6090 so far. 13168 seconds elapsed.
Found 6100 so far. 13181 seconds elapsed.
Found 6110 so far. 13196 seconds elapsed.
Found 6120 so far. 13204 seconds elapsed.
Found 6130 so far. 13215 seconds elapsed.
Found 6140 so far. 13224 seconds elapsed.
Found 6150 so far. 13238 seconds elapsed.
Found 6160 so far. 13256 seconds elapsed.
Found 6170 so far. 13267 seconds elapsed.
Found 6180 so far. 13286 seconds elapsed.
Found 6190 so far. 13301 seconds elapsed.
Found 6200 so far. 13313 seconds elapsed.
Found 6210 so far. 13318 seconds elapsed.
Found 6220 so far. 13328 seconds elapsed.
Found 6230 so far. 13341 seconds e

Found 7960 so far. 15220 seconds elapsed.
Found 7970 so far. 15240 seconds elapsed.
Found 7980 so far. 15249 seconds elapsed.
Found 7990 so far. 15257 seconds elapsed.
Found 8000 so far. 15270 seconds elapsed.
Found 8010 so far. 15283 seconds elapsed.
Found 8020 so far. 15307 seconds elapsed.
Found 8030 so far. 15326 seconds elapsed.
Found 8040 so far. 15330 seconds elapsed.
Found 8050 so far. 15344 seconds elapsed.
Found 8060 so far. 15357 seconds elapsed.
Found 8070 so far. 15365 seconds elapsed.
Found 8080 so far. 15381 seconds elapsed.
Found 8090 so far. 15395 seconds elapsed.
Found 8100 so far. 15408 seconds elapsed.
Found 8110 so far. 15415 seconds elapsed.
Found 8120 so far. 15422 seconds elapsed.
Found 8130 so far. 15429 seconds elapsed.
Found 8140 so far. 15443 seconds elapsed.
Found 8150 so far. 15449 seconds elapsed.
Found 8160 so far. 15472 seconds elapsed.
Found 8170 so far. 15488 seconds elapsed.
Found 8180 so far. 15503 seconds elapsed.
Found 8190 so far. 15515 seconds e

Found 9920 so far. 17445 seconds elapsed.
Found 9930 so far. 17457 seconds elapsed.
Found 9940 so far. 17479 seconds elapsed.
Found 9950 so far. 17502 seconds elapsed.
Found 9960 so far. 17514 seconds elapsed.
Found 9970 so far. 17518 seconds elapsed.
Found 9980 so far. 17524 seconds elapsed.
Found 9990 so far. 17532 seconds elapsed.
Found 10000 so far. 17545 seconds elapsed.
Found 10010 so far. 17556 seconds elapsed.
Found 10020 so far. 17573 seconds elapsed.
Found 10030 so far. 17587 seconds elapsed.
Found 10040 so far. 17606 seconds elapsed.
Found 10050 so far. 17627 seconds elapsed.
Found 10060 so far. 17641 seconds elapsed.
Found 10070 so far. 17651 seconds elapsed.
Found 10080 so far. 17662 seconds elapsed.
Found 10090 so far. 17674 seconds elapsed.
Found 10100 so far. 17683 seconds elapsed.
Found 10110 so far. 17705 seconds elapsed.
Found 10120 so far. 17713 seconds elapsed.
Found 10130 so far. 17724 seconds elapsed.
Found 10140 so far. 17737 seconds elapsed.
Found 10150 so far.

In [None]:
with open('generated/wp_movies.ndjson', 'wt') as fout:
    for movie in handler._movies:
         fout.write(json.dumps(movie) + '\n')