## Usage of the Python script

In [1]:
import reader

In [2]:
%%time
# Ask reader for the size (number of results) of the 'peano' request before
# downloading it; the whole cell is timed by %%time.
peano_size = reader.query_size('peano')
print(peano_size)

491
Wall time: 2.11 s


In [3]:
%%time
# Download the 'peano' request as a plain search (is_category=False) with no
# page limit (limit=None) and batch_size=100. The returned path is kept so a
# later cell can open the '100.json' file underneath it.
peano_path = reader.query('peano', batch_size=100, limit=None, is_category=False, debug_info=True)

Dumped 100 pages
Dumped 200 pages
Dumped 300 pages
Dumped 400 pages
Dumped 491 pages
Wall time: 29.7 s


In [4]:
%%time
# Same call for the 'Mathematics' category (is_category=True), capped at
# limit=1000 pages and again dumped with batch_size=100.
math_path = reader.query('Mathematics', batch_size=100, limit=1000, is_category=True, debug_info=True)

Dumped 100 pages
Dumped 200 pages
Dumped 300 pages
Dumped 400 pages
Dumped 500 pages
Dumped 600 pages
Dumped 700 pages
Dumped 800 pages
Dumped 900 pages
Dumped 1000 pages
Wall time: 34.1 s


In [6]:
%%time
# Repeat the previous request to show the cache: per the output below, reader
# reports the request as already downloaded and returns an equal path in about
# a millisecond instead of fetching the pages again.
math_path2 = reader.query('Mathematics', batch_size=100, limit=1000, is_category=True, debug_info=True)
print(math_path == math_path2)

Request has already been downloaded
True
Wall time: 1.02 ms


In [13]:
import json
import pprint

# Pretty-printer with a two-space indent for readable nested output.
pp = pprint.PrettyPrinter(indent=2)

# Open the '100.json' batch file of the 'peano' request and print the fourth
# entry of its 'pages' list.
# NOTE(review): the debug `_dump`/`query` shown later in this notebook write a
# bare JSON list (no 'pages' key) - confirm that reader's on-disk format really
# matches the data['pages'] access used here.
with open(peano_path / '100.json', 'r', encoding='utf8') as json_file:  
    data = json.load(json_file)
    pp.pprint(data['pages'][3])

{ 'text': 'thumb|right|Giuseppe Peano \n'
          'thumb|right|First recorded usage of the symbol ϵ for set '
          'membership.\n'
          '\n'
          'The 1889 treatise Arithmetices principia, nova methodo exposita '
          '(The principles of arithmetic, presented by a new method; 1889) by '
          'Giuseppe Peano is a seminal document in mathematical logic and set '
          'theory, introducing what is now the standard axiomatization of the '
          'natural numbers, and known as the Peano axioms, as well as some '
          'pervasive notations, such as the symbols for the basic set '
          'operations ∈, ⊂, ∩, ∪, and A−B.\n'
          '\n'
          'The treatise is written in Latin, which was already somewhat '
          'unusual at the time of publication, Latin having fallen out of '
          'favour as the lingua franca of scholarly communications by the end '
          'of the 19th century. The use of Latin in spite of this reflected '
          "P

## Implementation (for debugging purposes)

In [2]:
from pathlib import Path
import pywikibot
from pywikibot import pagegenerators
import json
import mwparserfromhell as mwp

def _clean(wiki_text):
    """Return ``wiki_text`` with its wiki markup stripped.

    The text is parsed with mwparserfromhell and the parsed wikicode is
    reduced via ``strip_code()``.
    """
    return mwp.parse(wiki_text).strip_code()

def _dump(path, data):
    with open(path, 'w', encoding='utf8') as outfile:  
        json.dump(data, outfile, indent=2, ensure_ascii=False)
        
def query_size(request):
    """Return how many namespace-0 search results ``request`` produces.

    Runs ``site.search`` on the default pywikibot site, restricted to
    namespace 0, and counts the pages it yields.
    """
    hits = pywikibot.Site().search(request, namespaces=[0])
    return len(list(hits))

def query(request, batch_size=100, limit=1000, is_category=False, debug_info=True):
    """Download the pages matching ``request`` and dump them as JSON batches.

    Pages are written below ``../requests/<request>/``. Every ``batch_size``
    pages a file named after the running page count is written (``100.json``,
    ``200.json``, ...), followed by one last file for any remainder. Each file
    contains a list of ``{'title', 'url', 'text'}`` records, where ``text`` is
    the page text passed through ``_clean``.

    If the target directory already exists, the request is treated as already
    downloaded and nothing is fetched. The directory name depends only on
    ``request``, so calls that differ only in ``limit``, ``batch_size`` or
    ``is_category`` share the same cache entry.

    Args:
        request: Search term, or category name when ``is_category`` is true.
        batch_size: Number of pages per JSON file; must be at least 1.
        limit: Maximum number of pages to fetch (passed as ``total``);
            ``None`` means no limit.
        is_category: If true, fetch the articles of the category named
            ``request`` (including subcategories) instead of running a search.
        debug_info: If true, print progress messages.

    Returns:
        pathlib.Path: The directory holding the dumped batches.

    Raises:
        ValueError: If a download is needed and ``batch_size`` is below 1.
    """
    # Local import: only the failure clean-up below needs it.
    import shutil

    requests_path = Path('../requests') / request

    # An existing directory is treated as a finished download.
    if requests_path.exists():
        if debug_info:
            print('Request has already been downloaded')
        return requests_path

    # Reject sizes that would break the `count % batch_size` check below,
    # before anything is created on disk.
    if batch_size < 1:
        raise ValueError(
            'batch_size must be a positive integer, got {!r}'.format(batch_size))

    requests_path.mkdir(parents=True, exist_ok=True)

    try:
        site = pywikibot.Site()

        if is_category:
            category = pywikibot.Category(site, request)
            pages = category.articles(namespaces=[0],  # 0 = main (article) namespace
                                      recurse=True,  # also include subcategories
                                      total=limit,
                                      content=True)  # preload page content
        else:
            pages = site.search(request,
                                total=limit,
                                content=True,  # preload page content
                                namespaces=[0])  # 0 = main (article) namespace

        count = 0
        data = []
        # Consume the generator lazily instead of materializing every page in
        # a list first, so finished batches can be released after each dump.
        for p in pages:
            count += 1
            data.append({
                'title': p.title(),
                'url': p.full_url(),
                'text': _clean(p.text),
            })

            if count % batch_size == 0:
                _dump(requests_path / (str(count) + '.json'), data)
                data = []
                if debug_info:
                    print('Dumped {} pages'.format(count))

        # Flush the final, partially filled batch.
        if data:
            _dump(requests_path / (str(count) + '.json'), data)
            if debug_info:
                print('Dumped {} pages'.format(count))
    except BaseException:
        # The directory doubles as the "already downloaded" marker, so remove
        # it when the download fails or is interrupted; otherwise the next
        # call would return an empty or partial result as if it were complete.
        shutil.rmtree(requests_path, ignore_errors=True)
        raise

    return requests_path

In [3]:
%time
query('peano', limit=None)

Wall time: 0 ns
Dumped 100 pages
Dumped 200 pages
Dumped 300 pages
Dumped 400 pages
Dumped 491 pages


WindowsPath('../requests/peano')