## Usage of the Python script

In [1]:
import reader

In [2]:
%%time
# Ask the external `reader` module for the size of the 'peano' request and show it.
# NOTE(review): presumably this is the number of search hits, as in the notebook-local
# query_size in the Implementation section - confirm against reader.py.
peano_size = reader.query_size('peano')
print(peano_size)

491
CPU times: user 101 ms, sys: 12.1 ms, total: 113 ms
Wall time: 1.18 s


In [4]:
%%time
# Download the 'peano' request as a plain search (is_category=False) with no page limit,
# in batches of 100. force_rewrite=True is passed so that previously stored data is
# replaced (see the "Cleaning old data" line in the output); debug_info=True enables the
# progress log. The returned value is kept for reading the dump later.
peano_path = reader.query('peano', batch_size=100, limit=None,
                          is_category=False, debug_info=True, force_rewrite=True)

Cleaning old data
Dumped 100 pages | peano
Dumped 200 pages | peano
Dumped 300 pages | peano
Dumped 400 pages | peano
Dumped 491 pages | peano
CPU times: user 14.5 s, sys: 141 ms, total: 14.7 s
Wall time: 21.2 s


In [4]:
%%time
# Download the 'Mathematics' request as a category (is_category=True), capped at 1000 pages.
math_path = reader.query('Mathematics', batch_size=100, limit=1000, is_category=True, debug_info=True)

Dumped 100 pages
Dumped 200 pages
Dumped 300 pages
Dumped 400 pages
Dumped 500 pages
Dumped 600 pages
Dumped 700 pages
Dumped 800 pages
Dumped 900 pages
Dumped 1000 pages
Wall time: 34.1 s


In [6]:
%%time
# Repeat the same 'Mathematics' request without force_rewrite. Per the recorded output the
# call reports that the request was already downloaded and returns an equal path.
math_path2 = reader.query('Mathematics', batch_size=100, limit=1000, is_category=True, debug_info=True)
print(math_path == math_path2)

Request has already been downloaded
True
Wall time: 1.02 ms


In [13]:
import json
import pprint

pp = pprint.PrettyPrinter(indent=2)

# Load the '100.json' batch file of the 'peano' dump and show the entry at
# index 3 of its 'pages' list.
with (peano_path / '100.json').open(mode='r', encoding='utf8') as json_file:
    data = json.load(json_file)

pp.pprint(data['pages'][3])

{ 'text': 'thumb|right|Giuseppe Peano \n'
          'thumb|right|First recorded usage of the symbol ϵ for set '
          'membership.\n'
          '\n'
          'The 1889 treatise Arithmetices principia, nova methodo exposita '
          '(The principles of arithmetic, presented by a new method; 1889) by '
          'Giuseppe Peano is a seminal document in mathematical logic and set '
          'theory, introducing what is now the standard axiomatization of the '
          'natural numbers, and known as the Peano axioms, as well as some '
          'pervasive notations, such as the symbols for the basic set '
          'operations ∈, ⊂, ∩, ∪, and A−B.\n'
          '\n'
          'The treatise is written in Latin, which was already somewhat '
          'unusual at the time of publication, Latin having fallen out of '
          'favour as the lingua franca of scholarly communications by the end '
          'of the 19th century. The use of Latin in spite of this reflected '
          "P

## Implementation (for debugging purposes)

In [7]:
from pathlib import Path
import pywikibot
from pywikibot import pagegenerators
import json
import mwparserfromhell as mwp

def _clean(wiki_text):
    """Parse `wiki_text` with mwparserfromhell and return the result of strip_code()."""
    return mwp.parse(wiki_text).strip_code()

def _dump(path, data):
    with open(path, 'w', encoding='utf8') as outfile:  
        outfile.write("\n".join(data))
#         json.dump(data, outfile, indent=2, ensure_ascii=False)
        
def query_size(request):
    """Return the number of namespace-0 pages that the site search yields for `request`."""
    hits = pywikibot.Site().search(request, namespaces=[0])
    return len(list(hits))

def get_requests_path(request, requests_base='../requests'):
    """Return the dump directory for `request`, creating it when it is missing.

    Args:
        request: Name of the request; used as the sub-directory name.
        requests_base: Directory that holds one sub-directory per request.
            Defaults to '../requests', relative to the current working directory.

    Returns:
        A tuple ``(requests_path, existed)``: the pathlib.Path of the request
        directory, and whether that path already existed before this call.
    """
    requests_path = Path(requests_base) / request

    existed = requests_path.exists()
    if not existed:
        # exist_ok=True: a concurrent caller may create the directory between
        # the exists() check above and this mkdir(); that must not raise.
        requests_path.mkdir(parents=True, exist_ok=True)

    return (requests_path, existed)

def query(request, batch_size=100, limit=1000, is_category=False,
          preload_content=True, force_rewrite=True, debug_info=True):
    """Fetch wiki pages for `request` and dump them to disk in batches.

    Pages are consumed lazily from the pywikibot generator, so this function
    holds at most one batch of serialized records at a time instead of the
    whole result set.

    Args:
        request: Search string, or the category name when `is_category` is
            true. Also used as the name of the output directory.
        batch_size: Number of pages written per output file.
        limit: Forwarded to pywikibot as `total` (maximum number of pages).
        is_category: If true, iterate the articles of the category `request`
            (including subcategories); otherwise run a site search.
        preload_content: Forwarded to pywikibot as `content`.
        force_rewrite: Only relevant when the request directory already
            exists: if true, its entries are deleted and the download is
            repeated; if false, the existing directory is returned untouched.
        debug_info: If true, print progress messages.

    Returns:
        pathlib.Path of the directory holding the dump. Each file is named
        '<number of pages dumped so far>.json' and contains one JSON object
        per line with the keys "title", "url" and "text".
    """
    requests_path, existed = get_requests_path(request)

    if existed:
        if not force_rewrite:
            if debug_info:
                print('Request has already been downloaded')
            return requests_path

        if debug_info:
            print('Cleaning old data')
        for x in requests_path.iterdir():
            x.unlink()

    site = pywikibot.Site()
    if is_category:
        category = pywikibot.Category(site, request)
        pages = category.articles(namespaces=[0],  # type of entities to query, 0 = page
                                  recurse=True,  # also descend into subcategories
                                  total=limit,
                                  content=preload_content)  # preload page content
    else:
        pages = site.search(request,
                            total=limit,
                            content=preload_content,  # preload page content
                            namespaces=[0])  # type of entities to query, 0 = page

    def _flush(batch, dumped):
        # Write one batch; the file is named after the running page count.
        _dump(requests_path / (str(dumped) + '.json'), batch)
        if debug_info:
            print('Dumped {} pages | {}'.format(dumped, request))

    count = 0
    data = []
    for count, p in enumerate(pages, start=1):
        data.append(json.dumps({
            "title": p.title(),
            "url": p.full_url(),
            "text": _clean(p.text),
        }))

        if count % batch_size == 0:
            _flush(data, count)
            data = []

    # Remaining pages that did not fill a complete batch.
    if data:
        _flush(data, count)

    return requests_path

In [2]:
from concurrent.futures import ThreadPoolExecutor

def wrapper(r, b, l, cat, cont):
    """Positional-argument adapter around query() for use with Executor.submit."""
    return query(r, batch_size=b, limit=l, is_category=cat, preload_content=cont)


batch_size = 100
limit = 500
preload_content = True
with ThreadPoolExecutor(max_workers=2) as e:
    futures = [
        e.submit(wrapper, 'peano', batch_size, limit, False, preload_content),
        e.submit(wrapper, 'Mathematics', batch_size, limit, True, preload_content),
    ]

# Future.result() re-raises any exception raised inside the worker; without
# this, a failed download would go unnoticed.
for future in futures:
    future.result()

Dumped 100 pages | Mathematics
Dumped 100 pages | peano
Dumped 200 pages | Mathematics
Dumped 300 pages | Mathematics
Dumped 400 pages | Mathematics
Dumped 500 pages | Mathematics
Dumped 200 pages | peano
Dumped 300 pages | peano
Dumped 400 pages | peano
Dumped 491 pages | peano


In [8]:
%%time

# Run the notebook-local query() on the 'peano' search with no page limit.
# force_rewrite is left at its default (True in the definition above), so the
# existing files of this request are removed first.
query('peano', batch_size=100, limit=None, is_category=False, preload_content=True)

Cleaning old data
Dumped 100 pages | peano
Dumped 200 pages | peano
Dumped 300 pages | peano
Dumped 400 pages | peano
Dumped 491 pages | peano
CPU times: user 13.9 s, sys: 141 ms, total: 14.1 s
Wall time: 25.1 s


PosixPath('../requests/peano')

In [8]:
import json

# Scratch check: how json.dumps renders a flat record with integer values.
json.dumps({"title": 23232, "url": 32, "text": 132})

'{"title": 23232, "url": 32, "text": 132}'