In [1]:
import requests
from xml.etree import ElementTree
import time
import re
from collections import defaultdict
from pathlib import Path
import os

In [2]:
def extract_see_also(page_text):
    try:
        lines         = page_text.splitlines()
        see_index     = lines.index('==See also==')
        ref_index     = lines.index('==References==')
        raw_titles    = lines[see_index+2:ref_index-1]
        regex         = '.*\[\[(.*)\]\]'
        parsed_titles = []
        for title in raw_titles:
            re_result = re.search(regex, title)
            if re_result:
                parsed_titles.append(re_result.group(1))
            
        return parsed_titles
    except:
        return []

In [3]:
def generate_page_name_from_title(title):
    return '_'.join(title.split())

In [4]:
def get_wikipedia_page(page_title, delay = 3):
    api_url           = f'https://en.wikipedia.org/wiki/Special:Export/{page_title}'
    req               = requests.get(api_url)
    time.sleep(5)
    page_text         = req.text
    if page_text == '':
        print(page_title, api_url)
    xml_root          = ElementTree.fromstring(page_text)s8 
    page_content      = xml_root\
            .find('{http://www.mediawiki.org/xml/export-0.10/}page')\
            .find('{http://www.mediawiki.org/xml/export-0.10/}revision')\
            .find('{http://www.mediawiki.org/xml/export-0.10/}text')
    page_content_text = page_content.text
    see_also_titles   = extract_see_also(page_content_text)
    see_also_links    = [generate_page_name_from_title(title) for title in see_also_titles] 
    page_dict         = {
        'title'   : page_title,
        'content' : page_content_text,
        'see_also': see_also_links
    }
    
    return page_dict

In [5]:
def mine_graph(entry_points, n = 10):
    queues     = [[point] for point in entry_points]
    downloaded = set()
    i          = 0
    documents  = defaultdict(list)
    
    while len(downloaded) < n:
        print(100 * len(downloaded) / n, '%')
        queue       = queues[i % len(queues)]
        i          += 1
        if not queue:
            continue
        page_title, category  = queue.pop(0)
        if page_title in downloaded:
            print('Already downloaded')
            continue
        downloaded.add(page_title)
        page_dict = get_wikipedia_page(page_title)
        documents[category].append(page_dict)
        new_queue_elems = [(title, category) for title in page_dict['see_also']]
        queue.extend(new_queue_elems)
        if not any(queue for queue in queues):
            print('all queues are empty, exiting.')
            break

    return documents

In [6]:
def save_documents(documents, data_folder = Path('../data/')):
    for category in documents:
        os.makedirs(data_folder / category, exist_ok = True)
        for page in documents[category]:
            title   = page['title']
            content = page['content']
            with open(data_folder / category / title, 'w') as page_file:
                page_file.write(content)

In [7]:
entry_points = [
    ('French_Revolution'                       , 'history'),
    ('Aleppo_offensive_(October–December_2013)', 'history'),
    ('World_War_II'                            , 'history'),
    ('Algebraic_graph_theory'                  , 'math'),
    ('Machine_learning'                        , 'math'),
    ('Game_theory'                             , 'math'),
    ('Astronomy'                               , 'space'),
    ('Universe'                                , 'space'),
    ('Pluto'                                   , 'space'),
    ('Linguistics'                             , 'language'),
    ('Translation'                             , 'language'),
    ('Toki_Pona'                               , 'language'),
    ('Napster'                                 , 'tech'),
    ('Freenet'                                 , 'tech'),
    ('Neuralink'                               , 'tech'),
    ('For_the_World'                           , 'music'),
    ('Pixies'                                  , 'music'),
    ('Jazz'                                    , 'music'),
]

In [8]:
documents = mine_graph(entry_points, 2000)

0.0 %
0.05 %
0.1 %
0.15 %
0.2 %
0.25 %
0.3 %
0.35 %
0.4 %
0.45 %
0.5 %
0.55 %
0.6 %
0.65 %
0.7 %
0.75 %
0.8 %
0.85 %
0.9 %
0.95 %
0.95 %
0.95 %
1.0 %
1.0 %
1.0 %
1.05 %
1.05 %
1.05 %
1.1 %
1.15 %
1.2 %
1.2 %
1.25 %
1.25 %


AttributeError: 'NoneType' object has no attribute 'find'

In [None]:
save_documents(documents)