# 00 – Getting the data

“Take N classes of wikipedia articles, for each class – 1000 articles.”

In [6]:
import wikipedia as wp
import pandas as pd

from pathlib import Path
from random import sample

import helpers # some general functions for loading, writting and plotting stuff

In [8]:
# Maximum number of articles kept per class; it is also the number of results
# requested from each search query (default `size` of the functions below).
DATA_SIZE = 1000

## Fetching the pages

In this part we fetch the pages from Wikipedia with their text and title.

In [5]:
# We did not want to make this too complicated: just download the pages that respond to a query and some synonyms.
# Each entry is (category label, [search queries used to collect the pages of that category]).
queries = [
    ('Math', ['Math', 'Algebra', 'Arithmetic', 'Mathematics']),
    ('Politics', ['Politics', 'Government', 'Legislation', 'Polity']),
    ('Oceania', ['Oceania', 'New Zealand', 'Australia', 'Oceanic']),
    ('Linguistics', ['Linguistics', 'Language', 'Grammar', 'Syntax']),
    ('Prehistory', ['Prehistory', 'Paleology', 'Paleontology']),
    ('Religion', ['Religion', 'Christianism', 'Islamism']),
    ('Economics', ['Economics', 'Finance', 'Business']),
    ('Literature', ['Literature', 'Poetry', 'Novel']),
]

In [9]:
def robust_fetch(page):
    '''Download a page, returning None when it cannot be fetched.

    Args:
        page: Title of the page, passed as-is to `wp.page`.

    Returns:
        Whatever `wp.page` returns, or None if it raised an exception.
    '''
    try:
        return wp.page(page)
    except Exception:
        # Sometimes wp.page() can raise an exception on disambiguation pages or others.
        # We do not care about them. Catching `Exception` (instead of a bare `except`)
        # lets KeyboardInterrupt / SystemExit through, so the hours-long download
        # can still be stopped by hand.
        return None

def query_to_pages(query, size=DATA_SIZE):
    '''Search for `query` and return the resulting page titles as a set.

    `size` is forwarded to `wp.search` as the number of results to request.
    '''
    return set(wp.search(query, results=size))

def multiquery_to_pages(queries, size=DATA_SIZE):
    '''Merge the page titles of several queries into a single set.

    Queries are run in order; once at least `size` titles have been gathered,
    the remaining queries are not sent. The result may hold more than `size` titles.
    '''
    titles = set()
    for query in queries:
        if len(titles) >= size:
            # Enough titles already: skip this query without searching.
            continue
        titles.update(query_to_pages(query, size))
    return titles

def dataset_for_query(queries, size=DATA_SIZE):
    '''Given the pages returned by the queries, fetches the pages.

    Args:
        queries: Search queries whose results are merged into one list of titles.
        size: Maximum number of articles to return.

    Returns:
        A list of (title, content) tuples. Pages that could not be downloaded
        are left out, so the list may be shorter than `size`; when at least
        `size` pages were fetched, a random sample of exactly `size` is returned.
    '''
    titles = multiquery_to_pages(queries, size)

    articles = []
    for title in titles:
        page = robust_fetch(title)
        if page is None:
            # robust_fetch signals a failed download with None: skip explicitly
            # rather than relying on an AttributeError being swallowed.
            continue
        try:
            articles.append((page.title, page.content))
        except Exception:
            # Reading the attributes can still fail; such pages are skipped too.
            # NOTE(review): `content` appears to be loaded lazily by the wikipedia
            # package -- confirm. `Exception` (not a bare `except`) keeps Ctrl-C working.
            continue
    return articles if len(articles) < size else sample(articles, size)
        
def datasets_for_queries(queries, size=DATA_SIZE):
    '''Build one dataset per category from the queries associated with it.

    `queries` is an iterable of (category name, list of queries) pairs; the result
    is a list of (category name, dataset) pairs in the same order.
    '''
    datasets = []
    for category, category_queries in queries:
        datasets.append((category, dataset_for_query(category_queries, size)))
    return datasets

In [11]:
%%time
# Pretty slow process (the recorded run below took 5h 43min of wall time), just let it run overnight.
# Fetches the pages of every category listed in `queries`.
datasets = datasets_for_queries(queries)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Wall time: 5h 43min 15s


## Preparing dataframe

We want to make the dataset clean and ready to be used by our auto-encoder.

In [12]:
# Flatten the per-category lists of (title, content) pairs into one record per
# article, tagging each record with the name of its category.
records = [
    (title, content, label)
    for label, articles in datasets
    for title, content in articles
]

# Build the frame in one step with explicit column names: this avoids growing a
# DataFrame with pd.concat inside a loop, yields a clean 0..n-1 index directly,
# and still produces the three expected columns when nothing was fetched.
full_dataset = pd.DataFrame(records, columns=['title', 'content', 'label'])

In [16]:
# Eyeball four random rows to check that titles, contents and labels line up.
full_dataset.sample(4)

Unnamed: 0,title,content,label
2967,Oceania Women's Handball Champions Cup,The Oceania Handball Champions Cup is an inter...,Oceania
1265,Political party,A political party is a group of people who com...,Politics
4599,Koparion,Koparion is a genus of small coelurosaurian th...,Prehistory
591,MPIR (mathematics software),Multiple Precision Integers and Rationals (MPI...,Math


In [4]:
# Save the data in a pickle file for future usage.
# NOTE(review): the file location and extension are decided inside helpers.write_pickle -- confirm there.
helpers.write_pickle(full_dataset, 'full_dataset')