# Download and parse data from Wikipedia

This notebook downloads and parses the data from Wikipedia. The output will be stored as JSONL files, one article per line.

You can download individual Wikipedias or all Wikipedias as specified in the Poio Corpus configuration file.

In [14]:
import pathlib

# Project root: the parent of the current working directory, as an absolute path.
ROOT_DIR = pathlib.Path.cwd().parent
ROOT_DIR

PosixPath('/home/pbouda/Projects/git-github/poio-corpus')

## Load language list

In [15]:
import json

# ROOT_DIR is a pathlib.Path, so the config path is built with the "/" operator.
# The previous os.path.join(...) call needed `os`, which is only imported in a
# later cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
with open(ROOT_DIR / "config.json", "r", encoding="utf-8") as f:
    # Mapping used below as language_map[iso_639_3] -> Wikipedia language prefix.
    language_map = json.load(f)["LanguagesISOMap"]

In [3]:
# Process every language listed in the configuration (the keys of language_map).
languages = language_map.keys()

Alternatively, set the language list manually for testing purposes. Running the following cell overrides the list loaded from the configuration above:

In [2]:
# Manual override for testing; each entry is looked up as a key in language_map later on.
languages = ["bar"]

## Process languages

First, we define some helper functions.

### Get dump link

In [3]:
import urllib
import re

import requests
from bs4 import BeautifulSoup

def dump_link_from_lang_page(wiki_name, page):
    """Find the "pages-articles" dump link on a wiki's dump status page.

    Downloads ``page`` and scans its anchors for one whose text starts with
    ``<wiki_name>-<8 digits>-pages-articles.xml.bz2``.

    Args:
        wiki_name: Wiki prefix used in the dump file name (e.g. ``"barwiki"``).
        page: URL of the HTML page to scan.

    Returns:
        A tuple ``(wiki_date, dump_link)`` where ``wiki_date`` is the 8-digit
        stamp captured from the file name and ``dump_link`` is the absolute
        URL of the dump, or ``(None, None)`` if no matching anchor exists.
    """
    html_page = requests.get(page)
    soup = BeautifulSoup(html_page.content, features="html.parser")

    # Raw string + escaping: the dots must match literally and wiki_name must
    # not be interpreted as regex syntax.
    pattern = re.compile(
        re.escape(wiki_name) + r"-(\d{8})-pages-articles\.xml\.bz2")

    for anchor in soup('a'):
        text = anchor.string
        href = anchor.get('href')
        # .string is None for anchors with nested markup or no text; re.match
        # would raise TypeError on it. Anchors without href cannot be followed.
        if text is None or href is None:
            continue
        match = pattern.match(text)
        if match:
            return match.group(1), urllib.parse.urljoin(page, href)
    return None, None

def get_dump_link(iso_639_1):
    """Look up the articles dump for one Wikipedia via the dump index page.

    Fetches the Wikimedia backup index, finds the anchor whose text equals
    ``<iso_639_1>wiki`` and hands the linked page to
    ``dump_link_from_lang_page``.

    Args:
        iso_639_1: Language prefix of the Wikipedia (combined with ``"wiki"``
            to form the name searched for in the index).

    Returns:
        The ``(wiki_date, dump_link)`` tuple from ``dump_link_from_lang_page``,
        or ``(None, None)`` if the index has no entry for this wiki.
    """
    url = "https://dumps.wikimedia.org/backup-index.html"
    wiki_prefix = iso_639_1 + "wiki"
    html_page = requests.get(url)
    # Name the parser explicitly (as dump_link_from_lang_page does) so the
    # result does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(html_page.content, features="html.parser")

    page = None
    for link in soup('a'):
        if link.string == wiki_prefix:
            page = urllib.parse.urljoin(url, link['href'])

    if page is None:
        # No entry for this wiki in the index; without this guard the next
        # call would end up in requests.get(None) and raise.
        return None, None

    # get the link for the dump file
    return dump_link_from_lang_page(wiki_prefix, page)

### Download dump

In [20]:
import os

def download_dump(dump_link, wiki_name, new_wiki_name):
    """Download a dump file into ``build/corpus/<new_wiki_name>`` under ROOT_DIR.

    The download is skipped when the target file already exists.

    Args:
        dump_link: URL of the dump; its last path segment is the file name.
        wiki_name: Unused; kept so existing callers keep working.
        new_wiki_name: Name of the sub-directory that receives the file.

    Returns:
        Path of the (existing or freshly downloaded) dump file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    file_name = dump_link.split('/')[-1]
    download_path = os.path.join(ROOT_DIR, "build", "corpus", new_wiki_name)
    os.makedirs(download_path, exist_ok=True)
    file_path = os.path.join(download_path, file_name)
    if not os.path.exists(file_path):
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated file that later runs would treat as complete.
        partial_path = file_path + ".part"
        # Stream the body in chunks instead of holding the whole dump in memory.
        with requests.get(dump_link, stream=True) as r:
            # Do not save an HTML error page under the dump's file name.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    return file_path

### Extract data with WikiExtractor

In [21]:
import sys
import subprocess

def wikipedia_extractor(file_path, new_wiki_name):
    """Run WikiExtractor.py on a dump file and return its captured output.

    The script is started with the current Python interpreter and writes JSON
    output in files of up to 100M to
    ``build/corpus/<new_wiki_name>/extracted`` under ROOT_DIR.

    Args:
        file_path: Path of the dump file passed to WikiExtractor.py.
        new_wiki_name: Name of the corpus sub-directory for the output.

    Returns:
        A tuple ``(out, err)`` with the bytes the process wrote to stdout and
        stderr.
    """
    # NOTE(review): "WikiExtractor.py" is resolved relative to the current
    # working directory, and the exit code is not checked; confirm both are
    # intended.
    proc = subprocess.Popen(
        [
            sys.executable,
            "WikiExtractor.py",
            file_path,
            "--json",
            "-q",
            "-b", "100M",
            "-o", os.path.join(ROOT_DIR, "build", "corpus", new_wiki_name, "extracted")
        ],
        stdout=subprocess.PIPE,
        # Without piping stderr, communicate() always returned None for it, so
        # the `err` half of the result never carried any information.
        stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    return (out, err)

In [22]:
# Download and extract the Wikipedia dump for every selected language.
for iso_639_3 in languages:
    print("Processing {}...".format(iso_639_3))
    # The config maps the ISO 639-3 code to the prefix used by Wikipedia.
    iso_639_1 = language_map[iso_639_3]
    wiki_date, dump_link = get_dump_link(iso_639_1)
    if dump_link is None:
        # get_dump_link returns (None, None) when no articles dump is listed;
        # download_dump would otherwise fail on None.split(...).
        print("  no dump found, skipping.")
        continue
    in_wiki_prefix = iso_639_1 + "wiki"
    # Output directories are named after the ISO 639-3 code.
    out_wiki_prefix = iso_639_3 + "wiki"
    print("  downloading...")
    file_path = download_dump(dump_link, in_wiki_prefix, out_wiki_prefix)
    print("  extracting...")
    wikipedia_extractor(file_path, out_wiki_prefix)

Processing bar...
  downloading...
  extracting...
