In [0]:
import functools
import json
import string
import threading
import time
from urllib.parse import urljoin

import bs4
import requests

In [0]:
def threadify(func):
    """Decorator that makes every call to ``func`` run on its own thread.

    The wrapper forwards its positional and keyword arguments to ``func``,
    starts the thread right away and returns the ``threading.Thread``
    object, so callers can ``join()`` it. Whatever ``func`` returns is
    discarded.
    """

    @functools.wraps(func)
    def launcher(*args, **kwds):
        worker = threading.Thread(target=func, args=args, kwargs=kwds)
        worker.start()
        return worker

    return launcher

In [0]:
def get_names(url, timeout=30):
    """Download one listing page and extract its entries.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it a stalled connection would block
            the calling thread indefinitely.

    Returns:
        A ``(data, next_url)`` tuple. ``data`` is a list of
        ``(link text, href)`` pairs, one for each anchor carrying an
        ``href`` inside an element of class ``mw-category-group``.
        ``next_url`` is the ``href`` of the anchor whose text is exactly
        "next page", or ``None`` if the page has no such anchor.
        When the response status is not 200, the response body is printed
        and ``([], url)`` is returned, i.e. the same URL that was passed
        in, so the caller can try it again.
    """
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        print("Error:", resp.text)
        return [], url # try again!
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    groups = soup.find_all(class_="mw-category-group")
    # href=True skips anchors without an href, which would otherwise raise
    # KeyError on the subscript below.
    data = [
        (anchor.text, anchor["href"])
        for group in groups
        for anchor in group.find_all("a", href=True)
    ]
    # `string=` is the current spelling of the deprecated `text=` filter.
    next_link = soup.find("a", href=True, string="next page")
    return data, next_link['href'] if next_link else None

In [0]:
@threadify
def collect(url, collector, max_retries=3):
    """Follow a chain of listing pages, adding their entries to ``collector``.

    Because of ``@threadify`` each call runs on its own thread and returns
    the thread object immediately.

    Args:
        url: Starting location, either a site-relative href such as
            ``/wiki/...`` or an absolute URL.
        collector: Set that receives the ``(link text, href)`` pairs
            returned by ``get_names``.
        max_retries: How many times in a row a failed request for the same
            page is retried before this worker gives up. Retries are
            immediate; there is no pause between them.

    The walk stops when there is no next page, or when the first entry of a
    page is already in ``collector`` (the range was covered before).
    """
    base = 'https://en.wikipedia.org'
    failures = 0
    try:
        while url:
            # urljoin copes with both site-relative hrefs and the absolute
            # URL that get_names hands back after a failed request; plain
            # concatenation would prefix the host twice in the latter case.
            target = urljoin(base, url)
            out, url = get_names(target)
            if not out:
                if url != target:
                    # Page without entries: follow its next link (the loop
                    # ends on its own when that link is None).
                    continue
                # get_names echoed the requested URL back: the request failed.
                failures += 1
                if failures > max_retries:
                    print("error when collecting:", url)
                    return
                continue
            failures = 0
            if out[0] in collector:
                break
            collector.update(out)
        print("breakpoint reached!")
    except Exception as exc:
        # Report the cause instead of hiding it; this worker stops here.
        print("error when collecting:", url, repr(exc))

In [0]:
# Crawl entry points: the category's main page plus one "?from=<letter>"
# page for each uppercase ASCII letter.
LIVING_PEOPLE_MAIN_PAGE = "/wiki/Category:Living_people"
results = set()  # filled by the collector threads
pages = [LIVING_PEOPLE_MAIN_PAGE] + [
    f"/wiki/Category:Living_people?from={letter}"
    for letter in string.ascii_uppercase
]

In [0]:
# Start one collector per entry point; each call returns at once with the
# thread object that is doing the work.
threads = [collect(start, results) for start in pages]

In [0]:
# Report progress until every collector thread has finished, so later cells
# do not run against a half-filled `results`. Interrupt the kernel to stop
# waiting early.
POLL_SECONDS = 0.5
i = 0
while any(t.is_alive() for t in threads):
    print("\r", i, len(results), end=" ")
    time.sleep(POLL_SECONDS)
    i += 1
print("\r", i, len(results), end=" ")  # final count once all threads are done

In [0]:
# Write the collected (link text, href) pairs to disk in sorted order.
ordered_names = sorted(results)
with open("names.json", "w") as names_file:
    json.dump(ordered_names, names_file)

In [0]:
import os