In [37]:
import functools
import json
import queue
import string
import threading
import time
import urllib.parse

import bs4
import requests


def threadify(func):
    """Decorator that makes every call to ``func`` execute in a new thread.

    Calling the decorated function does not wait for ``func`` to finish and
    does not return ``func``'s result.  Instead it returns the already
    started ``threading.Thread`` so the caller can ``join()`` it or poll
    ``is_alive()``.
    """

    @functools.wraps(func)
    def launch(*args, **kwds):
        # Positional and keyword arguments are forwarded to ``func`` unchanged.
        worker = threading.Thread(target=func, args=args, kwargs=kwds)
        worker.start()
        return worker

    return launch


In [38]:
def get_names(url):
    """Fetch one category page and split its links into articles and categories.

    Parameters
    ----------
    url : str
        URL of the category page to download.

    Returns
    -------
    (leaf, node) : tuple of two lists
        ``leaf`` holds ``(link text, href)`` pairs for every link inside an
        ``mw-category-group`` element whose href does not contain
        ``"category:"`` (case-insensitive).
        ``node`` holds hrefs that still have to be crawled: the "next page"
        link when present, followed by the category links whose href does
        not contain ``"_stubs"``.
        When the response status is not 200, ``leaf`` is empty and ``node``
        contains exactly one entry: ``url`` with scheme and host removed, so
        the caller can retry it the same way as any other href.
    """
    resp = requests.get(url)
    if resp.status_code != 200:
        print("Error:", resp.text)
        # Hand back the site-relative form of the URL.  Every other entry in
        # ``node`` is a relative href, and a caller that prepends the site
        # root to an absolute URL would build an invalid address.
        parts = urllib.parse.urlsplit(url)
        retry = urllib.parse.urlunsplit(
            ("", "", parts.path, parts.query, parts.fragment))
        return [], [retry]  # try again!

    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    groups = soup.find_all(class_="mw-category-group")
    next_url = soup.find("a", text="next page")

    leaf = []
    node = []
    if next_url is not None and next_url.get("href"):
        node.append(next_url["href"])

    for group in groups:
        # href=True skips anchors without an href instead of raising KeyError.
        for anchor in group.find_all("a", href=True):
            link = anchor["href"]
            if "category:" in link.lower():
                # Stub categories are neither crawled nor reported.
                if "_stubs" not in link:
                    node.append(link)
            else:
                leaf.append((anchor.text, link))
    return leaf, node


def get_visit_count(links, results=None, workers=10, max_errors=30,
                    retry_delay=2, timeout=30):
    """Look up 2018 page-view totals for ``links`` using background threads.

    Parameters
    ----------
    links : iterable of str
        Article hrefs such as ``"/wiki/Some_Title"``.
    results : dict, optional
        Mapping that receives ``link -> total views``.  Links already present
        are skipped, so a partially filled dict can be passed back in to
        resume.  A new dict is created when omitted.
    workers : int
        Number of worker threads to start (default 10).
    max_errors : int
        A link is abandoned once it has failed more than this many times
        (default 30).  Abandoned links get no entry in ``results``.
    retry_delay : float
        Seconds to sleep after a failed attempt (default 2).
    timeout : float
        Per-request timeout in seconds handed to ``requests.get``.

    Returns
    -------
    (results, threads) : tuple
        Returned immediately after the workers are started.  ``results`` is
        filled in by the workers while they run; ``threads`` are the started
        worker threads.  Each worker exits once the queue is empty, so
        joining them waits for completion.
    """

    def get_view_counts(link):
        """Return the summed monthly views for one link, or None if no data."""
        prefix = "/wiki/"
        name = link[len(prefix):] if link.startswith(prefix) else link
        # The title is a single path segment of the API URL, so a "/" inside
        # the title has to be escaped or the route no longer matches.
        name = name.replace("/", "%2F")
        end = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        url = f"en.wikipedia.org/all-access/all-agents/{name}/monthly/20180101/20181231"
        view = json.loads(requests.get(end + url, timeout=timeout).text)
        if "items" in view:
            return sum(i["views"] for i in view["items"])
        return None

    @threadify
    def loader(q, results):
        """Drain ``q``, storing one result per link, then return."""
        while True:
            try:
                # Non-blocking: a Queue object is always truthy and a blocking
                # get() on an empty queue never returns, so this is the only
                # way for the worker to notice that the work is done.
                link = q.get_nowait()
            except queue.Empty:
                return
            if link in results:
                continue
            error_count = 0
            while error_count <= max_errors:
                try:
                    results[link] = get_view_counts(link)
                except Exception as e:
                    # Exception, not BaseException, so SystemExit and
                    # KeyboardInterrupt are not swallowed by the retry loop.
                    print("Error: retrying", link, e)
                    error_count += 1
                    time.sleep(retry_delay)
                else:
                    break
            else:
                print("Error: giving up on", link)

    if results is None:
        results = {}
    q = queue.Queue()
    for link in sorted(links):
        q.put(link)
    threads = [loader(q, results) for _ in range(workers)]
    return results, threads


In [39]:
def load_links(start="/wiki/Category:Births_by_century",
               endpoint="https://en.wikipedia.org",
               max_retries=10,
               retry_delay=1):
    """Crawl a category tree depth-first and collect every article link.

    Parameters
    ----------
    start : str
        Site-relative href of the root category.
    endpoint : str
        Site root that is prepended to every href before fetching.
    max_retries : int
        How many times a page whose fetch failed is re-queued before it is
        abandoned.
    retry_delay : float
        Seconds to sleep before a failed page is re-queued.

    Returns
    -------
    list
        The ``leaf`` entries (``(name, link)`` pairs) returned by
        ``get_names`` for every page visited, in visiting order.
    """
    pending = [start]   # used as a stack: the most recently found page is next
    data = []
    seen = set()
    failures = {}       # link -> number of failed fetches so far
    counter = 0
    while pending:
        link = pending.pop()
        if link in seen:
            continue
        url = endpoint + link
        leaf, node = get_names(url)
        counter += 1

        # get_names reports a failed fetch as "no leaves, one node that is the
        # page itself".  Accept both the absolute and the site-relative
        # spelling and always re-queue the relative link, so the endpoint is
        # never prepended to an already absolute URL.
        failed = not leaf and len(node) == 1 and node[0] in (url, link)
        if failed:
            failures[link] = failures.get(link, 0) + 1
            if failures[link] > max_retries:
                # Stop retrying a page that keeps failing (e.g. a dead link).
                seen.add(link)
                print("\nError: giving up on", link)
            else:
                time.sleep(retry_delay)
                pending.append(link)
        else:
            seen.add(link)
            pending.extend(node)
            data.extend(leaf)

        print("\r", counter, len(data), link, end=" "*150)
    return data

In [41]:
# Crawl the Wikipedia category tree with load_links(); `data` becomes a list of
# (name, link) pairs.  This is a long network-bound step: the recorded output
# below shows 8905 pages fetched and 1405759 entries collected.
data = load_links()

 8905 1405759 /wiki/Category:0s_BC_births                                                                                                                                                                                                                                                                                                                                                                                                                     

In [43]:
# Save the crawled (name, link) pairs to names.json in the working directory.
with open("names.json", "w") as f:
    f.write(json.dumps(data))

In [44]:
# Keep only the href from each (name, href) pair; the name is not needed here.
links = [href for _name, href in data]

In [45]:
# Start the background workers.  get_visit_count returns straight away with the
# dict the workers fill in (`result`) and the list of started threads.
result, threads = get_visit_count(links)

In [None]:

# Show how many links have a result so far, refreshing every half second.
# Polling lasts at most 500 rounds (about 250 s) and ends early once no worker
# thread is still running.
for _ in range(500):
    time.sleep(0.5)
    print("\r", len(result), end="")
    if not any(t.is_alive() for t in threads):
        break

 3432Error: retrying /wiki/%C3%89tienne_de_Veniard,_Sieur_de_Bourgmont HTTPSConnectionPool(host='wikimedia.org', port=443): Max retries exceeded with url: /api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/%C3%89tienne_de_Veniard,_Sieur_de_Bourgmont/monthly/20180101/20181231 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11c095128>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
 5535Error: retrying /wiki/%C5%8Cta_Masahiro HTTPSConnectionPool(host='wikimedia.org', port=443): Max retries exceeded with url: /api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/%C5%8Cta_Masahiro/monthly/20180101/20181231 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x111fa84a8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
 6663Error: retrying /wiki/A._Arthur_Guilbert HT