## Signpost Article Views

I've wanted to run some basic statistics on *Wikipedia Signpost* article views for a while now, to figure out what people like or don't like reading about. Now I'm finally getting to sit down and do it...

In [147]:
from pageviews import PageviewsClient
import arrow
import datetime
import urllib
from pandas import DataFrame
import pandas as pd
import mwapi


def viewcounts(article_name, start=None, end=None):
    """
    Fetches the daily viewcounts of an English Wikipedia page.

    article_name: page title; spaces are converted to underscores before the lookup.
    start, end: bounds of the requested range, handed unchanged to
        PageviewsClient.article_views (callers in this notebook pass
        '%Y%m%d%H'-formatted strings).

    Returns a list of the per-entry values reported by the client for this
    article, ordered by ascending result key (presumably one key per day --
    verify against pageviews.py).
    """
    # `import urllib` alone does not guarantee that the `urllib.parse`
    # submodule is loaded, so import the needed function explicitly.
    from urllib.parse import quote

    article_name = article_name.replace(' ', '_')
    # safe='' makes quote() encode '/' as '%2F' as well.
    parsed_article_name = quote(article_name, safe='')
    p = PageviewsClient().article_views("en.wikipedia",
                                        [parsed_article_name],
                                        access="all-access",
                                        granularity="daily",
                                        start=start,
                                        end=end)
    # The result is indexed by the underscored (unquoted) title.
    return [p[key][article_name] for key in sorted(p)]

def article_viewcounts(article_name):
    """
    Returns the list of viewcounts of a Signpost article over its publication window.

    The window opens on the issue date embedded in the title (the segment right after
    the first "/") and closes 14 days later. The Signpost is usually published late,
    hence the generous 14 day news "cycle"; views are low before publication and after
    the next issue comes out, so the extra days have little effect.
    """
    issue_date = arrow.get(article_name.split("/")[1])
    window_close = issue_date + datetime.timedelta(days=14)
    # Both bounds are passed on as '%Y%m%d%H'-formatted strings.
    return viewcounts(
        article_name,
        start=issue_date.strftime('%Y%m%d%H'),
        end=window_close.strftime('%Y%m%d%H'),
    )

def total_viewcount(article_name):
    """
    Returns the total viewcount of the article over its publication window.

    The window is the one used by article_viewcounts (starting at the issue date
    and ending 14 days later), not 60 days.
    """
    return sum(article_viewcounts(article_name))

def average_daily_viewcount(article_name):
    """
    Returns the mean of the values returned by article_viewcounts for the article.

    Raises ZeroDivisionError if article_viewcounts returns an empty list.
    """
    daily = article_viewcounts(article_name)
    total = sum(daily)
    return total / len(daily)

def get_all_articles(prefix):
    """
    Returns a list of the titles of all of the Signpost articles listed from a certain title onward.

    prefix: title (without namespace) at which the listing starts, e.g.
        "Wikipedia Signpost/2015-10-07/Op-ed". 2015-10-07 is used for now, the earliest
        published Signpost story for which data is available (yet).

    The listing stops at "Wikipedia Signpost/A". No filtering by year happens here;
    callers that want to drop too-recent titles must do so themselves.
    """
    session = mwapi.Session('https://en.wikipedia.org', user_agent='signpostviews Jupyter notebook')
    # continuation=True makes mwapi follow the API's "continue" markers and yield one
    # response per batch, so listings longer than aplimit are no longer cut off.
    responses = session.get(action='query',
                            list='allpages',
                            apfrom=prefix,
                            apto='Wikipedia Signpost/A',
                            apprefix='Wikipedia Signpost',
                            apnamespace=4,
                            aplimit=500,
                            formatversion=2,
                            continuation=True)
    result = []
    for response in responses:
        # Requiring at least two slashes filters out results such as
        # Wikipedia:Wikipedia Signpost/2015-07-18.
        result.extend(r['title'] for r in response['query']['allpages']
                      if r['title'].count("/") >= 2)
    return result

def tabulate(articles):
    """
    Fetches the publication-window viewcounts for each of the given articles.

    articles: iterable of Signpost article titles.

    Returns a dict mapping each title to the list returned by article_viewcounts
    for it. (Previously the dict was built but never returned, so callers always
    received None.)
    """
    return {article: article_viewcounts(article) for article in articles}

In [146]:
# Preview the article titles, leaving out any title that contains '2016'.
[
    title
    for title in get_all_articles("Wikipedia Signpost/2015-10-07/Op-ed")
    if not ('2016' in title)
]

['Wikipedia:Wikipedia Signpost/2015-10-07/Op-ed',
 'Wikipedia:Wikipedia Signpost/2015-10-07/Technology report',
 'Wikipedia:Wikipedia Signpost/2015-10-07/Traffic report',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Blog',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Editorial',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Featured content',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Gallery',
 'Wikipedia:Wikipedia Signpost/2015-10-14/News and notes',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Op-ed',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Technology report',
 'Wikipedia:Wikipedia Signpost/2015-10-14/Traffic report',
 'Wikipedia:Wikipedia Signpost/2015-10-14/WikiConference Report',
 'Wikipedia:Wikipedia Signpost/2015-10-14/WikiConference report',
 'Wikipedia:Wikipedia Signpost/2015-10-21/Arbitration report',
 'Wikipedia:Wikipedia Signpost/2015-10-21/Editorial',
 'Wikipedia:Wikipedia Signpost/2015-10-21/Featured content',
 'Wikipedia:Wikipedia Signpost/2015-10-21/In the media',
 'Wikipedia:Wiki

In [148]:
# Titles containing '2016' are left out, then views are fetched for the rest.
targets = [
    title
    for title in get_all_articles("Wikipedia Signpost/2015-10-07/Op-ed")
    if not ('2016' in title)
]
all_views = tabulate(targets)

ERROR while fetching and parsing ['https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Wikipedia%3AWikipedia_Signpost%2F2015-12-09%2FOp-ed/daily/2015120900/2015122300']


Traceback (most recent call last):
  File "C:\Users\Alex\Desktop\signpostviews\pageviews.py", line 108, in article_views
    results = self.get_concurrent(urls)
  File "C:\Users\Alex\Desktop\signpostviews\pageviews.py", line 280, in get_concurrent
    return list(executor.map(f, urls))
  File "C:\Users\Alex\Anaconda3\lib\concurrent\futures\_base.py", line 549, in result_iterator
    yield future.result()
  File "C:\Users\Alex\Anaconda3\lib\concurrent\futures\_base.py", line 397, in result
    self._condition.wait(timeout)
  File "C:\Users\Alex\Anaconda3\lib\threading.py", line 290, in wait
    waiter.acquire()
KeyboardInterrupt


KeyboardInterrupt: 