## Signpost Article Views

I've wanted to run some basic statistics on *Wikipedia Signpost* article views for a while now, to figure out what people like or don't like reading about. Now I'm finally getting to sit down and do it...

In [5]:
from pageviews import PageviewsClient
import arrow
import datetime
import urllib
from pandas import DataFrame
import pandas as pd
import mwapi


def viewcounts(article_name, start=None, end=None):
    """
    Fetches the daily viewcounts of an English Wikipedia page.

    Spaces in `article_name` are replaced by underscores, and the title is
    percent-encoded (including "/") before being handed to the pageviews client.
    `start` and `end` are passed through to the client unchanged.

    Returns a list of the per-day values for the article, ordered by the sorted
    keys of the client's result (presumably timestamps -- verify against the client).
    NOTE(review): an entry may be None when the API has no data for that day -- confirm.
    """
    # A bare `import urllib` does not guarantee the `parse` submodule is loaded,
    # so import it explicitly where it is used.
    import urllib.parse

    article_name = article_name.replace(' ', '_')
    parsed_article_name = urllib.parse.quote(article_name).replace('/', '%2F')
    views_by_day = PageviewsClient().article_views("en.wikipedia",
                                                   [parsed_article_name],
                                                   access="all-access",
                                                   granularity="daily",
                                                   start=start,
                                                   end=end)
    # The result is keyed by day, then by the (unencoded, underscored) article name.
    return [views_by_day[day][article_name] for day in sorted(views_by_day)]

def article_viewcounts(article_name):
    """
    Fetches the list of daily viewcounts of a Signpost article over its publication window.

    The publication date is read from the second "/"-separated segment of the title,
    and the window runs from that date to 14 days after it.
    The Signpost is usually published late, hence the generous 14 day news "cycle".
    In reality views are low before publication and after the next issue comes out,
    so the exact window size doesn't have much effect.
    """
    stamp_format = '%Y%m%d%H'
    issue_date = article_name.split("/")[1]
    window_start = arrow.get(issue_date)
    window_end = window_start + datetime.timedelta(days=14)
    return viewcounts(article_name,
                      start=window_start.strftime(stamp_format),
                      end=window_end.strftime(stamp_format))

def total_viewcount(article_name):
    """
    Returns the total viewcount of the article over its publication window.

    The window is whatever article_viewcounts() covers (publication date plus 14 days).
    Days without a logged count (None) are counted as zero so that the sum
    does not fail on articles with sparse data.
    """
    return sum(count or 0 for count in article_viewcounts(article_name))

def average_daily_viewcount(article_name):
    """
    Returns the average daily viewcount of the article over its publication window.

    Days without a logged count (None) are counted as zero but still count as days.
    Returns 0.0 when no daily values are available at all.
    """
    counts = [count or 0 for count in article_viewcounts(article_name)]
    if not counts:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(counts) / len(counts)

def get_all_articles(prefix):
    """
    Returns a list of the titles of the Signpost articles listed from `prefix` onwards.

    Pages are listed from the Wikipedia: namespace (4) under "Wikipedia Signpost",
    starting at `prefix` and stopping at "Wikipedia Signpost/A".
    A single request is made with a limit of 500 pages.
    Prefix is 2015-10-07 for now, the earliest published Signpost story for which data is available (yet).
    """
    api = mwapi.Session('https://en.wikipedia.org', user_agent='signpostviews Jupyter notebook')
    query = {
        'action': 'query',
        'list': 'allpages',
        'apfrom': prefix,
        'apto': 'Wikipedia Signpost/A',
        'apprefix': 'Wikipedia Signpost',
        'apnamespace': 4,
        'aplimit': 500,
        'formatversion': 2,
    }
    response = api.get(**query)
    titles = []
    for page in response['query']['allpages']:
        title = page['title']
        # Requiring two slashes filters out issue-level pages,
        # e.g. Wikipedia:Wikipedia Signpost/2015-07-18
        if title.count("/") >= 2:
            titles.append(title)
    return titles

def tabulate(articles):
    """
    Returns a dict mapping each article title to its list of viewcounts,
    as fetched by article_viewcounts().
    """
    views_by_article = {}
    for title in articles:
        views_by_article[title] = article_viewcounts(title)
    return views_by_article

In [6]:
# Skip titles containing '2016': those are too recent to have full data for.
targets = list(filter(lambda title: '2016' not in title,
                      get_all_articles("Wikipedia Signpost/2015-10-07/Op-ed")))
all_views = tabulate(targets)

In [65]:
# Show every row when the frame is displayed.
pd.set_option('display.max_rows', None)
# One row per article (sorted by title), one column per day of the window.
# Missing values (not enough views to be logged so the API returns NaN) become 0.
frame = DataFrame(
    [views for _, views in sorted(all_views.items())],
    index=sorted(all_views),
    columns=range(1, 16),
).fillna(0)
# Per-page summary statistics. A per-row average was tried too, but it is
# not that useful because of the many empty values.
frame['total'] = frame.apply(lambda row: sum(row), axis=1)

In [66]:
# Drop entries which obviously never made it to publication.
# Copy so the row assignment below writes to an independent frame, not a slice.
frame = frame[frame['total'] > 200].copy()
# Fix a particular trouble spot, where an article was renamed post-publication:
# fold the views of the 'Report' title into the 'report' title, then drop the former.
# `.loc` is used for label lookup; `.ix` has been removed from pandas.
renamed_to = 'Wikipedia:Wikipedia Signpost/2015-10-14/WikiConference report'
renamed_from = 'Wikipedia:Wikipedia Signpost/2015-10-14/WikiConference Report'
frame.loc[renamed_to] = frame.loc[renamed_to] + frame.loc[renamed_from]
frame = frame.drop(renamed_from)
frame

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,total
Wikipedia:Wikipedia Signpost/2015-10-07/Op-ed,16,3,2,8,382,663,363,270,180,169,172,117,87,22,29,2483
Wikipedia:Wikipedia Signpost/2015-10-07/Technology report,6,11,8,3,104,174,125,111,113,104,91,55,15,14,14,948
Wikipedia:Wikipedia Signpost/2015-10-07/Traffic report,7,32,5,3,203,233,148,145,146,108,102,77,30,28,14,1281
Wikipedia:Wikipedia Signpost/2015-10-14/Blog,28,154,170,122,84,88,93,86,72,6,11,18,0,0,0,932
Wikipedia:Wikipedia Signpost/2015-10-14/Editorial,52,355,268,151,109,108,111,88,79,19,39,16,0,0,0,1395
Wikipedia:Wikipedia Signpost/2015-10-14/Featured content,40,40,17,23,175,203,141,116,98,96,82,77,11,14,22,1155
Wikipedia:Wikipedia Signpost/2015-10-14/News and notes,22,13,19,27,336,262,167,104,106,112,103,82,10,11,36,1410
Wikipedia:Wikipedia Signpost/2015-10-14/Op-ed,17,4,10,10,175,241,147,101,112,103,89,78,10,14,23,1134
Wikipedia:Wikipedia Signpost/2015-10-14/Technology report,8,1,9,5,114,155,121,86,88,100,81,69,4,12,17,870
Wikipedia:Wikipedia Signpost/2015-10-14/Traffic report,17,13,8,4,165,224,141,100,95,104,95,75,10,11,65,1127
