# Data Preparation

In [3]:
# -*- coding: utf-8 -*-
%matplotlib inline
%config InlineBackend.figure_formats=['svg']

import json
import codecs
import os
import math

import numpy as np
import seaborn as sns
import datetime

import pandas as pd
import datetime as dt

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as dates

from bs4 import BeautifulSoup

from wekeypedia.wikipedia.page import WikipediaPage as Page

ImportError: No module named wekeypedia.wikipedia.page

## Data gathering

In [2]:
def gather_pages_data(dataset_dir_name, list_of_page_names):
    """Return page metadata for the requested pages, backed by a JSON cache.

    For each name the function looks for
    ``<dataset_dir_name>/data/pages/<page_name>.json``.  When the file exists
    its content is loaded; otherwise the page is fetched through ``Page``
    (``fetch_info``, ``get_revisions_list``, ``get_links``,
    ``get_categories``), added to the result and written to that file.

    Args:
        dataset_dir_name: root directory of the dataset; the cache directory
            is created below it when missing.
        list_of_page_names: iterable of page titles.

    Returns:
        dict mapping page name -> dict holding the fields returned by
        ``fetch_info`` for that page plus the keys ``'revisions'``,
        ``'links'`` and ``'categories'``.  Pages reported with the id
        ``'-1'`` and pages whose download raises are absent from the dict.
    """
    data_pages_dir_name = '%s/data/pages/' % dataset_dir_name
    if not os.path.exists(data_pages_dir_name):
        os.makedirs(data_pages_dir_name)

    pages_data = {}
    for page_name in list_of_page_names:
        file_name = os.path.join(data_pages_dir_name, '%s.json' % page_name)

        # Cached pages are served from disk without touching the network.
        if os.path.exists(file_name):
            with open(file_name) as f:
                pages_data[page_name] = json.load(f)
            continue

        wikipage = Page(title=page_name)
        request = wikipage.fetch_info(page_name)['query']['pages']
        page_id = list(request)[0]
        if page_id == '-1':
            # NOTE(review): '-1' presumably marks a page the API could not
            # find -- confirm against the Page/MediaWiki contract.
            continue

        try:
            data = dict(request[page_id])
            data['revisions'] = wikipage.get_revisions_list()
            data['links'] = wikipage.get_links()
            data['categories'] = wikipage.get_categories()
            pages_data[page_name] = data

            # Serialize before opening the file: opening in 'w' mode
            # truncates, so a failing dump must not leave an empty cache
            # file that would break json.load() on the next run.
            serialized = json.dumps(data)
            with open(file_name, 'w') as f:
                f.write(serialized)
        except Exception as e:
            # Best effort: report the page and carry on with the others.
            print('Error with page: %s' % page_name)
            print(e)
    return pages_data


def gather_pages_views(dataset_dir_name,list_of_page_names):
    """Download the page-view counts of each page into a JSON cache.

    A page is fetched only when
    ``<dataset_dir_name>/data/pagesviews/<page_name>.json`` does not exist
    yet.  The dictionaries yielded by ``Page.get_pageviews()`` are merged into
    a single dict (later entries win on duplicate keys) and written to that
    file.

    Args:
        dataset_dir_name: root directory of the dataset; the cache directory
            is created below it when missing.
        list_of_page_names: iterable of page titles.

    Returns:
        list of the page names whose download or caching raised an exception.
    """
    pages_views_dir_name = '%s/data/pagesviews/' % dataset_dir_name
    if not os.path.exists(pages_views_dir_name):
        os.makedirs(pages_views_dir_name)

    error = []
    for page_name in list_of_page_names:
        file_name = os.path.join(pages_views_dir_name, '%s.json' % page_name)
        if os.path.exists(file_name):
            continue
        try:
            wikipage = Page(title=page_name)
            page_views_ts = {}
            for chunk in wikipage.get_pageviews():
                page_views_ts.update(chunk)
            # Serialize first so that a failing dump never leaves a truncated
            # file that would later be mistaken for a valid cache entry.
            serialized = json.dumps(page_views_ts)
            # The target is ``file_name``; the previous code opened the
            # undefined name ``filename``, and the resulting NameError was
            # swallowed, so nothing was ever written.
            with open(file_name, 'w') as f:
                f.write(serialized)
        except Exception:
            # Best effort per page; ``Exception`` (rather than a bare except)
            # lets KeyboardInterrupt/SystemExit stop the run.
            error.append(page_name)
    return error

## Compute page-view time series

In [4]:
def get_pages_views_time_series(dataset_dir_name,list_of_page_names):
    """Build daily/weekly/monthly/yearly page-view series for each page.

    The raw counts are read from
    ``<dataset_dir_name>/data/pagesviews/<page_name>.json`` (pages without
    that file are skipped).  Results are cached as
    ``<dataset_dir_name>/stats/time series/<page_name>.pageviews.<granularity>.csv``
    and read back from there when all four files of a page exist.

    Args:
        dataset_dir_name: root directory of the dataset.
        list_of_page_names: iterable of page titles.

    Returns:
        tuple ``(daily, weekly, monthly, yearly)`` of dicts mapping page name
        -> DataFrame with a datetime index and a single ``page_views``
        column.  Weekly bins end on Mondays, monthly bins on month ends and
        yearly bins on year ends; each bin holds the sum of the daily values.
    """
    pages_views_dir_name = '%s/data/pagesviews/' % dataset_dir_name

    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    # Offset objects are equivalent to the aliases 'W-MON', 'M' and 'A' and
    # do not depend on which alias spellings a given pandas release accepts.
    granularities = (
        ('daily', None),
        ('weekly', pd.offsets.Week(weekday=0)),
        ('monthly', pd.offsets.MonthEnd()),
        ('yearly', pd.offsets.YearEnd()),
    )
    results = dict((label, {}) for label, _ in granularities)

    for page_name in list_of_page_names:
        file_name = os.path.join(pages_views_dir_name, '%s.json' % page_name)
        if not os.path.exists(file_name):
            continue

        csv_names = dict(
            (label, os.path.join(time_series_dir_name,
                                 '%s.pageviews.%s.csv' % (page_name, label)))
            for label, _ in granularities)

        # Use the cache only when it is complete; a partial cache (e.g. an
        # interrupted earlier run) is rebuilt from the JSON file instead of
        # failing on the missing CSV.
        if all(os.path.exists(path) for path in csv_names.values()):
            for label, _ in granularities:
                # read_csv(index_col=0, parse_dates=True) is the documented
                # replacement for DataFrame.from_csv.
                results[label][page_name] = pd.read_csv(
                    csv_names[label], index_col=0, parse_dates=True)
            continue

        with open(file_name) as f:
            raw = json.load(f)

        index = []
        series = []
        for day, views in raw.items():
            try:
                stamp = pd.to_datetime(day, format="%Y-%m-%d")
            except ValueError:
                # Keys that are not dates are ignored.
                continue
            index.append(stamp)
            series.append(views)

        # A DatetimeIndex keeps resample() usable even when no key parsed;
        # sorting makes the daily series chronological whatever the order of
        # the JSON object.
        daily = pd.DataFrame({'page_views': series},
                             index=pd.DatetimeIndex(index)).sort_index()

        for label, offset in granularities:
            # resample(...).sum() replaces the removed ``how='sum'`` keyword
            # (needs pandas >= 0.18).
            frame = daily if offset is None else daily.resample(offset).sum()
            frame.to_csv(csv_names[label], encoding="utf-8")
            results[label][page_name] = frame

    return (results['daily'], results['weekly'],
            results['monthly'], results['yearly'])


# Revisions time series

In [5]:
def get_pages_revisions_time_series_gen(pages_revisions,suffixe,time_series_dir_name):
    """Count revisions per day/week/month/year for each page.

    Results are cached in ``time_series_dir_name`` as
    ``<suffixe>.<page_name>.<granularity>.csv`` (the directory must already
    exist) and read back from there when all four files of a page exist.
    Note that the suffix comes *before* the page name, unlike the page-view
    files; the layout is kept so existing caches stay valid.

    Args:
        pages_revisions: dict mapping page name -> list of revision dicts,
            each with a ``'timestamp'`` entry parseable by
            ``pandas.to_datetime``.
        suffixe: prefix of the cache file names (e.g. ``'revisions.ip'``).
        time_series_dir_name: directory holding the CSV caches.

    Returns:
        tuple ``(daily, weekly, monthly, yearly)`` of dicts mapping page name
        -> DataFrame with a datetime index and a single ``revisions`` column.
        Pages with no revision and no cache are absent from the dicts.
    """
    # Offset objects are equivalent to the aliases 'W-MON', 'M' and 'A' and
    # do not depend on which alias spellings a given pandas release accepts.
    granularities = (
        ('daily', None),
        ('weekly', pd.offsets.Week(weekday=0)),
        ('monthly', pd.offsets.MonthEnd()),
        ('yearly', pd.offsets.YearEnd()),
    )
    results = dict((label, {}) for label, _ in granularities)

    for page_name in pages_revisions.keys():
        csv_names = dict(
            (label, os.path.join(time_series_dir_name,
                                 '%s.%s.%s.csv' % (suffixe, page_name, label)))
            for label, _ in granularities)

        # Only trust a complete cache; otherwise recompute all four files.
        if all(os.path.exists(path) for path in csv_names.values()):
            for label, _ in granularities:
                # read_csv(index_col=0, parse_dates=True) is the documented
                # replacement for DataFrame.from_csv.
                results[label][page_name] = pd.read_csv(
                    csv_names[label], index_col=0, parse_dates=True)
            continue

        revisions = pages_revisions[page_name]
        if len(revisions) == 0:
            continue

        # Reduce every timestamp to its calendar day and count occurrences.
        # Counting with size() does not depend on any other revision field
        # (the previous code read the counts from a 'size' column and failed
        # when revisions carried no such field).
        days = pd.Series([pd.to_datetime(rev['timestamp']).strftime('%Y-%m-%d')
                          for rev in revisions])
        counts = days.groupby(days).size()
        daily = pd.DataFrame(
            {'revisions': counts.values},
            index=pd.to_datetime(counts.index, format="%Y-%m-%d"))

        for label, offset in granularities:
            # resample(...).sum() replaces the removed ``how='sum'`` keyword
            # (needs pandas >= 0.18).
            frame = daily if offset is None else daily.resample(offset).sum()
            frame = frame.fillna(0)
            frame.to_csv(csv_names[label], encoding="utf-8")
            results[label][page_name] = frame

    return (results['daily'], results['weekly'],
            results['monthly'], results['yearly'])


## ## ## ## ## ## ## ## ## ##


def get_pages_revisions_time_series(dataset_dir_name,list_of_page_names):
    """Return the revision time series of each page, all contributors included.

    Makes sure ``<dataset_dir_name>/stats/time series/`` exists, loads the
    pages with ``gather_pages_data`` and hands every page's full revision
    list to ``get_pages_revisions_time_series_gen`` with the suffix
    ``'revisions'``; that function's return value is passed through.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    pages = gather_pages_data(dataset_dir_name, list_of_page_names)
    revisions_by_page = dict(
        (title, page['revisions']) for title, page in pages.items())

    return get_pages_revisions_time_series_gen(
        revisions_by_page, 'revisions', time_series_dir_name)

def get_pages_ip_revisions_time_series(dataset_dir_name,list_of_page_names):
    """Return the revision time series restricted to revisions with userid 0.

    Makes sure ``<dataset_dir_name>/stats/time series/`` exists, loads the
    pages with ``gather_pages_data``, keeps the revisions that carry a
    ``'userid'`` equal to 0 and hands them to
    ``get_pages_revisions_time_series_gen`` with the suffix
    ``'revisions.ip'``; that function's return value is passed through.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    def from_ip(revision):
        # Revisions without a 'userid' entry are left out.
        if 'userid' not in revision:
            return False
        return revision['userid'] == 0

    pages = gather_pages_data(dataset_dir_name, list_of_page_names)
    ip_revisions = {}
    for title, page in pages.items():
        ip_revisions[title] = [rev for rev in page['revisions'] if from_ip(rev)]

    return get_pages_revisions_time_series_gen(
        ip_revisions, 'revisions.ip', time_series_dir_name)

def get_pages_bot_revisions_time_series(dataset_dir_name,list_of_page_names):
    """Return the revision time series restricted to bot-like user names.

    Makes sure ``<dataset_dir_name>/stats/time series/`` exists, loads the
    pages with ``gather_pages_data``, keeps the revisions whose ``'user'``
    contains ``bot`` (case-insensitive substring test) and hands them to
    ``get_pages_revisions_time_series_gen`` with the suffix
    ``'revisions.bot'``; that function's return value is passed through.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    def from_bot(revision):
        # Revisions without a 'user' entry are left out.
        if 'user' not in revision:
            return False
        return 'bot' in revision['user'].lower()

    pages = gather_pages_data(dataset_dir_name, list_of_page_names)
    bot_revisions = {}
    for title, page in pages.items():
        bot_revisions[title] = [rev for rev in page['revisions'] if from_bot(rev)]

    return get_pages_revisions_time_series_gen(
        bot_revisions, 'revisions.bot', time_series_dir_name)

def get_pages_members_revisions_time_series(dataset_dir_name,list_of_page_names):
    """Return the revision time series restricted to registered, non-bot users.

    Makes sure ``<dataset_dir_name>/stats/time series/`` exists, loads the
    pages with ``gather_pages_data``, keeps the revisions that have a
    ``'user'`` not containing ``bot`` (case-insensitive) and a ``'userid'``
    different from 0, and hands them to
    ``get_pages_revisions_time_series_gen`` with the suffix
    ``'revisions.members'``; that function's return value is passed through.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    def from_member(revision):
        # Both keys are required: the previous filter tested only 'user' and
        # then read 'userid', raising KeyError on revisions lacking it.
        if 'user' not in revision or 'userid' not in revision:
            return False
        return revision['userid'] != 0 and 'bot' not in revision['user'].lower()

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)

    data = {}
    for title, page in pages_data.items():
        data[title] = [rev for rev in page['revisions'] if from_member(rev)]

    return get_pages_revisions_time_series_gen(
        data, 'revisions.members', time_series_dir_name)
    

## Gather and compute data

In [2]:
def gather_and_compute_data(dataset_dir_name,list_of_page_names):
    """Run every gathering and time-series step once, for their side effects.

    The steps are called in order with the same two arguments: page data,
    page views, page-view series, then the members, bot, IP and overall
    revision series.  Their return values are discarded.
    """
    pipeline = (
        gather_pages_data,
        gather_pages_views,
        get_pages_views_time_series,
        get_pages_members_revisions_time_series,
        get_pages_bot_revisions_time_series,
        get_pages_ip_revisions_time_series,
        get_pages_revisions_time_series,
    )
    for step in pipeline:
        step(dataset_dir_name, list_of_page_names)
    

## Statistics computation

In [None]:
def stat_computation(dataset_dir_name,list_of_page_names):
    """Build a per-page table of descriptive statistics.

    Page data comes from ``gather_pages_data``, yearly page views from
    ``get_pages_views_time_series`` and the quality assessment from the
    categories of each page's ``Talk:`` page (also loaded through
    ``gather_pages_data``).

    Args:
        dataset_dir_name: root directory of the dataset.
        list_of_page_names: iterable of page titles.

    Returns:
        DataFrame indexed by page name with the columns

        * ``Page_id``, ``Length``, ``Namespace``: the page's ``pageid``,
          ``length`` and ``ns`` fields;
        * ``Nb_revisions`` and its ``_IP`` / ``_Bot`` / ``_wiki`` variants:
          number of revisions overall, with userid 0, with a bot-like user
          name, and by other registered users;
        * ``Nb_editors`` and its ``_IP`` / ``_Bot`` / ``_wiki`` variants:
          number of distinct user names in the same four groups;
        * ``Links``: number of entries in the page's ``links``;
        * ``Date``: days between 2001-01-15 and the earliest revision
          (NaN when the page has no revision);
        * ``Page_views_2010``: value of the yearly page-view series at
          2010-12-31 (NaN when unavailable);
        * ``Quality``: 0-7 rank derived from the talk page's ``*-Class``
          categories (0 when none is found);
        * ``Is_list``: 1 when an ``FL-Class`` or ``List-Class`` category is
          present, else 0.
    """
    import re  # local import: keeps this block usable on its own

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    # Reference date used by the 'Date' column.
    reference_date = datetime.datetime.strptime("2001-01-15T00:00:00Z", timestamp_format)

    # Whole-token match: the look-behind prevents e.g. 'na-class' or
    # 'sia-class' from being read as 'a-class', which plain substring tests do.
    class_token = re.compile(r'(?<![a-z])(fa|fl|ga|a|b|c|start|stub|list)-class')
    class_rank = {'fa': 7, 'fl': 7, 'a': 6, 'ga': 5, 'b': 4, 'c': 3, 'start': 2, 'stub': 1}
    list_tokens = ('fl', 'list')

    def is_ip(revision):
        return 'userid' in revision and revision['userid'] == 0

    def is_bot(revision):
        # Substring heuristic: any user name containing 'bot' counts as a bot.
        return 'user' in revision and 'bot' in revision['user'].lower()

    def is_member(revision):
        return ('user' in revision and 'userid' in revision
                and revision['userid'] != 0 and not is_bot(revision))

    def editors(revisions):
        return set(rev['user'] for rev in revisions if 'user' in rev)

    def number_of_days_after(date):
        return (datetime.datetime.strptime(date, timestamp_format) - reference_date).days

    def get_quality(page):
        talk_name = 'Talk:' + page
        talk = gather_pages_data(dataset_dir_name, [talk_name])
        if len(talk) == 0:
            return 0, 0
        quality = 0
        islist = 0
        for category in talk[talk_name]['categories']:
            for token in class_token.findall(category['title'].lower()):
                quality = max(quality, class_rank.get(token, 0))
                if token in list_tokens:
                    islist = 1
        return quality, islist

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)
    yearly_views = get_pages_views_time_series(dataset_dir_name, list_of_page_names)[3]

    columns = ['Page_id', 'Length', 'Namespace',
               'Nb_revisions', 'Nb_revisions_IP', 'Nb_revisions_Bot', 'Nb_revisions_wiki',
               'Nb_editors', 'Nb_editors_IP', 'Nb_editors_Bot', 'Nb_editors_wiki',
               'Links', 'Date', 'Page_views_2010', 'Quality', 'Is_list']

    names = list(pages_data.keys())
    rows = []
    for name in names:
        page = pages_data[name]
        revisions = page['revisions']
        ip_revisions = [rev for rev in revisions if is_ip(rev)]
        bot_revisions = [rev for rev in revisions if is_bot(rev)]
        member_revisions = [rev for rev in revisions if is_member(rev)]

        # Date of the first contribution, in days after the reference date.
        if revisions:
            first_day = min(number_of_days_after(rev['timestamp']) for rev in revisions)
        else:
            first_day = float('nan')

        # Page views of the year 2010 (the yearly bin is labelled 2010-12-31).
        try:
            views_2010 = yearly_views[name]['page_views']['2010-12-31']
        except KeyError:
            # No series for this page, or no 2010 bin in it.
            views_2010 = float('nan')

        quality, islist = get_quality(name)

        rows.append({
            'Page_id': page['pageid'],
            'Length': page['length'],
            'Namespace': page['ns'],
            'Nb_revisions': len(revisions),
            'Nb_revisions_IP': len(ip_revisions),
            'Nb_revisions_Bot': len(bot_revisions),
            'Nb_revisions_wiki': len(member_revisions),
            'Nb_editors': len(editors(revisions)),
            'Nb_editors_IP': len(editors(ip_revisions)),
            'Nb_editors_Bot': len(editors(bot_revisions)),
            'Nb_editors_wiki': len(editors(member_revisions)),
            'Links': len(page['links']),
            'Date': first_day,
            'Page_views_2010': views_2010,
            'Quality': quality,
            'Is_list': islist,
        })

    # Build the frame in one go instead of enlarging it cell by cell through
    # the removed ``.ix`` indexer.
    return pd.DataFrame(rows, index=names, columns=columns)


# yearly time series revisions

NameError: global name 'gather_pages_data' is not defined

# Report on the evolution of revisions and page views for a set of pages

In [1]:
from IPython.display import display, HTML

def compute_sum_ts(dic_ts,col_name):
    """Sum the ``col_name`` column of several time series into one frame.

    Args:
        dic_ts: dict mapping a title -> DataFrame containing a ``col_name``
            column; the input frames are not modified.
        col_name: name of the column to add up, also the name of the single
            column of the result.

    Returns:
        DataFrame with one ``col_name`` column indexed by the union of the
        input indexes, a missing value counting as 0.  With a single input a
        copy of that frame is returned unchanged; with no input the result
        is an empty frame that still has the ``col_name`` column, so callers
        can select it and join on it.
    """
    frames = list(dic_ts.values())

    if not frames:
        # Keep the column (and a datetime index) so that downstream joins
        # and column lookups work when no page has data.
        return pd.DataFrame(columns=[col_name], index=pd.DatetimeIndex([]), dtype=float)

    if len(frames) == 1:
        return frames[0].copy()

    # Align every series on the union of the indexes in one pass, then add
    # them up row-wise; gaps are treated as zero.
    aligned = pd.concat([frame[col_name] for frame in frames], axis=1)
    total = aligned.fillna(0).sum(axis=1).sort_index()
    return total.to_frame(col_name)

        
        
def get_report_set_of_pages(revisions_ip_ts,revisions_bot_ts,revisions_members_ts,pageviews_ts,ip=True,bot=True,member=True):
    """Display revisions against page views for a set of pages.

    Each ``*_ts`` argument is a dict of per-page DataFrames that is summed
    with ``compute_sum_ts``: the revision dicts on the columns
    ``revisions_ip``, ``revisions_bot`` and ``revisions_members``, the
    page-view dict on ``page_views``.  For every enabled group a heading is
    displayed, followed by a figure with the revisions on the primary axis
    and the page views on a secondary axis.

    Args:
        revisions_ip_ts, revisions_bot_ts, revisions_members_ts: revision
            series per page for the three contributor groups.
        pageviews_ts: page-view series per page.
        ip, bot, member: flags selecting which groups are shown; sections
            are rendered in the order ip, members, bot.
    """
    display(HTML("<h2>Evolution of pageviews and revisions</h2>" ))

    df_pageviews = compute_sum_ts(pageviews_ts,'page_views')

    # (enabled, heading label, column name, per-page series)
    sections = (
        (ip, 'ip', 'revisions_ip', revisions_ip_ts),
        (member, 'members', 'revisions_members', revisions_members_ts),
        (bot, 'bot', 'revisions_bot', revisions_bot_ts),
    )

    for enabled, label, column, revisions_ts in sections:
        if not enabled:
            # Nothing is computed for a section that is not displayed.
            continue

        merged = compute_sum_ts(revisions_ts, column).join(df_pageviews, how='outer').fillna(0)

        # The heading is opened and closed with the same tag (the bot
        # heading used to be closed with </h2>).
        display(HTML("<h3>Evolution of pageviews and revisions by %s</h3>" % label))

        merged[column].plot(figsize=(11,4), linewidth=0.5, ylim=0, colormap="Spectral", rot=0)
        merged['page_views'].plot(secondary_y=True, style="-", linewidth=0.5, ylim=0, sharex=True)
        plt.show()

    

def get_monthly_report_set_of_pages(dataset_dir_name,list_of_page_names,ip=True,bot=True,member=True):
    """Display the monthly revisions/page-views report for a set of pages.

    Loads the monthly series (third element of each loader's result) for IP,
    bot and member revisions and for page views, renames the ``revisions``
    column of each revision frame to ``revisions_ip`` / ``revisions_bot`` /
    ``revisions_members`` and passes everything to
    ``get_report_set_of_pages``.  A message is printed for every requested
    page that has no series in a given group.

    Args:
        dataset_dir_name: root directory of the dataset.
        list_of_page_names: iterable of page titles.
        ip, bot, member: forwarded to ``get_report_set_of_pages``.
    """
    monthly = 2  # position of the monthly dict in (daily, weekly, monthly, yearly)

    loaders = (
        ('ip', get_pages_ip_revisions_time_series),
        ('bot', get_pages_bot_revisions_time_series),
        ('members', get_pages_members_revisions_time_series),
    )
    loaded = {}
    for label, loader in loaders:
        loaded[label] = loader(dataset_dir_name, list_of_page_names)[monthly]

    renamed = dict((label, {}) for label, _ in loaders)
    for title in list_of_page_names:
        for label, _ in loaders:
            if title not in loaded[label]:
                print('No revisions ts found for %s for page %s' % (label, title))
                continue
            # rename() returns a new frame and targets the exact column name,
            # so a title listed twice is not renamed twice (the substring
            # replace used before produced e.g. 'revisions_ip_ip').
            renamed[label][title] = loaded[label][title].rename(
                columns={'revisions': 'revisions_' + label})

    pages_views_ts = get_pages_views_time_series(dataset_dir_name, list_of_page_names)[monthly]

    get_report_set_of_pages(renamed['ip'], renamed['bot'], renamed['members'],
                            pages_views_ts, ip=ip, bot=bot, member=member)
    
    
def get_yearly_report_set_of_pages(dataset_dir_name,list_of_page_names,ip=True,bot=True,member=True):
    """Display the yearly revisions/page-views report for a set of pages.

    Loads the yearly series (fourth element of each loader's result) for IP,
    bot and member revisions and for page views, renames the ``revisions``
    column of each revision frame to ``revisions_ip`` / ``revisions_bot`` /
    ``revisions_members`` and passes everything to
    ``get_report_set_of_pages``.  A message is printed for every requested
    page that has no series in a given group.

    Args:
        dataset_dir_name: root directory of the dataset.
        list_of_page_names: iterable of page titles.
        ip, bot, member: forwarded to ``get_report_set_of_pages``.
    """
    yearly = 3  # position of the yearly dict in (daily, weekly, monthly, yearly)

    loaders = (
        ('ip', get_pages_ip_revisions_time_series),
        ('bot', get_pages_bot_revisions_time_series),
        ('members', get_pages_members_revisions_time_series),
    )
    loaded = {}
    for label, loader in loaders:
        loaded[label] = loader(dataset_dir_name, list_of_page_names)[yearly]

    renamed = dict((label, {}) for label, _ in loaders)
    for title in list_of_page_names:
        for label, _ in loaders:
            if title not in loaded[label]:
                print('No revisions ts found for %s for page %s' % (label, title))
                continue
            # rename() returns a new frame and targets the exact column name,
            # so a title listed twice is not renamed twice (the substring
            # replace used before produced e.g. 'revisions_ip_ip').
            renamed[label][title] = loaded[label][title].rename(
                columns={'revisions': 'revisions_' + label})

    pages_views_ts = get_pages_views_time_series(dataset_dir_name, list_of_page_names)[yearly]

    get_report_set_of_pages(renamed['ip'], renamed['bot'], renamed['members'],
                            pages_views_ts, ip=ip, bot=bot, member=member)
    