# Data Preparation

In [2]:
# -*- coding: utf-8 -*-
%matplotlib inline
%config InlineBackend.figure_formats=['svg']

import codecs
import datetime
import datetime as dt
import hashlib
import json
import math
import os
import re
import sys

import matplotlib as mpl
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from IPython.display import display, HTML

from wekeypedia.wikipedia.page import WikipediaPage as Page

ImportError: No module named wekeypedia.wikipedia.page

## Data gathering

In [None]:
def get_path_to_file(name):
    """Map a page name to its sharded location in the on-disk cache.

    The name is UTF-8 encoded and hashed with MD5. The first four hex
    digits of the digest become four nested one-character directories.

    Returns:
        (path, digest): ``path`` has the form '/a/b/c/d/' and ``digest`` is
        the full hexadecimal MD5 digest, which callers use as file stem.
    """
    digest = hashlib.md5(name.encode('utf-8')).hexdigest()
    shard = '/%s/' % '/'.join(digest[:4])
    return shard, digest

def load_pages_data(dataset_dir_name, list_of_page_names):
    """Read already-cached page data from disk.

    For every requested page the JSON file located through
    get_path_to_file() under '<dataset_dir_name>/data/pages/' is loaded when
    it exists; pages without a cache file are left out of the result.
    The cache directories are created as a side effect when missing.

    Returns:
        dict mapping page name -> decoded JSON content.
    """
    pages_root = '%s/data/pages/' % dataset_dir_name
    if not os.path.exists(pages_root):
        os.makedirs(pages_root)

    loaded = {}
    for title in list_of_page_names:
        shard, digest = get_path_to_file(title)
        shard_dir = '%s/%s' % (pages_root, shard)
        if not os.path.exists(shard_dir):
            os.makedirs(shard_dir)

        cache_file = '%s/%s/%s.json' % (pages_root, shard, digest)
        if not os.path.exists(cache_file):
            # Nothing cached for this page: it is simply absent from the result.
            continue
        with open(cache_file) as handle:
            loaded[title] = json.load(handle)
    return loaded


def gather_pages_data(dataset_dir_name, list_of_page_names):
    """Return page data for each page, fetching and caching what is missing.

    Each page is looked up in the JSON cache under
    '<dataset_dir_name>/data/pages/'. On a cache miss the page is fetched
    through ``Page`` (info, revisions, links, backlinks, categories) and the
    result is written to the cache.

    Pages for which fetch_info() reports the id '-1' are skipped. Errors
    raised while fetching the details or writing the cache are printed and
    the loop moves on to the next page.

    Returns:
        dict mapping page name -> page data dict.
    """
    data_pages_dir_name = '%s/data/pages/' % dataset_dir_name
    if not os.path.exists(data_pages_dir_name):
        os.makedirs(data_pages_dir_name)

    pages_data = {}
    for page_name in list_of_page_names:
        path, hashmd5 = get_path_to_file(page_name)
        page_dir = '%s/%s' % (data_pages_dir_name, path)
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        file_name = '%s/%s/%s.json' % (data_pages_dir_name, path, hashmd5)

        if os.path.exists(file_name):
            with open(file_name) as f:
                pages_data[page_name] = json.load(f)
            continue

        wikipage = Page(title=page_name)
        request = wikipage.fetch_info(page_name)['query']['pages']
        page_id = list(request)[0]
        if page_id == '-1':
            continue

        try:
            data = dict(request[page_id])
            data['revisions'] = wikipage.get_revisions_list()
            data['links'] = wikipage.get_links()
            data['backlinks'] = wikipage.get_backlinks()
            data['categories'] = wikipage.get_categories()
            pages_data[page_name] = data

            # Serialise BEFORE opening the file: opening with 'w' creates the
            # file immediately, so a failing json.dumps() used to leave an
            # empty cache entry that made every later json.load() raise.
            serialized = json.dumps(data)
            with open(file_name, 'w') as f:
                f.write(serialized)
        except Exception as e:
            # Single-argument print() behaves the same on Python 2 and 3.
            print('Error with page: %s' % page_name)
            print(e)
    return pages_data


def gather_pages_views(dataset_dir_name, list_of_page_names):
    """Fetch and cache the page-view counts of every page not cached yet.

    The dictionaries returned by ``Page.get_pageviews()`` are merged into
    one mapping and stored as JSON under '<dataset_dir_name>/data/pagesviews/'.
    Pages that already have a cache file are left untouched.

    Returns:
        list of page names whose page views could not be fetched or stored.
    """
    pages_views_dir_name = '%s/data/pagesviews/' % dataset_dir_name
    if not os.path.exists(pages_views_dir_name):
        os.makedirs(pages_views_dir_name)

    error = []
    for page_name in list_of_page_names:
        path, hashmd5 = get_path_to_file(page_name)
        page_dir = '%s/%s' % (pages_views_dir_name, path)
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        file_name = '%s/%s/%s.json' % (pages_views_dir_name, path, hashmd5)
        if os.path.exists(file_name):
            continue

        try:
            wikipage = Page(title=page_name)
            page_views_ts = {k: v for x in wikipage.get_pageviews() for (k, v) in x.items()}
            # Serialise first: an existing file is never refetched, so an
            # empty file left by a failed dump would poison the cache for good.
            serialized = json.dumps(page_views_ts)
            with open(file_name, 'w') as f:
                f.write(serialized)
        except Exception:
            # Best effort per page, but let KeyboardInterrupt/SystemExit through.
            error.append(page_name)
    return error

In [None]:
#get quality
# '<label>-class' as a whole token, e.g. 'b' in 'category:b-class france articles'.
_ASSESSMENT_CLASS = re.compile(r'\b([a-z]+)-class\b')
_QUALITY_BY_CLASS = {'fa': 6, 'fl': 6, 'a': 5, 'ga': 5, 'b': 4, 'c': 3, 'start': 2, 'stub': 1}
_LIST_CLASSES = ('fl', 'list')
# Importance level as a whole word, so 'highways' or 'pyramid' do not count.
_IMPORTANCE_LEVEL = re.compile(r'\b(top|high|mid|low)\b')
_IMPORTANCE_RANK = {'top': 4, 'high': 3, 'mid': 2, 'low': 1}


def get_quality(page, dataset_dir=None):
    """Grade a page from the categories of its 'Talk:' page.

    Args:
        page: title of the article (without the 'Talk:' prefix).
        dataset_dir: dataset directory handed to gather_pages_data(). When
            omitted, the module-level ``dataset_dir_name`` is used, which is
            what this function always did before the parameter existed.

    Returns:
        (quality, importance, is_list)
        - quality: 6 for FA/FL, 5 for A/GA, 4 for B, 3 for C, 2 for Start,
          1 for Stub or when no class category is found.
        - importance: 4 top, 3 high, 2 mid, 1 low, 0 when none is found.
        - is_list: 1 when an FL-class or List-class category is present.
        (0, 0, 0) is returned when no data is available for the talk page.

    Class labels and importance levels are matched as whole tokens.
    Plain substring tests used to rate 'NA-Class' or 'Media-Class' pages as
    A-class and to read 'high' out of a topic name such as 'Highways'.
    """
    if dataset_dir is None:
        dataset_dir = dataset_dir_name  # notebook-level global

    talk_name = 'Talk:' + page
    talk = gather_pages_data(dataset_dir, [talk_name])
    if len(talk) == 0:
        return 0, 0, 0

    quality = 1
    impor = 0
    islist = 0
    for category in talk[talk_name]['categories']:
        cat = category['title'].lower()
        for label in _ASSESSMENT_CLASS.findall(cat):
            quality = max(quality, _QUALITY_BY_CLASS.get(label, 0))
            if label in _LIST_CLASSES:
                islist = 1
        if 'importance' in cat:
            for level in _IMPORTANCE_LEVEL.findall(cat):
                impor = max(impor, _IMPORTANCE_RANK[level])
    return (quality, impor, islist)

def stat_computation(dataset_dir_name, list_of_page_names):
    """Build one row of summary statistics per page.

    Columns, in order:
      - 'Length': the page's 'length' field.
      - For each of the prefixes 'revisions' (all revisions),
        'revisions_members' and 'revisions_ip':
          '<prefix>_YYYY'      revisions made during year YYYY (2001-2014);
          '<prefix>_bef_YYYY'  revisions made up to the end of year YYYY-1
                               (YYYY from 2002 to 2015).
      - 'Page_views_YYYY': page views of year YYYY (2007-2014).
      - 'Quality', 'Importance', 'Is_list': the tuple from get_quality().
      - 'Date': days between 2001-01-15 and the page's first revision.

    Cells stay NaN when the corresponding data is missing (page without
    data, no revision of that kind, no page-view figure for that year).

    Returns:
        pandas.DataFrame indexed by ``list_of_page_names``.

    NOTE(review): get_quality() is called with the page title only, so it
    does not receive this function's ``dataset_dir_name`` argument.
    """
    revision_years = list(range(2001, 2015))
    pageview_years = list(range(2007, 2015))
    revision_sources = (
        ('revisions', get_pages_revisions_time_series),
        ('revisions_members', get_pages_members_revisions_time_series),
        ('revisions_ip', get_pages_ip_revisions_time_series),
    )
    wikipedia_start = datetime.datetime(2001, 1, 15)
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    columns = ['Length']
    for prefix, _ in revision_sources:
        columns += ['%s_%d' % (prefix, year) for year in revision_years]
        columns += ['%s_bef_%d' % (prefix, year + 1) for year in revision_years]
    columns += ['Page_views_%d' % year for year in pageview_years]
    columns += ['Quality', 'Importance', 'Is_list', 'Date']

    records = {}
    for page in list_of_page_names:
        pages_data = gather_pages_data(dataset_dir_name, [page])
        if len(pages_data) == 0:
            continue
        info = pages_data[page]
        record = {'Length': info['length']}

        for prefix, series_function in revision_sources:
            # The fourth element of the returned tuple is the yearly series.
            yearly = series_function(dataset_dir_name, [page])[3].get(page)
            if yearly is None:
                continue
            counts = yearly['revisions']
            for year in revision_years:
                # Yearly bins are labelled YYYY-12-31 and label slices include
                # both ends, so the slice must start on Jan 1st; starting on
                # the previous Dec 31st also summed the previous year.
                in_year = counts.loc['%d-01-01' % year:'%d-12-31' % year]
                record['%s_%d' % (prefix, year)] = in_year.sum()
                up_to_year = counts.loc[:'%d-12-31' % year]
                record['%s_bef_%d' % (prefix, year + 1)] = up_to_year.sum()

        views = get_pages_views_time_series(dataset_dir_name, [page])[3].get(page)
        if views is not None:
            yearly_views = views['page_views']
            for year in pageview_years:
                in_year = yearly_views[yearly_views.index.year == year]
                # A year without figures stays NaN instead of raising KeyError.
                if len(in_year) > 0:
                    record['Page_views_%d' % year] = in_year.iloc[0]

        quality, impor, islist = get_quality(page)
        record['Quality'] = quality
        record['Importance'] = impor
        record['Is_list'] = islist

        if info['revisions']:
            first_revision = min(
                datetime.datetime.strptime(revision['timestamp'], timestamp_format)
                for revision in info['revisions'])
            record['Date'] = (first_revision - wikipedia_start).days

        records[page] = record

    df = pd.DataFrame.from_dict(records, orient='index')
    return df.reindex(index=list(list_of_page_names), columns=columns)

## Compute page views time series

In [None]:
def get_pages_views_time_series(dataset_dir_name, list_of_page_names):
    """Return daily, weekly, monthly and yearly page-view series per page.

    The raw counts are read from the JSON files under
    '<dataset_dir_name>/data/pagesviews/' (the location gather_pages_views()
    writes to). Keys that are not valid 'YYYY-MM-DD' dates are ignored.
    The resulting series are cached as CSV files under
    '<dataset_dir_name>/stats/time series/' and read back from there on
    later calls. Pages without a raw JSON file are absent from the result.

    Returns:
        (daily, weekly, monthly, yearly): four dicts mapping page name to a
        DataFrame with one 'page_views' column. Weekly bins end on Mondays.
    """
    pages_views_dir_name = '%s/data/pagesviews/' % dataset_dir_name

    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    # Offset objects equivalent to the 'W-MON', 'M' and 'A' rule strings.
    periods = (
        ('weekly', pd.offsets.Week(weekday=0)),
        ('monthly', pd.offsets.MonthEnd()),
        ('yearly', pd.offsets.YearEnd()),
    )
    labels = ('daily', 'weekly', 'monthly', 'yearly')
    series = dict((label, {}) for label in labels)

    for page_name in list_of_page_names:
        path, hashmd5 = get_path_to_file(page_name)
        file_name = '%s/%s/%s.json' % (pages_views_dir_name, path, hashmd5)
        page_dir = '%s/%s' % (time_series_dir_name, path)
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        csv_names = dict(
            (label, '%s/%s/%s.pageviews.%s.csv' % (time_series_dir_name, path, hashmd5, label))
            for label in labels)

        if not os.path.exists(file_name):
            continue

        if os.path.exists(csv_names['daily']):
            for label in labels:
                # read_csv(index_col=0, parse_dates=True) is the documented
                # equivalent of the removed DataFrame.from_csv().
                series[label][page_name] = pd.read_csv(
                    csv_names[label], index_col=0, parse_dates=True)
            continue

        with open(file_name) as f:
            raw_views = json.load(f)

        index = []
        values = []
        for day, views in raw_views.items():
            try:
                index.append(pd.to_datetime(day, format="%Y-%m-%d"))
            except ValueError:
                continue
            values.append(views)

        # JSON objects carry no ordering guarantee: sort the days explicitly.
        daily = pd.DataFrame(
            values, index=pd.DatetimeIndex(index), columns=['page_views']).sort_index()

        frames = {'daily': daily}
        for label, offset in periods:
            # Same bins as resample(rule, how='sum'), using an API that exists
            # in both old and current pandas.
            frames[label] = daily.groupby(pd.Grouper(freq=offset)).sum()

        # The daily file is the cache sentinel tested above, so it is written
        # last: an interrupted run can no longer leave a half-filled cache.
        for label in ('weekly', 'monthly', 'yearly', 'daily'):
            frames[label].to_csv(csv_names[label], encoding="utf-8")
            series[label][page_name] = frames[label]

    return (series['daily'], series['weekly'], series['monthly'], series['yearly'])


# Revisions time series

In [None]:
def get_pages_revisions_time_series_gen(pages_revisions, suffixe, time_series_dir_name):
    """Count revisions per day, week, month and year for each page.

    Args:
        pages_revisions: dict mapping page name -> list of revision dicts;
            each revision needs a 'timestamp' entry.
        suffixe: tag placed in the cache file names, so that differently
            filtered revision sets do not share cache files.
        time_series_dir_name: root directory of the CSV cache.

    The series are read from the CSV cache when the daily file exists and
    are computed and cached otherwise. Pages with an empty revision list
    and no cached series are absent from the result.

    Returns:
        (daily, weekly, monthly, yearly): four dicts mapping page name to a
        DataFrame with one 'revisions' column. The daily series only has
        rows for days with at least one revision; weekly bins end on Mondays.
    """
    # Offset objects equivalent to the 'W-MON', 'M' and 'A' rule strings.
    periods = (
        ('weekly', pd.offsets.Week(weekday=0)),
        ('monthly', pd.offsets.MonthEnd()),
        ('yearly', pd.offsets.YearEnd()),
    )
    labels = ('daily', 'weekly', 'monthly', 'yearly')
    series = dict((label, {}) for label in labels)

    for page_name, revisions in pages_revisions.items():
        path, hashmd5 = get_path_to_file(page_name)
        page_dir = '%s/%s' % (time_series_dir_name, path)
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        csv_names = dict(
            (label, '%s/%s/%s.%s.%s.csv' % (time_series_dir_name, path, suffixe, hashmd5, label))
            for label in labels)

        if os.path.exists(csv_names['daily']):
            for label in labels:
                # read_csv(index_col=0, parse_dates=True) is the documented
                # equivalent of the removed DataFrame.from_csv().
                series[label][page_name] = pd.read_csv(
                    csv_names[label], index_col=0, parse_dates=True)
            continue

        if len(revisions) == 0:
            continue

        days = pd.DataFrame(revisions)['timestamp'].apply(
            lambda stamp: pd.to_datetime(stamp).strftime('%Y-%m-%d'))
        # size() counts rows per day directly; the previous code read the
        # counts out of a "size" column and failed when revisions had none.
        per_day = days.groupby(days).size()
        daily = pd.DataFrame(
            per_day.tolist(),
            columns=['revisions'],
            index=pd.to_datetime(list(per_day.index), format="%Y-%m-%d"))

        frames = {'daily': daily}
        for label, offset in periods:
            # Same bins as resample(rule, how='sum'), using an API that exists
            # in both old and current pandas.
            frames[label] = daily.groupby(pd.Grouper(freq=offset)).sum().fillna(0)

        # The daily file is the cache sentinel tested above, so it is written
        # last: an interrupted run can no longer leave a half-filled cache.
        for label in ('weekly', 'monthly', 'yearly', 'daily'):
            frames[label].to_csv(csv_names[label], encoding="utf-8")
            series[label][page_name] = frames[label]

    return (series['daily'], series['weekly'], series['monthly'], series['yearly'])


## ## ## ## ## ## ## ## ## ##


def get_pages_revisions_time_series(dataset_dir_name, list_of_page_names):
    """Revision-count time series over all revisions of each page.

    Page data comes from gather_pages_data(); the series are computed (or
    read from the cache) by get_pages_revisions_time_series_gen() under the
    cache tag 'revisions'.

    Returns:
        (daily, weekly, monthly, yearly) dicts keyed by page name.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)

    # .items() instead of the Python 2-only .iteritems().
    data = dict((title, page['revisions']) for title, page in pages_data.items())

    return get_pages_revisions_time_series_gen(data, 'revisions', time_series_dir_name)

def get_pages_ip_revisions_time_series(dataset_dir_name, list_of_page_names):
    """Revision-count time series restricted to revisions whose 'userid' is 0.

    The function name suggests these are the edits made without an account
    (identified by IP address). The series are computed (or read from the
    cache) under the cache tag 'revisions.ip'.

    Returns:
        (daily, weekly, monthly, yearly) dicts keyed by page name.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)

    data = {}
    # .items() instead of the Python 2-only .iteritems().
    for title, page in pages_data.items():
        data[title] = [
            revision for revision in page['revisions']
            if 'userid' in revision and revision['userid'] == 0
        ]
    return get_pages_revisions_time_series_gen(data, 'revisions.ip', time_series_dir_name)

def get_pages_bot_revisions_time_series(dataset_dir_name, list_of_page_names):
    """Revision-count time series restricted to revisions made by bots.

    A revision counts as a bot edit when its user name contains 'bot'
    (case-insensitive). This is a name-based heuristic: any account whose
    name merely contains those letters is included as well.
    The series are computed (or read from the cache) under the cache tag
    'revisions.bot'.

    Returns:
        (daily, weekly, monthly, yearly) dicts keyed by page name.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)

    data = {}
    # .items() instead of the Python 2-only .iteritems().
    for title, page in pages_data.items():
        data[title] = [
            revision for revision in page['revisions']
            if 'user' in revision and 'bot' in revision['user'].lower()
        ]

    return get_pages_revisions_time_series_gen(data, 'revisions.bot', time_series_dir_name)

def get_pages_members_revisions_time_series(dataset_dir_name, list_of_page_names):
    """Revision-count time series restricted to registered, non-bot users.

    A revision is kept when it has a user name that does not contain 'bot'
    (case-insensitive) and a non-zero 'userid'. Revisions that carry a user
    name but no 'userid' are excluded; they used to raise KeyError.
    The series are computed (or read from the cache) under the cache tag
    'revisions.members'.

    Returns:
        (daily, weekly, monthly, yearly) dicts keyed by page name.
    """
    time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
    if not os.path.exists(time_series_dir_name):
        os.makedirs(time_series_dir_name)

    pages_data = gather_pages_data(dataset_dir_name, list_of_page_names)

    data = {}
    # .items() instead of the Python 2-only .iteritems().
    for title, page in pages_data.items():
        data[title] = [
            revision for revision in page['revisions']
            if 'user' in revision
            and revision.get('userid', 0) != 0
            and 'bot' not in revision['user'].lower()
        ]

    return get_pages_revisions_time_series_gen(data, 'revisions.members', time_series_dir_name)
    

## Gather and compute data

In [None]:
def gather_and_compute_data(dataset_dir_name, list_of_page_names):
    """Fill every cache for the given pages in one call.

    Runs, in order: page data, page views, page-view time series, then the
    revision time series for members, bots, IPs and all revisions.

    Returns:
        The list returned by gather_pages_views(), i.e. the pages whose page
        views could not be gathered. This list used to be thrown away, which
        made those failures invisible to the caller.
    """
    gather_pages_data(dataset_dir_name, list_of_page_names)
    failed_page_views = gather_pages_views(dataset_dir_name, list_of_page_names)
    get_pages_views_time_series(dataset_dir_name, list_of_page_names)
    get_pages_members_revisions_time_series(dataset_dir_name, list_of_page_names)
    get_pages_bot_revisions_time_series(dataset_dir_name, list_of_page_names)
    get_pages_ip_revisions_time_series(dataset_dir_name, list_of_page_names)
    get_pages_revisions_time_series(dataset_dir_name, list_of_page_names)
    return failed_page_views
    

## Statistics computation

# Report for evolution of revisions and pageviews for a set of pages

In [None]:
from IPython.display import display, HTML

def compute_sum_ts_weekly(dataset_dir_name, list_of_page_names, function, col_name):
    """Sum the weekly time series of several pages into a single series.

    Args:
        dataset_dir_name: dataset directory, passed through to ``function``.
        list_of_page_names: pages whose weekly series are added up.
        function: callable ``(dataset_dir_name, [title])`` returning a
            4-tuple whose second element maps page name -> weekly DataFrame
            (the get_pages_*_time_series functions have this shape).
        col_name: name of the value column, e.g. 'revisions' or 'page_views'.

    Pages for which ``function`` returns no weekly series are ignored. A
    page that raises is reported and skipped, and the sum carries on with
    the remaining pages. The previous handler called ``function`` a second
    time and returned early, silently dropping every page after the failure.

    Returns:
        DataFrame with the single column ``col_name`` indexed by week; weeks
        missing for a page count as 0. Empty DataFrame when no page had data.
    """
    total = pd.DataFrame()
    for title in list_of_page_names:
        try:
            weekly_by_title = function(dataset_dir_name, [title])[1]
            if title not in weekly_by_title:
                continue
            page_series = weekly_by_title[title][[col_name]]
            if len(total.columns) == 0:
                total = page_series.copy()
            else:
                # Align on the union of both indexes; a week missing on one
                # side contributes 0.
                total = total.add(page_series, fill_value=0).fillna(0)
        except Exception as error:
            # Single-argument print() behaves the same on Python 2 and 3.
            print('Skipping %s in the weekly sum: %s' % (title, error))
    return total

        
        
def get_report_set_of_pages(dataset_dir_name,list_of_page_names,ip=True,bot=True,member=True):
    """Display page views against revisions for a set of pages, one plot per editor category.

    Page views are summed over all pages with `compute_sum_ts_weekly`.  For
    each enabled category (`ip`, `member`, `bot`, shown in that order) the
    revisions are summed the same way, outer-joined with the page views (gaps
    filled with 0) and drawn with revisions on the primary y axis and page
    views on a secondary y axis.

    Output goes to the notebook through `display` and `plt.show`; nothing is
    returned.
    """
    display(HTML("<h2>Evolution of pageviews and revisions</h2>" ))

    df_pageviews = compute_sum_ts_weekly(dataset_dir_name,list_of_page_names,get_pages_views_time_series,'page_views')

    def plot_section(label, revisions_function):
        # One heading plus one two-axis figure for a single editor category.
        display(HTML("<h3>Evolution of pageviews and revisions by %s</h3>" % label))
        df_section = compute_sum_ts_weekly(dataset_dir_name,list_of_page_names,revisions_function,'revisions').join(df_pageviews,how='outer').fillna(0)
        # linewidth is a number, not the string "0.5".
        df_section['revisions'].plot(figsize=(11,4), linewidth=0.5, ylim=0, colormap="Spectral", rot=0)
        df_section['page_views'].plot(secondary_y=True, style="-", linewidth=0.5, ylim=0,sharex=True)
        plt.show()

    if ip:
        plot_section('ip', get_pages_ip_revisions_time_series)

    if member:
        plot_section('members', get_pages_members_revisions_time_series)

    if bot:
        plot_section('bot', get_pages_bot_revisions_time_series)

    


In [None]:
def compute_sum_ts_monthly(dataset_dir_name,list_of_page_names,function,col_name):
    """Sum one time series over several pages into a single DataFrame.

    `function` is called once per page as ``function(dataset_dir_name, [title])``
    and must return a 4-tuple.  Only its third element is used: a dict mapping
    page title to a DataFrame that holds a `col_name` column.  (Going by this
    function's name that element is presumably the monthly series -- confirm
    against the time-series functions.)  Titles missing from the dict are
    skipped.

    `dataset_dir_name` is only forwarded to `function`; it is not interpreted
    here.  Frames are aligned on their index with an outer join and gaps are
    filled with 0 before adding, so a period that a page does not cover counts
    as 0.  Errors raised by `function` or by the merge propagate to the caller.

    Returns the summed DataFrame.  It is empty when no page contributed; with a
    single contributing page it is simply a copy of that page's frame; from the
    second page on it has the single column `col_name`.
    """
    total = pd.DataFrame()

    for title in list_of_page_names:
        _first, _second, dic_ts, _fourth = function(dataset_dir_name, [title])
        if title not in dic_ts:
            continue
        page = dic_ts[title].copy()
        if len(total.columns) == 0:
            # First contributing page: nothing to add it to yet.
            total = page
            continue
        # Rename the page's column so the join cannot collide with the
        # running total's `col_name` column.
        page.columns = [name.replace(col_name, 'tmp') for name in page.columns]
        merged = total.join(page, how='outer').fillna(0)
        merged['sum'] = merged[col_name] + merged['tmp']
        merged = merged.drop([col_name, 'tmp'], axis=1)
        merged.columns = [col_name]
        total = merged
    return total

        
        
def get_monthly_report_set_of_pages(dataset_dir_name,list_of_page_names,ip=True,bot=True,member=True):
    """Display page views against revisions for a set of pages, one plot per editor category.

    Page views are summed over all pages with `compute_sum_ts_monthly`.  For
    each enabled category (`ip`, `member`, `bot`, shown in that order) the
    revisions are summed the same way, outer-joined with the page views (gaps
    filled with 0) and drawn with revisions on the primary y axis and page
    views on a secondary y axis.

    Output goes to the notebook through `display` and `plt.show`; nothing is
    returned.
    """
    display(HTML("<h2>Evolution of pageviews and revisions</h2>" ))

    df_pageviews = compute_sum_ts_monthly(dataset_dir_name,list_of_page_names,get_pages_views_time_series,'page_views')

    def plot_section(label, revisions_function):
        # One heading plus one two-axis figure for a single editor category.
        display(HTML("<h3>Evolution of pageviews and revisions by %s</h3>" % label))
        df_section = compute_sum_ts_monthly(dataset_dir_name,list_of_page_names,revisions_function,'revisions').join(df_pageviews,how='outer').fillna(0)
        # linewidth is a number, not the string "0.5".
        df_section['revisions'].plot(figsize=(11,4), linewidth=0.5, ylim=0, colormap="Spectral", rot=0)
        df_section['page_views'].plot(secondary_y=True, style="-", linewidth=0.5, ylim=0,sharex=True)
        plt.show()

    if ip:
        plot_section('ip', get_pages_ip_revisions_time_series)

    if member:
        plot_section('members', get_pages_members_revisions_time_series)

    if bot:
        plot_section('bot', get_pages_bot_revisions_time_series)


In [None]:
def compute_sum_ts_yearly(dataset_dir_name,list_of_page_names,function,col_name):
    """Sum one time series over several pages into a single DataFrame.

    `function` is called once per page as ``function(dataset_dir_name, [title])``
    and must return a 4-tuple.  Only its fourth element is used: a dict mapping
    page title to a DataFrame that holds a `col_name` column.  (Going by this
    function's name that element is presumably the yearly series -- confirm
    against the time-series functions.)  Titles missing from the dict are
    skipped.

    `dataset_dir_name` is only forwarded to `function`; it is not interpreted
    here.  Frames are aligned on their index with an outer join and gaps are
    filled with 0 before adding, so a period that a page does not cover counts
    as 0.  Errors raised by `function` or by the merge propagate to the caller.

    Returns the summed DataFrame.  It is empty when no page contributed; with a
    single contributing page it is simply a copy of that page's frame; from the
    second page on it has the single column `col_name`.
    """
    total = pd.DataFrame()

    for title in list_of_page_names:
        _first, _second, _third, dic_ts = function(dataset_dir_name, [title])
        if title not in dic_ts:
            continue
        page = dic_ts[title].copy()
        if len(total.columns) == 0:
            # First contributing page: nothing to add it to yet.
            total = page
            continue
        # Rename the page's column so the join cannot collide with the
        # running total's `col_name` column.
        page.columns = [name.replace(col_name, 'tmp') for name in page.columns]
        merged = total.join(page, how='outer').fillna(0)
        merged['sum'] = merged[col_name] + merged['tmp']
        merged = merged.drop([col_name, 'tmp'], axis=1)
        merged.columns = [col_name]
        total = merged
    return total

        
        
def get_yearly_report_set_of_pages(dataset_dir_name,list_of_page_names,ip=True,bot=True,member=True):
    """Display page views against revisions for a set of pages, one plot per editor category.

    Page views are summed over all pages with `compute_sum_ts_yearly`.  For
    each enabled category (`ip`, `member`, `bot`, shown in that order) the
    revisions are summed the same way, outer-joined with the page views (gaps
    filled with 0) and drawn with revisions on the primary y axis and page
    views on a secondary y axis.

    Output goes to the notebook through `display` and `plt.show`; nothing is
    returned.
    """
    display(HTML("<h2>Evolution of pageviews and revisions</h2>" ))

    df_pageviews = compute_sum_ts_yearly(dataset_dir_name,list_of_page_names,get_pages_views_time_series,'page_views')

    def plot_section(label, revisions_function):
        # One heading plus one two-axis figure for a single editor category.
        display(HTML("<h3>Evolution of pageviews and revisions by %s</h3>" % label))
        df_section = compute_sum_ts_yearly(dataset_dir_name,list_of_page_names,revisions_function,'revisions').join(df_pageviews,how='outer').fillna(0)
        # linewidth is a number, not the string "0.5".
        df_section['revisions'].plot(figsize=(11,4), linewidth=0.5, ylim=0, colormap="Spectral", rot=0)
        df_section['page_views'].plot(secondary_y=True, style="-", linewidth=0.5, ylim=0,sharex=True)
        plt.show()

    if ip:
        plot_section('ip', get_pages_ip_revisions_time_series)

    if member:
        plot_section('members', get_pages_members_revisions_time_series)

    if bot:
        plot_section('bot', get_pages_bot_revisions_time_series)
