#Data Preparation

In [1]:
# -*- coding: utf-8 -*-
%matplotlib inline
%config InlineBackend.figure_formats=['svg']

import json
import codecs
import os
import math

import numpy as np
import seaborn as sns
import datetime

import pandas as pd
import datetime as dt

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import hashlib

from bs4 import BeautifulSoup

from collections import Counter
import networkx as nx
from networkx.algorithms import bipartite 

from wekeypedia.wikipedia.page import WikipediaPage as Page

#Dataset directory

To choose the dataset, set the variable `dataset_dir_name` to the dataset directory. The directory must contain a file named 'pagenames' with one page title per line.

In [2]:
# Root directory of the dataset; the 'pagenames' file is read from it.
dataset_dir_name = 'listgeometry'
# Output switches read by the cells below:
#   print_v -> announce the name of the variable a cell has filled in
#   print_t -> print counts and progress messages
print_v = print_t = True

In [3]:
file_page_names = "%s/pagenames" % dataset_dir_name
list_of_page_names = [x.strip() for x in codecs.open(file_page_names,"r","utf-8").readlines()]

if (print_t): print 'Number of pages in file:',len(list_of_page_names) 

Number of pages in file: 299


#Basic statistics computation
##Data gathering

The following functions can be used to download and store the data in files.

In [None]:
def get_path_to_file(name):
    """Return the cache location derived from a page title.

    The title is UTF-8 encoded and hashed with MD5. Each of the first four
    hexadecimal digits of the digest names one directory level.

    Returns a tuple ``(path, digest)``: ``path`` looks like ``'/9/0/0/1/'``
    and ``digest`` is the complete hexadecimal MD5 digest.
    """
    digest = hashlib.md5(name.encode('utf-8')).hexdigest()
    nested_dirs = '/'.join(digest[:4])
    return '/' + nested_dirs + '/', digest

def load_pages_data(dataset_dir_name, list_of_page_names):
    """Load the page descriptions that are already stored on disk.

    Each page is looked up in ``<dataset_dir_name>/data/pages/`` under the
    hashed path given by get_path_to_file. Pages without a JSON file are
    left out of the result; nothing is downloaded.

    As a side effect the pages directory and the per-page hash directories
    are created when missing.

    Returns a dictionary mapping page name to the decoded JSON content.
    """
    pages_root = '%s/data/pages/' % dataset_dir_name
    if not os.path.exists(pages_root):
        os.makedirs(pages_root)

    loaded = {}
    for title in list_of_page_names:
        sub_path, digest = get_path_to_file(title)
        page_dir = '%s/%s' % (pages_root, sub_path)
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        json_path = '%s/%s/%s.json' % (pages_root, sub_path, digest)
        if not os.path.exists(json_path):
            # No stored file for this page: skip it.
            continue
        with open(json_path) as json_file:
            loaded[title] = json.load(json_file)
    return loaded


def gather_pages_data(dataset_dir_name, list_of_page_names):
    """Load the page descriptions, downloading the ones not stored yet.

    For each page the JSON file under ``<dataset_dir_name>/data/pages/``
    (hashed path from get_path_to_file) is read when it exists. Otherwise
    the page information, its revisions, links and categories are fetched
    through ``Page`` and written to that file.

    Errors raised while fetching the revisions/links/categories or while
    writing the file are printed and the loop moves on to the next page.

    Returns a dictionary mapping page name to its description.
    """
    data_pages_dir_name = '%s/data/pages/' % dataset_dir_name
    if not(os.path.exists(data_pages_dir_name)): os.makedirs(data_pages_dir_name)

    pages_data = {}
    for page_name in list_of_page_names:
        path,hashmd5 = get_path_to_file(page_name)
        page_dir_name = '%s/%s' % (data_pages_dir_name, path)
        if not(os.path.exists(page_dir_name)):
            os.makedirs(page_dir_name)
        file_name = '%s/%s/%s.json'%(data_pages_dir_name, path, hashmd5)
        if (os.path.exists(file_name)):
            with open(file_name) as f:
                pages_data[page_name] = json.load(f)
            continue

        wikipage = Page(title=page_name)
        request = wikipage.fetch_info(page_name)['query']['pages']
        page_id = list(request)[0]
        if page_id=='-1':
            # presumably the id reported for a page that does not exist -- TODO confirm
            continue
        try:
            data = dict(request[page_id])
            data['revisions']=wikipage.get_revisions_list()
            data['links']= wikipage.get_links()
            data['categories']= wikipage.get_categories()
            pages_data[page_name]=data
            # Serialise before the file is created: a failure in json.dumps must
            # not leave an empty file behind, since an existing file is trusted
            # (and json-decoded) on the next run.
            serialized = json.dumps(data)
            with open(file_name,'w') as f:
                f.write(serialized)
        except Exception as e:
            print 'Error with page:',page_name
            print e
    return(pages_data)

## ## ## ## ## ## ## ## ## ##

data_pages_dir_name='%s/data/pages/'% (dataset_dir_name)
if not(os.path.exists(data_pages_dir_name)): os.mkdir(data_pages_dir_name)

pages_data = load_pages_data(list_of_page_names, data_pages_dir_name)
if (print_t): print 'Number of pages load:',len(pages_data)

if len (list_of_page_names) != len(pages_data.keys()):
    if (print_t): print 'Failure to load:'
    for fail_page in list_of_page_names:
        if fail_page not in pages_data.keys():
            if (print_t): print '  >',fail_page
    if (print_t): print 'Updating list of page names without them'
    list_of_page_names = pages_data.keys()

if (print_v): print '> Pages load in: pages_data'
    
#talk_pages_data=load_pages_data([''.join(['Talk:',x]) for x in list_of_page_names],data_pages_dir_name)  
#print 'Number of talk pages load:',len(talk_pages_data)
    

##Statistics computation

In [36]:
def stat_computation(pages_data):
    """Compute one row of basic statistics per page.

    ``pages_data`` maps a page name to a dictionary holding at least the keys
    'pageid', 'length', 'ns', 'revisions' and 'links'. Each revision is a
    dictionary with a 'timestamp' and, when available, 'user' and 'userid'.

    Columns of the returned DataFrame (indexed by page name, float values):
      Page_id, Length, Namespace -- copied from 'pageid', 'length', 'ns'
      Nb_revisions       -- number of revisions
      Nb_revisions_IP    -- revisions whose 'userid' is 0
      Nb_revisions_Bot   -- revisions whose user name contains 'bot'
      Nb_revisions_wiki  -- revisions with a non-zero 'userid' and no 'bot' in the name
      Nb_editors, Nb_editors_IP, Nb_editors_Bot, Nb_editors_wiki
                         -- number of distinct user names in the groups above
      Links              -- number of entries in 'links'
      Date               -- days between 2001-01-15 and the earliest revision
                            (NaN when the page has no revision)

    Bot detection is case-insensitive for revisions and editors alike.
    """
    # Reference date used by the 'Date' column.
    wikipedia_start = datetime.datetime(2001, 1, 15)

    def is_bot(revision):
        # lower(): names such as 'SmackBot' must match as well.
        return 'user' in revision and 'bot' in revision['user'].lower()

    def is_ip(revision):
        return 'userid' in revision and revision['userid'] == 0

    def is_member(revision):
        # Both keys are checked so that a partial revision record cannot raise KeyError.
        return ('user' in revision and 'userid' in revision
                and revision['userid'] != 0 and not is_bot(revision))

    def nb_editors(revisions):
        return len(set(revision['user'] for revision in revisions if 'user' in revision))

    def numberOfDaysAfter(date):
        return (datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") - wikipedia_start).days

    columns = ['Page_id', 'Length', 'Namespace',
               'Nb_revisions', 'Nb_revisions_IP', 'Nb_revisions_Bot', 'Nb_revisions_wiki',
               'Nb_editors', 'Nb_editors_IP', 'Nb_editors_Bot', 'Nb_editors_wiki',
               'Links', 'Date']

    page_names = list(pages_data.keys())
    rows = []
    for page_name in page_names:
        page = pages_data[page_name]
        revisions = page['revisions']
        revisions_ip = [revision for revision in revisions if is_ip(revision)]
        revisions_bot = [revision for revision in revisions if is_bot(revision)]
        revisions_wiki = [revision for revision in revisions if is_member(revision)]

        if len(revisions) > 0:
            first_day = min(numberOfDaysAfter(revision['timestamp']) for revision in revisions)
        else:
            first_day = float('nan')

        rows.append([page['pageid'], page['length'], page['ns'],
                     len(revisions), len(revisions_ip), len(revisions_bot), len(revisions_wiki),
                     nb_editors(revisions), nb_editors(revisions_ip),
                     nb_editors(revisions_bot), nb_editors(revisions_wiki),
                     len(page['links']), first_day])

    # Built in one go; dtype=float keeps the column type produced by a
    # cell-by-cell fill of an initially empty frame.
    return pd.DataFrame(rows, index=page_names, columns=columns, dtype=float)

## ## ## ## ## ## ## ## ## ##

stats_dir_name = "%s/stats/" % dataset_dir_name
if not(os.path.exists(stats_dir_name)): os.mkdir(stats_dir_name)

basic_stats_file_name = '%s/basic_stats.csv' % stats_dir_name

df_basic_stats = pd.DataFrame()

if ( os.path.exists(basic_stats_file_name) ):
    df_basic_stats = df_basic_stats.from_csv(basic_stats_file_name, encoding="utf-8")
else:
    df_basic_stats = stat_computation(pages_data)
    df_basic_stats.to_csv(basic_stats_file_name, encoding="utf-8")

if (print_v): print '> Dataframe of basic stat: df_basic_stats'
df_basic_stats.head(5)

> Dataframe of basic stat: df_basic_stats


Unnamed: 0,Page_id,Length,Namespace,Nb_revisions,Nb_revisions_IP,Nb_revisions_Bot,Nb_revisions_wiki,Nb_editors,Nb_editors_IP,Nb_editors_Bot,Nb_editors_wiki,Links,Date
Digital geometry,386413,7211,0,116,51,11,54,63,16,7,40,46,1052
Synthetic geometry,267484,11870,0,129,14,11,104,60,11,5,44,98,910
Triangle inequality,53941,25011,0,395,122,28,245,199,88,12,99,87,498
Deltahedron,493995,13811,0,197,18,19,160,69,13,5,51,70,1139
Isoperimetric inequality,326182,19249,0,176,28,27,121,94,21,12,61,105,982


# Statistics on the content of the last revision

## Gathering last revision content

In [37]:
def load_pages_last_revision_text(list_of_page_names, text_dir_name):
    """Return the text of the current revision of each page.

    The text is read from ``<text_dir_name>/<page_name>.html`` when that file
    exists. Otherwise it is obtained with ``Page(title=...).get_current()``
    and written to that file, UTF-8 encoded.

    Returns a dictionary mapping page name to its text.
    """
    last_revision_text = {}
    
    for page_name in list_of_page_names:
        # NOTE(review): the title is used verbatim as file name; a title containing
        # '/' points into a sub-directory that this function does not create.
        file_name = '%s/%s.html' % (text_dir_name,page_name)
        if (os.path.exists(file_name)):
            with codecs.open(file_name,"r", "utf-8-sig") as f:
                last_revision_text[page_name] = f.read()
        else:
            wikipage = Page(title=page_name)
            last_revision_text[page_name] = wikipage.get_current()
            # Encode before the file is created: if encoding fails, no empty file
            # is left behind to be picked up as stored text on the next run.
            encoded_text = last_revision_text[page_name].encode('utf-8')
            with open(file_name,'w') as f:
                f.write(encoded_text)
                
    return(last_revision_text)

## ## ## ## ## ## ## ## ## ##

text_dir_name = '%s/data/text/' % dataset_dir_name
if not(os.path.exists(text_dir_name)): os.mkdir(text_dir_name)

pages_last_revision_text = load_pages_last_revision_text(list_of_page_names, text_dir_name)

if (print_t): print 'Text of the last revision of ', len(pages_data),'pages load.'
if (print_v): print '> Text of pages store in: pages_last_revision_text'

> Text of pages store in: pages_last_revision_text


## Computation of statistics on content

In [38]:
def basic_word_analysis(pages_text_last_revision):
    """Compute word statistics on the text of each page.

    ``pages_text_last_revision`` maps a page name to its markup; the markup is
    reduced to plain text with BeautifulSoup before the words are counted.

    Returns a DataFrame indexed by page name with the float columns
    'Nb_words' and 'Average_word_length' (NaN for a page without any word).
    """
    page_names = list(pages_text_last_revision.keys())
    rows = []

    for page_name in page_names:
        text = BeautifulSoup( pages_text_last_revision[page_name] ).text
        # split() without argument cuts on any run of whitespace, so newlines and
        # tabs separate words and repeated spaces do not yield empty "words".
        words = text.split()
        nb_words = len(words)
        if nb_words > 0:
            average_word_length = sum([len(x) for x in words])/float(nb_words)
        else:
            average_word_length = float('nan')
        rows.append([nb_words, average_word_length])

    return pd.DataFrame(rows, index=page_names,
                        columns=['Nb_words', 'Average_word_length'], dtype=float)

## ## ## ## ## ## ## ## ## ##

words_stats_file_name = '%s/words_stats.csv' % stats_dir_name

df_word_stats = pd.DataFrame()

if os.path.exists(words_stats_file_name):
    df_word_stats = df_word_stats.from_csv(words_stats_file_name, encoding="utf-8")
else:
    df_word_stats = basic_word_analysis(pages_last_revision_text)
    df_word_stats.to_csv(words_stats_file_name, encoding="utf-8")

df_word_stats.head(5)

if (print_v): print '> Dataframe of word basic stat: df_word_stats'


> Dataframe of word basic stat: df_word_stats


#Pageviews time series

##Gather pages views

In [39]:
def gather_pages_views(list_of_page_names, pages_views_dir_name):
    """Download the page views of the pages that have no stored file yet.

    For each page without a JSON file under ``pages_views_dir_name`` (hashed
    path from get_path_to_file), the dictionaries returned by
    ``Page(title=...).get_pageviews()`` are merged into a single dictionary
    and written to that file.

    Returns the list of page names for which the download or the write failed.
    """
    error = []
    for page_name in list_of_page_names:
        path,hashmd5 = get_path_to_file(page_name)
        dir_name = '%s/%s' % (pages_views_dir_name, path)
        file_name = '%s/%s/%s.json' % (pages_views_dir_name,path, hashmd5)
        if not (os.path.exists(file_name)):            
            try:
                # The hash directories must exist before the file can be created.
                if not(os.path.exists(dir_name)): os.makedirs(dir_name)
                wikipage = Page(title=page_name)
                page_views_ts = {k:v for x in wikipage.get_pageviews() for (k,v) in x.items()}
                # Serialise first so that a failure does not leave an empty file.
                serialized = json.dumps(page_views_ts)
                with open(file_name,'w') as f:
                    f.write(serialized)
            except Exception:
                # Exception (not a bare except) so that an interrupt still stops the loop.
                error.append(page_name)
    return(error)

## ## ## ## ## ## ## ## ## ##

pages_views_dir_name = '%s/data/pagesviews/' % dataset_dir_name
if not(os.path.exists(pages_views_dir_name)): os.mkdir(pages_views_dir_name)

error = gather_pages_views(list_of_page_names, pages_views_dir_name)
while len(error)>0:
    error = gather_pages_views(error, pages_views_dir_name)

#print '!'

##Compute pages views time series

In [40]:
def get_pages_views_time_series(list_of_page_names, pages_views_dir_name, time_series_dir_name):
    """Build the daily, weekly and monthly page views series of each page.

    The page views JSON file of a page (hashed path under
    ``pages_views_dir_name``) maps a 'YYYY-MM-DD' key to a value; keys that
    do not parse with that format are ignored. The three series are stored
    as CSV files under ``time_series_dir_name`` and read back from there when
    all three exist. Pages without a JSON file, or without any valid entry,
    are left out.

    Returns three dictionaries (daily, weekly, monthly) mapping page name to
    a DataFrame with a single 'page_views' column.
    """
    pages_views_daily_ts={}
    pages_views_weekly_ts={}       
    pages_views_monthly_ts={}
    for page_name in list_of_page_names:
        path,hashmd5 = get_path_to_file(page_name)
        file_name = '%s/%s/%s.json' % (pages_views_dir_name,path, hashmd5)
        if not(os.path.exists( '%s/%s' % (time_series_dir_name,path) )):
            os.makedirs( '%s/%s' % (time_series_dir_name,path) )
        file_name_daily = '%s/%s/%s.pageviews.daily.csv' % (time_series_dir_name,path,page_name)
        file_name_weekly = '%s/%s/%s.pageviews.weekly.csv' % (time_series_dir_name,path,page_name)
        file_name_monthly = '%s/%s/%s.pageviews.monthly.csv' % (time_series_dir_name,path,page_name)
        if not os.path.exists(file_name):
            continue

        csv_file_names = (file_name_daily, file_name_weekly, file_name_monthly)
        # Read the stored series only when the three files are there; otherwise
        # rebuild all of them.
        if all(os.path.exists(csv_file_name) for csv_file_name in csv_file_names):
            pages_views_daily_ts[page_name] = pd.DataFrame().from_csv(file_name_daily)
            pages_views_weekly_ts[page_name] = pd.DataFrame().from_csv(file_name_weekly)
            pages_views_monthly_ts[page_name] = pd.DataFrame().from_csv(file_name_monthly)
        else:
            with open(file_name) as f:
                data = json.load(f).items()
            index = []
            series = []
            for k,v in data:
                try:
                    index.append(pd.to_datetime(k, format="%Y-%m-%d"))
                    series.append(v)
                except ValueError:
                    continue
            if len(index)==0:
                # Nothing to resample for this page.
                continue
            # A JSON object carries no ordering guarantee: sort chronologically.
            df = pd.DataFrame(series,index=index,columns=['page_views']).sort_index()
            pages_views_daily_ts[page_name]=df
            pages_views_daily_ts[page_name].to_csv(file_name_daily,encoding="utf-8")
            pages_views_weekly_ts[page_name] = df.resample('W-MON', how='sum')
            pages_views_weekly_ts[page_name].to_csv(file_name_weekly,encoding="utf-8")
            pages_views_monthly_ts[page_name] = df.resample('M', how='sum')
            pages_views_monthly_ts[page_name].to_csv(file_name_monthly,encoding="utf-8")
    return(pages_views_daily_ts,pages_views_weekly_ts,pages_views_monthly_ts)

## ## ## ## ## ## ## ## ## ##
    
time_series_dir_name = '%s/stats/time series/' % dataset_dir_name
if not(os.path.exists(time_series_dir_name)): os.mkdir(time_series_dir_name)
    
pages_views_daily_ts,pages_views_weekly_ts,pages_views_monthly_ts = get_pages_views_time_series(list_of_page_names, pages_views_dir_name, time_series_dir_name)

if (print_t): print 'Number of daily pageviews time series:',len(pages_views_daily_ts)
if (print_v): print '> Dic of dataframes of daily pageviews time series: pages_views_daily_ts'

if (print_t): print 'Number of weekly pageviews time series:',len(pages_views_weekly_ts)
if (print_v): print '> Dic of dataframes of weekly pageviews time series: pages_views_weekly_ts'

if (print_t): print 'Number of monthly pageviews time series:',len(pages_views_monthly_ts)
if (print_v): print '> Dic of dataframes of monthly pageviews time series: pages_views_monthly_ts'


> Dic of dataframes of daily pageviews time series: pages_views_daily_ts
> Dic of dataframes of weekly pageviews time series: pages_views_weekly_ts
> Dic of dataframes of monthly pageviews time series: pages_views_monthly_ts


#Revisions time series

In [41]:
def get_pages_revisions_time_series_gen(pages_revisions,suffixe,time_series_dir_name):
    """Build daily, weekly and monthly series of the number of revisions.

    ``pages_revisions`` maps a page name to a list of revision dictionaries,
    each holding a 'timestamp'. ``suffixe`` is part of the CSV file names, so
    that different selections of revisions are stored separately. The series
    are stored under ``time_series_dir_name`` (hashed path from
    get_path_to_file) and read back from there when the three files exist.
    Pages without any revision are left out.

    Returns three dictionaries (daily, weekly, monthly) mapping page name to
    a DataFrame with a single 'revisions' column.
    """
    revisions_daily_ts={}
    revisions_weekly_ts={}       
    revisions_monthly_ts={}
    for page_name in pages_revisions.keys():
        # Only the hashed path is needed here; the function reads no global
        # directory name.
        path,hashmd5 = get_path_to_file(page_name)
        if not(os.path.exists( '%s/%s' % (time_series_dir_name,path) )):
            os.makedirs( '%s/%s' % (time_series_dir_name,path) )
        file_name_daily = '%s/%s/%s.%s.daily.csv'%(time_series_dir_name,path,suffixe,page_name)
        file_name_weekly = '%s/%s/%s.%s.weekly.csv'%(time_series_dir_name,path,suffixe,page_name)
        file_name_monthly = '%s/%s/%s.%s.monthly.csv'%(time_series_dir_name,path,suffixe,page_name)
        csv_file_names = (file_name_daily, file_name_weekly, file_name_monthly)
        # Read the stored series only when the three files are there.
        if all(os.path.exists(csv_file_name) for csv_file_name in csv_file_names):
            revisions_daily_ts[page_name] = pd.DataFrame().from_csv(file_name_daily)
            revisions_weekly_ts[page_name] = pd.DataFrame().from_csv(file_name_weekly)
            revisions_monthly_ts[page_name] = pd.DataFrame().from_csv(file_name_monthly)
        else:
            revisions=pages_revisions[page_name]
            
            if len(revisions)==0:
                #print 'No revisions for %s with suffixe: %s' % (page_name,suffixe)
                continue
            
            df = pd.DataFrame(revisions)
            df["datetime"] = df.timestamp.apply(pd.to_datetime)
            df["day"] = df["datetime"].apply(lambda stamp: stamp.strftime('%Y-%m-%d'))
            # size() counts the rows of each day without relying on any other
            # field (such as 'size') being present in the revisions.
            counts = df.groupby("day").size()
            index = pd.to_datetime(counts.index, format="%Y-%m-%d")
            df = pd.DataFrame(counts.tolist(),columns=['revisions'],index=index)
            
            revisions_daily_ts[page_name]=df
            revisions_daily_ts[page_name]=revisions_daily_ts[page_name].fillna(0)
            revisions_daily_ts[page_name].to_csv(file_name_daily,encoding="utf-8")
            
            # Periods without any revision come out of the resampling as NaN.
            revisions_weekly_ts[page_name]=df.resample('W-MON', how='sum')
            revisions_weekly_ts[page_name]=revisions_weekly_ts[page_name].fillna(0)           
            revisions_weekly_ts[page_name].to_csv(file_name_weekly,encoding="utf-8")

            revisions_monthly_ts[page_name]=df.resample('M', how='sum')
            revisions_monthly_ts[page_name]=revisions_monthly_ts[page_name].fillna(0)
            revisions_monthly_ts[page_name].to_csv(file_name_monthly,encoding="utf-8")
    
    return(revisions_daily_ts,revisions_weekly_ts,revisions_monthly_ts)


## ## ## ## ## ## ## ## ## ##


def get_pages_revisions_time_series(pages_data,time_series_dir_name):
    """Build the revisions time series counting every revision of each page.

    Returns the (daily, weekly, monthly) dictionaries produced by
    get_pages_revisions_time_series_gen with the 'revisions' suffix.
    """
    revisions_by_page = dict((title, page['revisions'])
                             for title, page in pages_data.iteritems())
    return get_pages_revisions_time_series_gen(revisions_by_page,'revisions',time_series_dir_name)

def get_pages_ip_revisions_time_series(pages_data,time_series_dir_name):
    """Build the revisions time series restricted to revisions whose 'userid' is 0.

    Returns the (daily, weekly, monthly) dictionaries produced by
    get_pages_revisions_time_series_gen with the 'revisions.ip' suffix.
    """
    def made_by_ip(revision):
        return 'userid' in revision and revision['userid']==0

    revisions_by_page = dict((title, filter(made_by_ip, page['revisions']))
                             for title, page in pages_data.iteritems())
    return get_pages_revisions_time_series_gen(revisions_by_page,'revisions.ip',time_series_dir_name)

def get_pages_bot_revisions_time_series(pages_data,time_series_dir_name):
    """Build the revisions time series restricted to user names containing 'bot'.

    The match on the user name is case-insensitive. Returns the (daily,
    weekly, monthly) dictionaries produced by
    get_pages_revisions_time_series_gen with the 'revisions.bot' suffix.
    """
    def made_by_bot(revision):
        return 'user' in revision and 'bot' in revision['user'].lower()

    revisions_by_page = dict((title, filter(made_by_bot, page['revisions']))
                             for title, page in pages_data.iteritems())
    return get_pages_revisions_time_series_gen(revisions_by_page,'revisions.bot',time_series_dir_name)

def get_pages_members_revisions_time_series(pages_data,time_series_dir_name):
    """Build the revisions time series restricted to registered, non-bot users.

    A revision is kept when it has a 'user', a non-zero 'userid' and no 'bot'
    (case-insensitive) in the user name. Returns the (daily, weekly, monthly)
    dictionaries produced by get_pages_revisions_time_series_gen with the
    'revisions.members' suffix.
    """
    data = {}
    for k,v in pages_data.iteritems():
        # 'userid' is tested for presence like 'user': a revision carrying only
        # one of the two keys is skipped instead of raising KeyError.
        data[k]=[x for x in v['revisions']
                 if ('user' in x and 'userid' in x and x['userid']!=0 and 'bot' not in x['user'].lower())]
    return get_pages_revisions_time_series_gen(data,'revisions.members',time_series_dir_name)
    
    
## ## ## ## ## ## ## ## ## ##


# All revisions.
(revisions_daily_ts,
 revisions_weekly_ts,
 revisions_monthly_ts) = get_pages_revisions_time_series(pages_data, time_series_dir_name)

# Revisions whose 'userid' is 0.
(revisions_ip_daily_ts,
 revisions_ip_weekly_ts,
 revisions_ip_monthly_ts) = get_pages_ip_revisions_time_series(pages_data, time_series_dir_name)

# Revisions whose user name contains 'bot'.
(revisions_bot_daily_ts,
 revisions_bot_weekly_ts,
 revisions_bot_monthly_ts) = get_pages_bot_revisions_time_series(pages_data, time_series_dir_name)

# Revisions of registered users that are not bots.
(revisions_members_daily_ts,
 revisions_members_weekly_ts,
 revisions_members_monthly_ts) = get_pages_members_revisions_time_series(pages_data, time_series_dir_name)

#revisions_daily_ts,revisions_weekly_ts,revisions_monthly_ts = get_pages_revisions_time_series(pages_data,'revisions',time_series_dir_name)
#print pages_data[pages_data.keys()[0]]['revisions']
## store the content of the last revisions also in pages_data
#for page_name in pages_data:
#    pages_data[page_name]['revisions_daily_ts']=revisions_daily_ts[page_name]
#    pages_data[page_name]['revisions_weekly_ts']=revisions_weekly_ts[page_name]
#    pages_data[page_name]['revisions_monthly_ts']=revisions_monthly_ts[page_name]


if (print_t): print 'Number of revisions time series of page load:',len(revisions_daily_ts)
if (print_v): print '> Dic of dataframes of daily revisions time series: revisions_daily_ts'
if (print_t): print 'Number of weekly pageviews time series:',len(revisions_weekly_ts)
if (print_v): print '> Dic of dataframes of weekly revisions time series: revisions_weekly_ts'
if (print_t): print 'Number of monthly pageviews time series:',len(revisions_monthly_ts)
if (print_v): print '> Dic of dataframes of monthly revisions time series: revisions_monthly_ts'

if (print_t): print 'Number of IP revisions time series of page load:',len(revisions_ip_daily_ts)
if (print_v): print '> Dic of dataframes of daily IP revisions time series: revisions_ip_daily_ts'
if (print_t): print 'Number of IP weekly pageviews time series:',len(revisions_ip_weekly_ts)
if (print_v): print '> Dic of dataframes of weekly IP revisions time series: revisions_ip_weekly_ts'
if (print_t): print 'Number of IP monthly pageviews time series:',len(revisions_ip_monthly_ts)
if (print_v): print '> Dic of dataframes of monthly IP revisions time series: revisions_ip_monthly_ts'

if (print_t): print 'Number of bot revisions time series of page load:',len(revisions_bot_daily_ts)
if (print_v): print '> Dic of dataframes of daily bot revisions time series: revisions_bot_daily_ts'
if (print_t): print 'Number of bot weekly pageviews time series:',len(revisions_bot_weekly_ts)
if (print_v): print '> Dic of dataframes of weekly bot revisions time series: revisions_bot_weekly_ts'
if (print_t): print 'Number of bot monthly pageviews time series:',len(revisions_bot_monthly_ts)
if (print_v): print '> Dic of dataframes of monthly bot revisions time series: revisions_bot_monthly_ts'

if (print_t): print 'Number of members revisions time series of page load:',len(revisions_members_daily_ts)
if (print_v): print '> Dic of dataframes of daily members revisions time series: revisions_members_daily_ts'
if (print_t): print 'Number of members weekly pageviews time series:',len(revisions_members_weekly_ts)
if (print_v): print '> Dic of dataframes of weekly members revisions time series: revisions_members_weekly_ts'
if (print_t): print 'Number of members monthly pageviews time series:',len(revisions_members_monthly_ts)
if (print_v): print '> Dic of dataframes of monthly members revisions time series: revisions_members_monthly_ts'


> Dic of dataframes of daily revisions time series: revisions_daily_ts
> Dic of dataframes of weekly revisions time series: revisions_weekly_ts
> Dic of dataframes of monthly revisions time series: revisions_monthly_ts
> Dic of dataframes of daily IP revisions time series: revisions_ip_daily_ts
> Dic of dataframes of weekly IP revisions time series: revisions_ip_weekly_ts
> Dic of dataframes of monthly IP revisions time series: revisions_ip_monthly_ts
> Dic of dataframes of daily bot revisions time series: revisions_bot_daily_ts
> Dic of dataframes of weekly bot revisions time series: revisions_bot_weekly_ts
> Dic of dataframes of monthly bot revisions time series: revisions_bot_monthly_ts
> Dic of dataframes of daily members revisions time series: revisions_members_daily_ts
> Dic of dataframes of weekly members revisions time series: revisions_members_weekly_ts
> Dic of dataframes of monthly members revisions time series: revisions_members_monthly_ts


#Graph Construction


# Graph of occurences computation

In [42]:
# Link two pages when the title of one occurs in the content of the other
def compute_occurences_graph(pages_data, pages_last_revision_text):
    """Build the graph of title occurrences between the pages of the dataset.

    Every page of ``pages_data`` is a node. For each page, the lowercased text
    of its last revision is searched for the lowercased titles of the other
    pages. An edge is added towards each title that occurs strictly more often
    than the average number of occurrences over all the other titles.

    Edge attributes: 'weight' is the number of occurrences and 'distance' is
    1 / (1 + weight). The graph is undirected: when two pages mention each
    other, the attributes written last are the ones kept.
    """
    occurences_graph = nx.Graph()
    
    for page_name in pages_data:
        occurences_graph.add_node(page_name)
    
    # Longest titles first, so that a title is counted (and removed from the
    # text) before the shorter titles it contains. Sorted once for all pages.
    titles_by_length = sorted( pages_data.keys(), key=lambda k: -len(k) )

    for page_name in pages_data.keys():
        intradomain = [title for title in titles_by_length if title != page_name]
        if len(intradomain)==0:
            # A single page: nothing to look for and no average to compute.
            continue
        gruyere = BeautifulSoup(pages_last_revision_text[page_name]).text.lower()
        occurences_name = {}
        for occu in intradomain:
            # The text is lowercased, so the title is lowercased both for
            # counting and for removing it from the text.
            lowered_title = occu.lower()
            occurences_name[occu] = gruyere.count(lowered_title)
            gruyere = gruyere.replace(lowered_title, "")
        
        #print occurences_name
        ave_occurences_name = sum(occurences_name.values())/float(len(occurences_name))

        occurences_name = {k:v for k,v in occurences_name.items() if v>ave_occurences_name}
     
        for occu in occurences_name:
            # Keyword attributes are accepted by add_edge in networkx 1.x and 2.x.
            occurences_graph.add_edge(page_name,occu,
                    distance=1/float(1+occurences_name[occu]),
                    weight=occurences_name[occu])
    
    return(occurences_graph)

## ## ## ## ## ## ## ## ## ##

graph_dir_name = '%s/graph/' % (dataset_dir_name)
if not(os.path.exists(graph_dir_name)): os.mkdir(graph_dir_name)

file_occurences_graph_name = '%s/occurences_graph.gexf' % (graph_dir_name)
occurences_graph = nx.Graph()

if (os.path.exists(file_occurences_graph_name)):
    occurences_graph = nx.read_gexf(file_occurences_graph_name)
else:
    occurences_graph = compute_occurences_graph(pages_data,pages_last_revision_text)
    nx.write_gexf(occurences_graph, file_occurences_graph_name)

if (print_t): print 'Graph of occurences:'
if (print_t): print '  number of nodes:',len(occurences_graph.nodes())
if (print_t): print '  number of edges:',len(occurences_graph.edges())

if (print_v): print '> graph of occurences store in: occurences_graph'
    


> graph of occurences store in: occurences_graph


## Graph of links computation

In [43]:
# Link two pages of the dataset when one of them lists the other in its links
def compute_links_graph(pages_data, pages_last_revision_text):
    """Build the graph of links between the pages of the dataset.

    Every page of ``pages_data`` is a node. An edge joins a page to each entry
    of its 'links' list that is itself a page of the dataset.

    Edge attributes: 'weight' is the number of times the target appears in
    the 'links' list and 'distance' is 1 / (1 + weight).

    ``pages_last_revision_text`` is not used; it is kept so that the signature
    matches compute_occurences_graph.
    """
    links_graph = nx.Graph()
    
    for page_name in pages_data:
        links_graph.add_node(page_name)
    
    # Computed once instead of once per link.
    page_names = pages_data.keys()

    for page_name in page_names:
        links = Counter( [x for x in pages_data[page_name]['links'] if x in page_names] )                 
     
        for l in links:
            # Keyword attributes are accepted by add_edge in networkx 1.x and 2.x.
            links_graph.add_edge(page_name,l,
                    distance=1/float(1+links[l]),
                    weight=links[l])
    
    return(links_graph)

## ## ## ## ## ## ## ## ## ##

graph_dir_name = '%s/graph/' % (dataset_dir_name)
if not(os.path.exists(graph_dir_name)): os.mkdir(graph_dir_name)

file_links_graph_name = '%s/links_graph.gexf' % (graph_dir_name)
links_graph = nx.Graph()

if (os.path.exists(file_links_graph_name)):
    links_graph = nx.read_gexf(file_links_graph_name)
else:
    links_graph = compute_links_graph(pages_data,pages_last_revision_text)
    nx.write_gexf(links_graph, file_links_graph_name)

if (print_t): print 'Graph of links:'
if (print_t): print '  number of nodes:',len(links_graph.nodes())
if (print_t): print '  number of edges:',len(links_graph.edges())

if (print_v): print '> graph of links store in: links_graph'

#components = nx.connected_components(links_graph)
#for con in components:
#    print 'Size:',len(con)
#    print con

> graph of links store in: links_graph


## Page/editors bipartite graph computation

In [44]:
def compute_pages_editors_bipartite_graph(pages_data):
    """Build the bipartite graph joining pages and their editors.

    Only revisions with a 'user', a non-zero 'userid' and no 'bot'
    (case-insensitive) in the user name are taken into account.

    Page nodes are named after the page and carry type='page', 'revisions'
    (number of revisions of the page, all users included) and 'editors'
    (number of distinct editors kept). Editor nodes are named
    'editor:<user name>' and carry type='editor' and 'revisions' (number of
    revisions of that editor over all pages). Each page/editor edge carries
    'revisions', the number of revisions of the editor on the page.
    """
    pages_editors_bipartite_graph = nx.Graph()
    editors_all = Counter()
    for page_name in pages_data:
        # 'userid' is tested for presence like 'user', so that a partial
        # revision record is skipped instead of raising KeyError.
        editors = Counter([x['user'] for x in pages_data[page_name]['revisions'] 
                           if ('user' in x) and ('userid' in x) and (x['userid']!=0)
                           and ('bot' not in x['user'].lower())])
        # Attributes are passed to add_node/add_edge as keywords, which works
        # with networkx 1.x and 2.x alike.
        pages_editors_bipartite_graph.add_node(page_name, type='page',
                                               revisions=len(pages_data[page_name]['revisions']),
                                               editors=len(editors))
        
        for e in editors:
            editor_node = ''.join(['editor:',e])
            pages_editors_bipartite_graph.add_node(editor_node, type="editor")
            pages_editors_bipartite_graph.add_edge(editor_node, page_name, revisions=editors[e])
        editors_all.update(editors)
    #add number of revision on editor node (add_node updates an existing node)
    for e in editors_all:
        pages_editors_bipartite_graph.add_node(''.join(['editor:',e]), revisions=editors_all[e])
    return(pages_editors_bipartite_graph)

## ## ## ## ## ## ## ## ## ##

file_pages_editors_bipartite_graph_name = '%s/pages_editors_bipartite_graph.gexf' % (graph_dir_name)
pages_editors_bipartite_graph = nx.Graph()

if (os.path.exists(file_pages_editors_bipartite_graph_name)):
    pages_editors_bipartite_graph = nx.read_gexf(file_pages_editors_bipartite_graph_name)
else:
    pages_editors_bipartite_graph =compute_pages_editors_bipartite_graph(pages_data)
    nx.write_gexf(pages_editors_bipartite_graph, file_pages_editors_bipartite_graph_name) 

if (print_t): print 'Bipartite graph page/editors'
if (print_t): print '  number of nodes:',len(pages_editors_bipartite_graph.nodes())
if (print_t): print '  number of edges:',len(pages_editors_bipartite_graph.edges())

if (print_v): print '> bipartite graph page/editors store in: pages_editors_bipartite_graph'


> bipartite graph page/editors store in: pages_editors_bipartite_graph


## Projection graphs

In [45]:
def compute_projected_graph_page(pages_editors_bipartite_graph):
    """Project the pages/editors bipartite graph onto its page nodes.

    The nodes whose 'type' attribute is 'page' are kept; two pages are joined
    when they share at least one neighbour in the bipartite graph.

    Edge attributes: 'weight' is the number of neighbours the two pages have
    in common in the bipartite graph and 'distance' is 1 / (1 + weight).
    """
    selected = [x[0] for x in pages_editors_bipartite_graph.nodes(data=True) if x[1].get('type')=='page']
    res=bipartite.projected_graph(pages_editors_bipartite_graph, selected)
    # Each edge is visited once; only its attributes are modified.
    for p1, p2 in res.edges():
        coeditors = set(pages_editors_bipartite_graph[p1]) & set(pages_editors_bipartite_graph[p2])
        res[p1][p2]['weight'] = len(coeditors) 
        res[p1][p2]['distance'] = 1/float(1+len(coeditors))
            
    return(res)

## ## ## ## ## ## ## ## ## ##

file_projected_graph_page_name = '%s/projected_graph_page.gexf' % (graph_dir_name)
projected_graph_page = nx.Graph()

if (os.path.exists(file_projected_graph_page_name)):
    projected_graph_page = nx.read_gexf(file_projected_graph_page_name)
else:
    projected_graph_page = compute_projected_graph_page(pages_editors_bipartite_graph)
    nx.write_gexf(projected_graph_page, file_projected_graph_page_name)   

if (print_t): print 'Projection graph on pages'
if (print_t): print '  number of nodes:',len(projected_graph_page.nodes())
if (print_t): print '  number of edges:',len(projected_graph_page.edges())
    
if (print_v): print '> projected graph page store in: projected_graph_page'

#components = nx.connected_components(projected_graph_page)
#for con in components:
#    print 'Size:',len(con)
#    print con

> projected graph page store in: projected_graph_page


## Statistics on graph    

In [46]:
def compute_graph_statistics_on_nodes(graph,weight=None):
    """Compute per-node centrality measures and return them as a DataFrame.

    The result is indexed by the nodes of ``graph`` and has one column per
    measure that could be computed.  A measure that raises is reported on
    standard output and its column is simply left out, so one failing
    algorithm does not prevent the others from being stored.

    Parameters
    ----------
    graph : graph
        Graph to analyse.
    weight : str or None
        Edge attribute name forwarded as ``weight=`` to the betweenness,
        current-flow, pagerank and eigenvector computations.  Degree and
        closeness centrality are called without it.

    Returns
    -------
    pandas.DataFrame
        Columns (when available): 'Degree Cen', 'Closeness Cen',
        'Betweenness Cen', 'Flow Clos Cen', 'Cur Flow Clos Cen',
        'Pagerank Cen', 'Eigen Cen'.
    """
    nodes = list(graph.nodes())
    df = pd.DataFrame(index=nodes)

    # (column name, label used in the failure message, deferred computation).
    # The computations are wrapped in lambdas so that any error they raise is
    # caught by the per-measure try/except below.
    #
    # NOTE(review): 'Cur Flow Clos Cen' actually holds current-flow
    # *betweenness*; the column name is kept unchanged because cached CSV
    # files and later cells may rely on it -- consider renaming everywhere.
    # NOTE(review): networkx documents the betweenness `weight` as a distance;
    # confirm the attribute passed by the caller has that meaning.
    measures = [
        ('Degree Cen', 'degree centrality',
         lambda: nx.degree_centrality(graph)),
        ('Closeness Cen', 'closeness centrality',
         lambda: nx.closeness_centrality(graph)),
        ('Betweenness Cen', 'betweenness centrality',
         lambda: nx.betweenness_centrality(graph, weight=weight)),
        ('Flow Clos Cen', 'flow closeness centrality',
         lambda: nx.current_flow_closeness_centrality(graph, weight=weight)),
        ('Cur Flow Clos Cen', 'current flow betweenness centrality',
         lambda: nx.current_flow_betweenness_centrality(graph, weight=weight)),
        ('Pagerank Cen', 'pagerank',
         lambda: nx.pagerank(graph,weight=weight)),
        ('Eigen Cen', 'eigenvector centrality',
         lambda: nx.eigenvector_centrality_numpy(graph, weight=weight)),
    ]

    for column, label, compute in measures:
        try:
            data = compute()
            # Build the whole column at once, in index order.
            df[column] = [data[k] for k in nodes]
        except Exception as e:
            # Single-argument print(...) gives the same output under the
            # Python 2 print statement used elsewhere in this notebook.
            print('Unable to compute %s.' % label)
            print('Error: %s' % (e,))

    return df
 
def compute_graph_statistics_on_pair_of_nodes(graph,weight=None):
    """Compute statistics for every ordered pair of nodes.

    The result is indexed by all ``(x, y)`` pairs of nodes of ``graph``
    (including ``x == y``) and has one column per statistic that could be
    computed.  A statistic that raises is reported on standard output and its
    column is left out.

    Parameters
    ----------
    graph : graph
        Graph to analyse.
    weight : str or None
        Edge attribute name forwarded as ``weight=`` to
        ``nx.shortest_path_length``.  Communicability is computed without it.

    Returns
    -------
    pandas.DataFrame
        Columns (when available): 'Communicability' and 'Shortest path'.
        Pairs with no entry in the shortest-path result (for instance nodes
        in different components) hold NaN in 'Shortest path'.
    """
    nodes = list(graph.nodes())
    pairs = [(x, y) for x in nodes for y in nodes]
    df = pd.DataFrame(index=pairs)

    #Communicability
    try:
        data = nx.communicability(graph)
        df["Communicability"] = [data[k][v] for (k, v) in pairs]
    except Exception as e:
        print('Unable to compute communicability.')
        print('Error: %s' % (e,))

    #Shortest path
    try:
        # dict() accepts both a mapping and an iterable of (source, lengths)
        # pairs, whichever form shortest_path_length returns.
        data = dict(nx.shortest_path_length(graph, weight=weight))
        unreachable = float('nan')
        # A missing entry means "no path": store NaN instead of failing, so
        # the remaining pairs are still filled in.
        df["Shortest path"] = [data.get(k, {}).get(v, unreachable)
                               for (k, v) in pairs]
    except Exception as e:
        print('Unable to compute shortest path length..')
        print('Error: %s' % (e,))

    return(df)

def compute_graph_statistics(graph,stats_dir_name,prefix_file):
    data_frame_node = pd.DataFrame()
    filename = "%s/%s-nodes-stats.csv" % (stats_dir_name,prefix_file)
    if (os.path.exists(filename)):
        data_frame_node = data_frame_node.from_csv(filename)
    else:
        data_frame_node = compute_graph_statistics_on_nodes(graph,weight='weight')
        data_frame_node.to_csv(filename,encoding="utf-8")
    
    data_frame_pair = pd.DataFrame()
    filename = "%s/%s-pair-stats.csv" % (stats_dir_name,prefix_file)
    if (os.path.exists(filename)):
        data_frame_pair = data_frame_pair.from_csv(filename)
    else:
        data_frame_pair = compute_graph_statistics_on_pair_of_nodes(graph,weight='distance')
        data_frame_pair.to_csv(filename,encoding="utf-8")
    
    return(data_frame_node,data_frame_pair)


## ## ## ## ## ## ## ## ## ## ##

if (print_t): print 'Computation of statistics on projected graph page.'
df_pro_graph_page_node,df_pro_graph_page_pair = compute_graph_statistics(projected_graph_page,stats_dir_name,'projected_graph_page')
  
if (print_v): 
    print '> dataframe of statistics on projected graph page node: df_pro_graph_page_node'
    print '> dataframe of statistics on projected graph page pair of nodes: df_pro_graph_page_pair'
    

if (print_t): print 'Computation of statistics on links graph.'
df_links_graph_node,df_links_graph_pair = compute_graph_statistics(links_graph,stats_dir_name,'links_graph')

if (print_v): 
    print '> dataframe of statistics on links graph node: df_links_graph_node'
    print '> dataframe of statistics on links graph pair of nodes: df_links_graph_pair'


if (print_t): print 'Computation of statistics on occurences graph.'
df_occurences_graph_node,df_occurences_graph_pair = compute_graph_statistics(occurences_graph,stats_dir_name,'occurences_graph')

if (print_v): 
    print '> dataframe of statistics on occurences graph node: df_occurences_graph_node'
    print '> dataframe of statistics on occurences graph pair of nodes: df_occurences_graph_pair'




> dataframe of statistics on projected graph page node: df_pro_graph_page_node
> dataframe of statistics on projected graph page pair of nodes: df_pro_graph_page_pair
> dataframe of statistics on links graph node: df_links_graph_node
> dataframe of statistics on links graph pair of nodes: df_links_graph_pair
done computation
> dataframe of statistics on occurences graph node: df_occurences_graph_node
> dataframe of statistics on occurences graph pair of nodes: df_occurences_graph_pair


In [47]:
#print len(links_graph.nodes())

#res = nx.shortest_path_length(links_graph,weight='distance')
#for so in res:
#    print len(res[so])
    
    

#Intersection graph

Build a graph from the links graph and the projection graph: two pages are joined by an edge when they are close to each other in the links graph and adjacent in the projection graph.

In [48]:
#def intersection_graph():
#    global projected_graph_page, links_graph
#    global df_links_graph_pair
    
#    res = nx.Graph()
#    for p in links_graph.nodes():
        
    

# reading map based on a reduced graph of co-edited pages