#Data Preparation

In [2]:
import json
import codecs
import os
import math

import numpy as np

import pandas as pd
import datetime as dt

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from bs4 import BeautifulSoup

from wekeypedia.wikipedia.page import WikipediaPage as Page

basedir = 'listgeometry'

file_pagenames = "%s/pagenames" % basedir
pagenames =  codecs.open(file_pagenames,"r", "utf-8-sig").readlines()
pagenames = list(map(lambda x: x.strip(), pagenames))

print 'Number of pages:',len(pagenames) 

Number of pages: 303


#Basic statistics computation
##Data gathering

In [3]:
def load_pages(listofpages,data_dir):
    pages_data = {}
    for page in listofpages:
        if (os.path.exists('%s/%s.json'%(data_dir,page))):
            with open('%s/%s.json'%(data_dir,page)) as data:
                pages_data[page] = json.load(data)
        else:
            data = {}
            p = Page()
            req = p.fetch_info(page)['query']['pages']
            pageid = list(req)[0]
            if pageid!='-1':
                for x in req[pageid]:
                    data[x]=req[pageid][x]
                data['revisions']=p.get_revisions_list()
                data['links']= p.get_links_title()
                pages_data[page]=data
                f = open('%s/%s.json'% (data_dir,page),'w')
                f.write(json.dumps(data))
                f.close()
    return(pages_data)

#########
pages_dir='%s/data/pages/'% (basedir)
if not(os.path.exists(pages_dir)): os.mkdir(pages_dir)

pages_data=load_pages(pagenames,pages_dir)
talk_pages_data=load_pages(list(map(lambda x:''.join(['Talk:',x]),pagenames)),pages_dir)  

print 'Number of pages load:',len(pages_data)
print 'Number of talk pages load:',len(talk_pages_data)

Number of pages load: 303
Number of talk pages load: 298


##Statistics computation

In [4]:
def stat_computation(pages_data):
    df = pd.DataFrame(pages_data.keys(),columns=['title'])
    #pageid
    data={k:v['pageid'] for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Pageid']))
    #length
    data={k:v['length'] for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Length']))
    #ns
    data={k:v['ns'] for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','NS']))
    #nombre de revisions
    data={k:len(v['revisions']) for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb revisions']))
    #nombre de revisions by IP
    data={k:len( list(filter(lambda x: ('userid' in x) and (x['userid']==0),v['revisions'])) )  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb revisions IP']))
    #nombre de revisions by Bot
    data={k:len( list(filter(lambda x: ('userid' in x) and (x['userid']!=0) and ('bot' in x['user'].lower()),v['revisions'])) )  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb revisions bot']))
    #nombre de revisions by Alive Registered Members
    data={k:len( list(filter(lambda x: ('userid' in x) and (x['userid']!=0) and ('bot' not in x['user'].lower()),v['revisions'])) )  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb revisions wiki']))
    #nombre de contributeurs
    data={k:len(set(list(map(lambda x: x['user'],filter(lambda x:'user' in x,v['revisions'])))))  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb editos']))
    #nombre de contributeurs IP
    data={k:len(set(list(map(lambda x: x['user'],filter(lambda x:('userid' in x) and (x['userid']==0),v['revisions'])))))  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb editors IP']))
    #nombre de contributeurs Bot
    data={k:len(set(list(map(lambda x: x['user'],filter(lambda x:('userid' in x) and (x['userid']!=0) and ('bot' in x['user']),v['revisions'])))))  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb editors Bot']))
    #nombre de contributeurs by Alive Registered Members
    data={k:len(set(list(map(lambda x: x['user'],filter(lambda x:('userid' in x) and (x['userid']!=0) and ('bot' not in x['user']),v['revisions'])))))  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Nb editors Wiki']))
    #nombre de revisions
    data={k:len(v['links']) for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Links']))
    #date of the first contibutions (in number of days after the start of the wikipedia project)
    import datetime
    def numberOfDaysAfter(date):
        return( (datetime.datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ")-datetime.datetime.strptime("2001-01-15T00:00:00Z","%Y-%m-%dT%H:%M:%SZ")).days)
    data={k:min(map(numberOfDaysAfter,map(lambda x: x['timestamp'],v['revisions'])))  for k,v in pages_data.items()}
    df=df.merge(pd.DataFrame(data.items(),columns=['title','Date']))
    return(df)

########
stat_dir = "%s/stats/" % basedir
if not(os.path.exists(stat_dir)): os.mkdir(stat_dir)
basic_stats_file='basic_stats.csv'

df = pd.DataFrame()

if (os.path.exists("%s/%s" % (stat_dir,basic_stats_file))):
    df = df.from_csv("%s/%s" % (stat_dir,basic_stats_file),encoding="utf-8")
else:
    df = stat_computation(pages_data)
    df.to_csv("%s/%s" % (stat_dir,basic_stats_file),encoding="utf-8")

df.head(10)

Unnamed: 0,title,Pageid,Length,NS,Nb revisions,Nb revisions IP,Nb revisions bot,Nb revisions wiki,Nb editos,Nb editors IP,Nb editors Bot,Nb editors Wiki,Links,Date
0,Digital geometry,386413,7211,0,116,51,11,54,63,16,7,40,46,1052
1,Synthetic geometry,267484,11870,0,129,14,11,104,60,11,5,44,98,910
2,Triangle inequality,53941,25011,0,395,122,28,245,199,88,12,99,87,498
3,Deltahedron,493995,13811,0,197,18,19,160,69,13,5,51,70,1139
4,Isoperimetric inequality,326182,19249,0,176,28,27,121,94,21,12,61,105,982
5,Matrix representation of conic sections,189243,7787,0,110,31,9,70,62,16,3,43,31,773
6,Orthodiagonal quadrilateral,30425383,12496,0,71,3,5,63,16,3,3,10,57,3648
7,Invariant (mathematics),1126638,11436,0,147,25,18,104,77,20,8,49,127,1387
8,Information geometry,487312,31534,0,205,35,7,163,78,26,4,48,95,1136
9,Zonohedron,669402,14056,0,126,9,9,108,48,7,4,37,91,1222


# Statistique on content of the last revision

## Gathering last revision content

In [5]:
def load_content_last_revision(listofpages,data_dir):
    last_revision_data = {}
    for page in listofpages:
        if (os.path.exists('%s/%s.html'%(data_dir,page))):
            last_revision_data[page] = ''.join(codecs.open('%s/%s.html'%(data_dir,page),"r", "utf-8-sig").readlines())
        else:
            data = {}
            p = Page()
            req = p.fetch_info(page)['query']['pages']
            pageid = list(req)[0]  
            if pageid!='-1':
                last_revision_data[page] = p.get_current()[0]
                f = open('%s/%s.html'% (data_dir,page),'w')
                f.write(last_revision_data[page].encode('utf-8'))
                f.close()
                
    return(last_revision_data)

revision_dir = '%s/data/revisions/' %basedir
if not(os.path.exists(revision_dir)): os.mkdir(revision_dir)

last_revision_data = load_content_last_revision(pagenames,revision_dir)
## store the content of the last revisions also in pages_data
for k in last_revision_data:
    pages_data[k]['lastrevision']=last_revision_data[k]

print 'Number of revisions of page load:',len(pages_data)


Number of revisions of page load: 303


## Computation of statistics on content

In [6]:
def basic_word_analysis(pages_content):
    data =[]
    for k,content in pages_content.items():
        p={}
        p['title']=k
        text = BeautifulSoup(content).text
        p['nbwords'] = len(text)
        words = len(content.split(" "))
        p['average_word_length'] = float(p['nbwords'] - words)/float(words)   
        data.append(p)
    res = pd.DataFrame(data)
    if len(pages_content)!=0: res.set_index('title')
    return res

########
if not(os.path.exists(stat_dir)): os.mkdir(stat_dir)
basic_stats_file='words_stats.csv'

df = pd.DataFrame()

if (os.path.exists("%s/%s" % (stat_dir,basic_stats_file))):
    df = df.from_csv("%s/%s" % (stat_dir,basic_stats_file),encoding="utf-8")
else:
    df = basic_word_analysis(last_revision_data)
    df.to_csv("%s/%s" % (stat_dir,basic_stats_file),encoding="utf-8")

df.head(10) 

Unnamed: 0,average_word_length,nbwords,title
0,3.333072,5529,Digital geometry
1,4.07561,10405,Synthetic geometry
2,2.712475,16398,Triangle inequality
3,1.466334,6923,Deltahedron
4,3.38272,14406,Isoperimetric inequality
5,1.929956,4685,Matrix representation of conic sections
6,3.552224,9109,Orthodiagonal quadrilateral
7,3.444048,9968,Invariant (mathematics)
8,2.254628,17230,Information geometry
9,2.771525,8980,Zonohedron


#Graph Construction


## Graph of links computation

In [7]:
from collections import Counter
from networkx.algorithms import bipartite 
import networkx as nx

# Add an occurence of the title in the content of the page
def build_links_graph(data):
    global last_revision_data
    links_graph = nx.Graph()
    for page in data: links_graph.add_node(page)
    for page in data:
        links = data[page]['links']
        intradomain = set(links) & set(data.keys())
        gruyere = last_revision_data[page]
        linkssort = sorted(intradomain, key=lambda k: -len(k))
        for l in linkssort:
            occurences_link = len(filter(lambda x:x==l,links))
            occurences_named_entity = unicode(gruyere).count(unicode(k))
            gruyere = gruyere.replace(k, "")
            links_graph.add_edge(page,l,attr_dict={"link occurence": occurences_link, "term occurence": occurences_named_entity})
    return(links_graph)

#####
graph_dir = '%s/graph/' % (basedir)
if not(os.path.exists(graph_dir)): os.mkdir(graph_dir)

links_graph = nx.Graph()
if (os.path.exists("%s/links-graph.gexf" % (graph_dir))):
    links_graph = nx.read_gexf("%s/links-graph.gexf" % (graph_dir))
else:
    links_graph = build_links_graph(pages_data)
    nx.write_gexf(links_graph, "%s/links-graph.gexf" % (graph_dir))

print 'Graph of links:'
print '  number of nodes:',len(links_graph.nodes())
print '  number of edges:',len(links_graph.edges())


Graph of links:
  number of nodes: 303
  number of edges: 2975


## bipartite page-editors graph computation

In [8]:
def build_bipartite(data):
    pages_editors_graph = nx.Graph()
    editors_all = {}
    for p in data.keys():
        editors = Counter(list(map(lambda x:x['user'],filter(lambda x: ('user' in x) and (x['userid']!=0),data[p]['revisions']))))
        pages_editors_graph.add_node(''.join(['p:',p]), type='page')
        for e in editors:
            if e not in editors_all: 
                pages_editors_graph.add_node(''.join(['u:',e]), type="user")
                editors_all[e]=editors[e]
            else:
                editors_all[e]+=editors[e]
            pages_editors_graph.add_edge(''.join(['u:',e]), ''.join(['p:',p]), attr_dict={'revisions':editors[e]})
    #add number of revision on editor node
    for e in editors_all:
         pages_editors_graph.node[''.join(['u:',e])]["revisions"]=editors_all[e]
    return(pages_editors_graph)
        
pages_editors_graph = nx.Graph()
if (os.path.exists("%s/pages-editors-graph.gexf" % (stat_dir))):
    pages_editors_graph = nx.read_gexf("%s/pages-editors-graph.gexf" % (graph_dir))
else:
    pages_editors_graph = build_bipartite(pages_data)
    nx.write_gexf(pages_editors_graph, "%s/pages-editors-graph.gexf" % (graph_dir)) 

print 'Bipartite graph page/editors'
print '  number of nodes:',len(pages_editors_graph.nodes())
print '  number of edges:',len(pages_editors_graph.edges())


Bipartite graph page/editors
  number of nodes: 16206
  number of edges: 40044


## Projection graphs

In [9]:
from networkx.algorithms import bipartite 

def projected_graph(G,select, weight="weight"):
    selected = map(lambda x:x[0],filter(lambda x:x[1]['type']==select,G.nodes(data=True)))
    res=bipartite.projected_graph(G, selected)
    for u in res.nodes():
        for v in res[u].keys():
            w = len(set(G[u]) & set(G[v]))
            res[u][v][weight] = w
    return(res)

projected_graph_page = nx.Graph()
if (os.path.exists("%s/projected_graph_page.gexf" % (graph_dir))):
    projected_graph_page = nx.read_gexf("%s/projected_graph_page.gexf" % (graph_dir))
else:
    projected_graph_page = projected_graph(pages_editors_graph,'page','coeditors')
    nx.write_gexf(projected_graph_page, "%s/projected_graph_page.gexf" % (graph_dir))   

print 'Projection graph on pages'
print '  number of nodes:',len(projected_graph_page.nodes())
print '  number of edges:',len(projected_graph_page.edges())

#projected_graph_user = nx.Graph()
#if (os.path.exists("%s/projected_graph_user.gexf" % (graph_dir))):
#    projected_graph_user = nx.read_gexf("%s/projected_graph_user.gexf" % (graph_dir))
#else:
#    projected_graph_user = projected_graph(pages_editors_graph,'user')
#    nx.write_gexf(projected_graph_user, "%s/projected_graph_user.gexf" % (graph_dir))   

#print 'Projection graph on users'
#print '  number of nodes:',len(projected_graph_user.nodes())
#print '  number of edges:',len(projected_graph_user.edges())


Projection graph on pages
  number of nodes: 303
  number of edges: 44759


## Statistics on graph    

In [10]:
def compute_graph_statistics_on_nodes(graph,weight=None):
    network_df = pd.DataFrame(index=graph.nodes())
    
    centrality = nx.degree_centrality(graph)
    closeness = nx.closeness_centrality(graph)
    betweenness = nx.betweenness_centrality(graph, weight=weight)
    current_flow_closeness = nx.current_flow_closeness_centrality(graph, weight=weight)
    current_flow_betweenness = nx.current_flow_betweenness_centrality(graph, weight=weight)
    pagerank = nx.pagerank(graph,weight=weight)
    eigenvector = nx.eigenvector_centrality_numpy(graph, weight=weight)

    for index in network_df.index:
        network_df.ix[index,"degree"] = len(graph[index])
        network_df.ix[index,"pagerank"] = pagerank[index]
        network_df.ix[index,"centrality"] = centrality[index]
        network_df.ix[index,"closeness"] = closeness[index]
        network_df.ix[index,"betweenness"] = betweenness[index]
        network_df.ix[index,"current flow closeness"] = current_flow_closeness[index]
        network_df.ix[index,"current flow betweenness"] = current_flow_betweenness[index]
        network_df.ix[index,"eigenvector"] = eigenvector[index]

    return network_df
 
def compute_graph_statistics_on_pair_of_nodes(graph,weight=None):
    network_df = pd.DataFrame(index=[(x,y) for x in graph.nodes() for y in graph.nodes()])
    communicability = nx.communicability(graph)
    shortest_path = nx.shortest_path(graph,weight=weight)
    for k in graph.nodes()[0:5]:
        for v in graph.nodes()[0:5]:
            network_df.ix[(k,v),"communicability"] = communicability[k][v]
            network_df.ix[(k,v),"shortest_path"] = len(shortest_path[k][v])
    return(network_df)

def compute_graph_statistics(graph,prefix_file,weightNode=None,weightEdge=None):
    data_frame_node = pd.DataFrame()
    filename = "%s/%s-nodes-stats.csv" % (stat_dir,prefix_file)
    if (os.path.exists(filename)):
        data_frame_node = data_frame_node.from_csv(filename)
    else:
        data_frame_node = compute_graph_statistics_on_nodes(graph,weightNode)
        data_frame_node.to_csv(filename,encoding="utf-8")
    
    data_frame_pair = pd.DataFrame()
    filename = "%s/%s-pair-stats.csv" % (stat_dir,prefix_file)
    if (os.path.exists(filename)):
        data_frame_pair = data_frame_pair.from_csv(filename)
    else:
        data_frame_pair = compute_graph_statistics_on_pair_of_nodes(graph,weightEdge)
        data_frame_pair.to_csv(filename,encoding="utf-8")
    
    return(data_frame_node,data_frame_pair)

df_pro_graph_page_node,df_pro_graph_page_pair = compute_graph_statistics(projected_graph_page,'projected_graph_page')
df_links_graph_node,df_links_graph_pair = compute_graph_statistics(links_graph,'links_graph')



NetworkXError: Graph not connected.

#Intersection graph

Build a graph using links graph et projection: a edge between two page not far away in links graph et adjacent in projection 

In [None]:
def intersection_graph():
    global projected_graph_page, links_graph
    global df_links_graph_pair
    
    res = nx.Graph()
    for p in links_graph.nodes():
        
    

# Pageview and revisions

# reading map based on a reduced graph of co-edited pages