In [1]:
%matplotlib inline

from collections import OrderedDict

import json, codecs
import pandas as pd, numpy as np, matplotlib.pyplot as plt, networkx as nx, seaborn as sns

from networkx.algorithms import bipartite 

sns.set(style="whitegrid")

In [2]:
pages = codecs.open("data/pagenames.txt","r", "utf-8-sig").readlines()
pages = map(lambda x: x.strip(), pages)

## computing page-editor bipartite graph

In [3]:
pages_editors_graph = nx.Graph()

editors = {}

def get_editors_set(page):
    editors = []
    revisions = json.load(codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig"))
    
    revisions = pd.DataFrame(revisions)
    
    #print revisions.head()
    # editors = revisions[revisions["userid"] != 0]["user"].tolist()
    editors = revisions[revisions["userid"] != 0]["user"].tolist()
    
    return set(editors)

for p in pages:
    title = "p:%s" % (p)
    # page_graph.add_node(title)
    e = get_editors_set(title.split(":")[1])
    editors[title] = e

    pages_editors_graph.add_node(title, type="page")

    for editor in e:
        editor_label = "u:%s" % (editor)
        pages_editors_graph.add_node(editor_label, type="user")
        pages_editors_graph.add_edge(editor_label, title)

In [4]:
pages_nodes = [ x[0] for x in  pages_editors_graph.nodes(data=True) if x[1]["type"] == "page" ]
users_nodes = [ x[0] for x in  pages_editors_graph.nodes(data=True) if x[1]["type"] == "user" ]

print "page nodes: %s" % (len(pages_nodes))
print "user nodes: %s" % (len(users_nodes))

page nodes: 303
user nodes: 15858


### counting

In [5]:
print "nodes: %s" % (len(pages_editors_graph.nodes()))
print "edges: %s" % (len(pages_editors_graph.edges()))

nodes: 16161
edges: 39927


In [6]:
nx.write_gexf(pages_editors_graph, "data/pages-editors.gexf", encoding='utf-8')

In [7]:
def reduce_bipartite(G, select, weight="weight"):
    selected = [ x[0] for x in G.nodes(data=True) if x[1]["type"] == select ]
    results = bipartite.projected_graph(G, selected)

    for u in results.nodes():
        for v in results[u].keys():
            w = len(set(G[u]) & set(G[v]))
            results[u][v][weight] = w

    return results

## computing page-page graph

In [8]:
page_graph = reduce_bipartite(pages_editors_graph, "page", "coeditors")

### counting

In [9]:
print "nodes: %s" % (len(page_graph.nodes()))
print "edges: %s" % (len(page_graph.edges()))

nodes: 303
edges: 44688


### saving

In [10]:
nx.write_gexf(page_graph, "data/pages-linked-by-coeditors.gexf", encoding='utf-8')

## computing editor-editor graph

In [11]:
editors_graph = reduce_bipartite(pages_editors_graph, "user", "pages")

### counting

In [12]:
print "nodes: %s" % (len(editors_graph.nodes()))
print "edges: %s" % (len(editors_graph.edges()))

nodes: 15858
edges: 7491479


In [13]:
# nx.write_gexf(editors_graph, "data/wikipedia-geometry/editors-linked-by-pages.gexf")

## network statistics

In [14]:
network_df = pd.DataFrame(index=pages)
network_df.head()

2D computer graphics
2D geometric model
3D computer graphics
3D projection
3-sphere


### centrality

In [15]:
centrality = nx.degree_centrality(page_graph)
closeness = nx.closeness_centrality(page_graph)
betweenness = nx.betweenness_centrality(page_graph, weight="coeditors")
current_flow_closeness = nx.current_flow_closeness_centrality(page_graph, weight="coeditors")
current_flow_betweenness = nx.current_flow_betweenness_centrality(page_graph, weight="coeditors")
#eigenvector = nx.eigenvector_centrality(page_graph)
eigenvector = nx.eigenvector_centrality_numpy(page_graph, weight="coeditors")

for index in network_df.index:
    t = "p:%s" % (index)
    
    network_df.ix[index,"centrality"] = centrality[t]
    network_df.ix[index,"closeness"] = closeness[t]
    network_df.ix[index,"betweenness"] = betweenness[t]
    network_df.ix[index,"current flow closeness"] = current_flow_closeness[t]
    network_df.ix[index,"current flow betweenness"] = current_flow_betweenness[t]
    network_df.ix[index,"eigenvector"] = eigenvector[t]

network_df.head()

Unnamed: 0,centrality,closeness,betweenness,current flow closeness,current flow betweenness,eigenvector
2D computer graphics,0.970199,0.971061,0.000603,3.284882,0.002896,0.035668
2D geometric model,0.986755,0.986928,0.001794,2.618807,0.002197,0.01903
3D computer graphics,0.983444,0.983713,0.000119,3.97483,0.004239,0.058395
3D projection,0.993377,0.993421,1.5e-05,3.882415,0.004432,0.045342
3-sphere,0.990066,0.990164,0.0,4.029133,0.004879,0.050946


### exclusive editors

In [16]:
def get_exclusive_editors(title):
    nb = pages_editors_graph["p:%s" % (title)]
    # print nb.keys()
    result = [n for n in nb.keys() if len(pages_editors_graph[n]) > 1 ]
    return len(result)

# print network_df.index[0:10].map(get_exclusive_editors)

network_df["exclusive editors"] = map(get_exclusive_editors, network_df.index)

network_df["exclusive editors"].head()

2D computer graphics    103
2D geometric model       27
3D computer graphics    150
3D projection            77
3-sphere                102
Name: exclusive editors, dtype: int64

### storing the statistics

In [17]:
network_df.to_csv("data/pages-linked-by-coeditors.stats.csv", encoding="UTF-8")

# final report