In [18]:
import pywikibot
from pywikibot import pagegenerators
import igraph as ig
import igraph.remote.gephi as igg

In [19]:
# Module-level wiki handle; crawl_page_generator reads this global directly.
# Site() is called with no arguments, so the target wiki is not chosen in this
# cell (presumably it comes from the pywikibot user configuration - TODO confirm).
# The recorded crawl output in this notebook reports "mrrobot:mrrobot".
site = pywikibot.Site()

In [20]:
def update_vertices(g: ig.Graph, vs):
    """Create or update vertices of ``g`` from attribute dictionaries.

    Vertex ids are allocated by the caller (they are needed outside this
    function), so each dictionary must already carry an ``'id'`` key. An id
    below ``g.vcount()`` is treated as an existing vertex index and its
    attributes are overwritten; any other id is treated as a new vertex.

    New vertices are appended in ascending id order. For ids to keep matching
    igraph vertex indices, the new ids are expected to be contiguous starting
    at ``g.vcount()`` (this is not enforced here).

    Args:
        g: Graph to mutate in place.
        vs: A single attribute dict, or an iterable of attribute dicts. New
            vertices must also carry a ``'name'`` key, which is copied into
            the ``'Label'`` attribute.

    Raises:
        KeyError: If a dict has no ``'id'``, or a new vertex has no ``'name'``.
    """
    if isinstance(vs, dict):
        vs = [vs]

    vcount = g.vcount()

    # Single pass: update existing vertices immediately, collect new ones by id.
    # Collecting by id merges duplicates within the batch, so one id can never
    # append two vertices and shift every later index.
    pending = {}
    for v in vs:
        vertex_id = v['id']
        if vertex_id < vcount:
            vertex = g.vs[vertex_id]
            for attr, value in v.items():
                vertex[attr] = value
        else:
            pending.setdefault(vertex_id, {}).update(v)

    if not pending:
        return

    vs_new = [pending[vertex_id] for vertex_id in sorted(pending)]
    graph_attrs = set(g.vs.attributes())
    # Consider the keys of *every* new vertex, not only the first one.
    attr_names = graph_attrs.union(*(v.keys() for v in vs_new))

    def default_for(attr):
        """Pick the filler for a missing attribute: 0 for int columns, '' otherwise."""
        if vcount > 0 and attr in graph_attrs:
            sample = g.vs[0][attr]
        else:
            # Attribute is new to the graph (or the graph is empty): infer the
            # type from the batch instead of indexing g.vs[0].
            sample = next((v[attr] for v in vs_new if attr in v), None)
        return 0 if type(sample) is int else ''

    attributes_new = {}
    for attr in attr_names:
        if all(attr in v for v in vs_new):
            attributes_new[attr] = [v[attr] for v in vs_new]
        else:
            # Fill only the gaps; values that were supplied are kept.
            filler = default_for(attr)
            attributes_new[attr] = [v.get(attr, filler) for v in vs_new]

    attributes_new['Label'] = [v['name'] for v in vs_new]
    g.add_vertices(len(vs_new), attributes=attributes_new)


In [21]:
def update_edges(g: ig.Graph, es):
    """Add edges to ``g`` or bump the weight of edges that already exist.

    Every occurrence of a vertex pair counts once: the first occurrence of a
    pair not yet in the graph creates an edge with ``weight=1``, and every
    further occurrence (in this call or a later one) increments ``weight``.

    Pairs are resolved one at a time against the current graph state. A pair
    repeated within a single call therefore accumulates weight on one edge
    instead of creating parallel edges.

    Args:
        g: Graph to mutate in place.
        es: A single ``(source, target)`` tuple, or an iterable of such pairs.
    """
    if isinstance(es, tuple):
        es = [es]

    for source, target in es:
        # error=False makes get_eid return -1 instead of raising when the
        # edge does not exist yet.
        edge_id = g.get_eid(source, target, error=False)
        if edge_id >= 0:
            g.es[edge_id]['weight'] += 1
        else:
            g.add_edge(source, target, weight=1)

In [22]:
def crawl_page_generator(g: ig.Graph, vs_id_gen: ig.UniqueIdGenerator, page_gen: pagegenerators.PageClassGenerator, group_vertex:dict = None):
    """Add every page produced by ``page_gen`` to ``g`` along with its links.

    For each page, in this order:
      1. a fully attributed vertex for the page itself is created or updated;
      2. if ``group_vertex`` is given, an edge group -> page is recorded and
         the page's ``'vertex_group'`` attribute is set to the group's name;
      3. pages it links to become name/id-only vertices with page -> target edges;
      4. pages linking to it become name/id-only vertices with source -> page edges.

    Pages are preloaded through the module-level ``site`` object. A summary
    line per page is printed.

    Args:
        g: Graph mutated in place via update_vertices / update_edges.
        vs_id_gen: Maps page titles to vertex ids.
        page_gen: Iterable of pages to crawl.
        group_vertex: Optional vertex dict (needs ``'id'`` and ``'name'``)
            that every crawled page is attached to.
    """
    def stubs_for(pages):
        # Minimal vertex records (title + id) for pages that are only referenced.
        records = []
        for page in pages:
            page_title = page.title()
            records.append({'name': page_title, 'id': vs_id_gen[page_title]})
        return records

    # Other page data that could be recorded later: lastNonBotUser(),
    # editTime(), extlinks(), oldest_revision.
    for thispage in site.preloadpages(page_gen, pageprops=True):
        own_title = thispage.title()
        this_vertex = {'name': own_title, 'id': vs_id_gen[own_title]}
        this_vertex['pageid'] = thispage.pageid
        this_vertex['revision_count'] = thispage.revision_count()
        this_vertex['namespace'] = str(thispage.namespace())
        this_vertex['categories'] = ', '.join(category.title() for category in thispage.categories())
        this_vertex['contributors'] = ', '.join(thispage.contributors().keys())
        update_vertices(g, this_vertex)

        own_id = this_vertex['id']

        if group_vertex:
            update_edges(g, (group_vertex['id'], own_id))
            g.vs[own_id]['vertex_group'] = group_vertex['name']

        # Outgoing links: this page -> linked page.
        linkedpages = stubs_for(thispage.linkedPages())
        update_vertices(g, linkedpages)
        update_edges(g, [(own_id, stub['id']) for stub in linkedpages])

        # Incoming links: linking page -> this page.
        backlinks = stubs_for(thispage.backlinks())
        update_vertices(g, backlinks)
        update_edges(g, [(stub['id'], own_id) for stub in backlinks])

        print(f'{thispage.title()} ({this_vertex["id"]}): \n\tlinkedPages: {linkedpages}, \n\tbacklinks: {backlinks}\n')

In [23]:
def crawl_category(g: ig.Graph, vs_id_gen: ig.UniqueIdGenerator, site: pywikibot.Site, categoryname: str):
    """Crawl all pages of a wiki category into ``g``.

    A vertex representing the category itself is created (or updated) first,
    with placeholder page attributes. Every page in the category is then
    crawled with that vertex as its group.

    Args:
        g: Graph mutated in place.
        vs_id_gen: Maps page titles to vertex ids.
        site: Site used to look up the category.
        categoryname: Name of the category to crawl.
    """
    cat = pywikibot.Category(site, categoryname)
    cat_title = cat.title()

    # The category is not crawled as a page, so its page fields are placeholders.
    group_vertex = dict(
        name=cat_title,
        id=vs_id_gen[cat_title],
        pageid=0,
        revision_count=0,
        namespace=str(cat.namespace()),
        categories='',
        contributors='',
        vertex_group='',
    )
    update_vertices(g, group_vertex)

    member_pages = pagegenerators.CategorizedPageGenerator(cat)
    crawl_page_generator(g, vs_id_gen, member_pages, group_vertex=group_vertex)

In [24]:
# Shared state used by the crawl cells below.
g = ig.Graph()  # graph that update_vertices / update_edges mutate in place
vs_id_gen = ig.UniqueIdGenerator()  # page title -> vertex id, looked up as vs_id_gen[title] by the crawl functions
es_id_gen = ig.UniqueIdGenerator()  # NOTE(review): not referenced by any other visible cell - confirm whether it is still needed
gephi = igg.GephiConnection()  # default-constructed; presumably targets a locally running Gephi streaming endpoint - TODO confirm
streamer = igg.GephiGraphStreamer()  # used as streamer.post(g, gephi) after each crawl

In [25]:
# Crawl the 'Season 1' category into g (prints one summary per crawled page),
# then post the resulting graph through the Gephi connection.
crawl_category(g, vs_id_gen, site, 'Season 1')
streamer.post(g, gephi)

Retrieving 21 pages from mrrobot:mrrobot.


Comet Electric (1): 
	linkedPages: [{'name': 'Allsafe', 'id': 2}, {'name': 'E Corp', 'id': 3}, {'name': 'Elliot Alderson', 'id': 4}, {'name': 'Eps1.1 ones-and-zer0es.mpeg', 'id': 5}, {'name': 'Eps2.0 unm4sk-pt1.tc', 'id': 6}, {'name': 'Eps2.7 init5.fve', 'id': 7}, {'name': 'Five/Nine Hack', 'id': 8}, {'name': 'Fsociety', 'id': 9}, {'name': 'Mr. Robot', 'id': 10}, {'name': 'Steel Mountain', 'id': 11}], 
	backlinks: [{'name': 'Mr. Robot', 'id': 10}, {'name': 'Steel Mountain', 'id': 11}, {'name': 'Frank Cody', 'id': 12}]

Dark Army (13): 
	linkedPages: [{'name': 'Gao Xun', 'id': 14}, {'name': 'Mr. Williams', 'id': 15}, {'name': '401 Unauthorized', 'id': 16}, {'name': 'Allsafe', 'id': 2}, {'name': 'Angela Moss', 'id': 17}, {'name': 'Cisco', 'id': 18}, {'name': 'Darlene', 'id': 19}, {'name': 'Darlene Alderson', 'id': 20}, {'name': 'Deus Group', 'id': 21}, {'name': 'Dominique DiPierro', 'id': 22}, {'name': 'E Corp', 'id': 3}, {'name': 'E Corp Headquarters', 'id': 23}, {'name': 'Elliot Alders

In [26]:
# Crawl the 'Season 1 characters' category into the same graph g, reusing the
# same id generator, then post g through the Gephi connection again.
crawl_category(g, vs_id_gen, site, 'Season 1 characters')
streamer.post(g, gephi)

Retrieving 32 pages from mrrobot:mrrobot.


Angela Moss (17): 
	linkedPages: [{'name': 'Emily Moss', 'id': 304}, {'name': '401 Unauthorized', 'id': 16}, {'name': 'Allsafe Cybersecurity', 'id': 275}, {'name': 'Antara Nayar', 'id': 229}, {'name': 'Cisco', 'id': 18}, {'name': 'Dark Army', 'id': 13}, {'name': 'Darlene', 'id': 19}, {'name': 'Dominique DiPierro', 'id': 22}, {'name': 'E Corp', 'id': 3}, {'name': 'Elliot Alderson', 'id': 4}, {'name': 'Eps1.0 hellofriend.mov', 'id': 78}, {'name': 'Eps1.1 ones-and-zer0es.mpeg', 'id': 5}, {'name': 'Eps1.2 d3bug.mkv', 'id': 96}, {'name': 'Eps1.3 da3m0ns.mp4', 'id': 97}, {'name': 'Eps1.4 3xpl0its.wmv', 'id': 24}, {'name': 'Eps1.5 br4ve-trave1er.asf', 'id': 98}, {'name': 'Eps1.6 v1ew-s0urce.flv', 'id': 99}, {'name': 'Eps1.7 wh1ter0se.m4v', 'id': 25}, {'name': 'Eps1.8 m1rr0r1ng.qt', 'id': 100}, {'name': 'Eps1.9 zer0-day.avi', 'id': 101}, {'name': 'Eps3.0 power-saver-mode.h', 'id': 30}, {'name': 'Fsociety', 'id': 9}, {'name': 'Gideon Goddard', 'id': 41}, {'name': 'Lloyd Chung', 'id': 105}, {'na

In [27]:
# import pickle

# with open('mrrobotgraph.pickle', 'wb') as handle:
#     pickle.dump(g, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('mrrobot_idgen.pickle', 'wb') as handle:
#     pickle.dump(vs_id_gen, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [28]:
# with open('mrrobotgraph.pickle', 'rb') as handle:
#     g = pickle.load(handle)

# with open('mrrobot_idgen.pickle', 'rb') as handle:
#     vs_id_gen = pickle.load(handle)

In [29]:
# Size of the crawled graph.
print(f'vertices: {len(g.vs)}, edges: {len(g.es)}')

# Spot-check the first two vertices and their stored attributes.
for vertex_index in (0, 1):
    print(g.vs[vertex_index])

# print(g)

vertices: 674, edges: 2259
igraph.Vertex(<igraph.Graph object at 0x7fc6eae425e0>, 0, {'pageid': 0, 'vertex_group': '', 'contributors': '', 'categories': '', 'namespace': ':Category:', 'revision_count': 0, 'name': 'Category:Season 1', 'id': 0, 'Label': 'Category:Season 1'})
igraph.Vertex(<igraph.Graph object at 0x7fc6eae425e0>, 1, {'pageid': 4617, 'vertex_group': 'Category:Season 1', 'contributors': 'Tsercele', 'categories': 'Category:Season 1, Category:Season 2, Category:Organizations', 'namespace': ':', 'revision_count': 3, 'name': 'Comet Electric', 'id': 1, 'Label': 'Comet Electric'})
