# Explore the ID Graph

This notebook contains some exploration of the ID graph generated by the clustering operation.

In [1]:
from concurrent.futures import ThreadPoolExecutor

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [4]:
from graph_tool.all import *



In [5]:
from bookdata import script_log, db, schema
from bookdata.graph import GraphLoader
_log = script_log('notebook')
_log.info('logging')

2020-04-07 10:34:57 [INFO   ] notebook logging


In [6]:
tp = ThreadPoolExecutor(8)

## Load the Graph

In [7]:
g = load_graph('data/id-graph.gt')

In [8]:
verts = pd.DataFrame({
    'code': g.vp.code.a,
    'source': g.vp.source.a,
    'cluster': g.vp.cluster.a
})
verts.head()

Unnamed: 0,code,source,cluster
0,908572387,9,0
1,908572388,9,1
2,908572389,9,2
3,908572390,9,3
4,908572391,9,4


In [9]:
nclusters = verts['cluster'].max() + 1
nclusters

12234574

In [10]:
gmin = g.copy()
for k in list(gmin.vp.keys()):
    del gmin.vp[k]
gmin

<Graph object, undirected, with 56079303 vertices and 47038041 edges at 0x7f2db18f96d0>

Count nodes in each cluster by type:

In [11]:
c_idx = pd.RangeIndex(nclusters, name='cluster')
def __count_sources(s):
    label = schema.src_label_rev[s]
    hist = verts.loc[verts['source'] == s, 'cluster'].value_counts()
    hist = hist.reindex(c_idx, fill_value=0)
    return (label, hist)
clusters = pd.DataFrame(dict(tp.map(__count_sources, verts['source'].unique())))
clusters.head()

Unnamed: 0_level_0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17,1,7,5,9,1
1,2,0,1,1,1,1
2,38,3,14,3,17,1
3,2,0,1,1,1,1
4,4,1,2,1,1,1


Compute the total size of each cluster:

In [12]:
clusters['Total'] = clusters.sum(axis=1)

Let's see the largest clusters:

In [13]:
clusters.nlargest(10, 'Total')

Unnamed: 0_level_0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W,Total
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1218,7818,610,3432,455,2266,85,14666
1339613,7520,0,3760,1,0,0,11281
1339849,6518,7,3401,35,0,0,9961
301,3712,408,2007,322,883,78,7410
1341048,4276,0,2139,1,0,0,6416
325,3637,179,1464,30,983,16,6309
386,3545,162,1424,103,1004,27,6265
8036932,1,0,2368,2368,0,0,4737
1342210,2820,0,1410,1,0,0,4231
1264,1937,46,714,64,815,13,3589


And look at distributions of cluster sizes:

In [14]:
clusters.describe()

Unnamed: 0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W,Total
count,12234570.0,12234570.0,12234570.0,12234570.0,12234570.0,12234570.0,12234570.0
mean,1.921051,0.4595574,1.16897,0.8094335,0.1340052,0.0906571,4.583674
std,5.314365,0.6767573,2.779625,0.9684458,1.265161,0.3009728,9.507801
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,0.0,1.0,0.0,0.0,0.0,3.0
50%,2.0,0.0,1.0,1.0,0.0,0.0,4.0
75%,2.0,1.0,1.0,1.0,0.0,0.0,5.0
max,7818.0,610.0,3760.0,2368.0,2266.0,85.0,14666.0


What are the largest clusters?

In [18]:
largest = clusters.nlargest(100, 'Total')
largest

Unnamed: 0_level_0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W,Total
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1218,7818,610,3432,455,2266,85,14666
1339613,7520,0,3760,1,0,0,11281
1339849,6518,7,3401,35,0,0,9961
301,3712,408,2007,322,883,78,7410
1341048,4276,0,2139,1,0,0,6416
...,...,...,...,...,...,...,...
1367282,480,0,240,1,0,0,721
16800,348,13,125,56,156,9,707
16821,325,52,216,55,51,7,706
8572,371,11,133,14,169,3,701


Compute the diameter of each cluster:

In [20]:
def __cluster_diam(c):
    mask = verts['cluster'] == c
    vs, = np.where(mask)
    dmap = g.new_vp('int')
    shortest_distance(g, vs[0], dist_map=dmap)
    return np.max(dmap.a[vs])
largest['Diam'] = pd.Series(largest.index.values, index=largest.index).progress_apply(__cluster_diam)
largest

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Unnamed: 0_level_0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W,Total,Diam
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1218,7818,610,3432,455,2266,85,14666,48
1339613,7520,0,3760,1,0,0,11281,4
1339849,6518,7,3401,35,0,0,9961,10
301,3712,408,2007,322,883,78,7410,38
1341048,4276,0,2139,1,0,0,6416,4
...,...,...,...,...,...,...,...,...
1367282,480,0,240,1,0,0,721,4
16800,348,13,125,56,156,9,707,17
16821,325,52,216,55,51,7,706,30
8572,371,11,133,14,169,3,701,14


And compute max normalized betweenness:

In [21]:
def __cluster_max_btw(c):
    mask = gmin.new_vp('bool')
    mask.a[:] = g.vp.cluster.a == c
    gv = GraphView(gmin, mask)
    gp = Graph(gv, prune=True)
    v_btw, e_btw = betweenness(gp)
    return np.max(v_btw.a)
largest['MaxBTW'] = pd.Series(largest.index.values, index=largest.index).progress_apply(__cluster_max_btw)
largest

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




Unnamed: 0_level_0,ISBN,LOC,OL-E,OL-W,GR-B,GR-W,Total,Diam,MaxBTW
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1218,7818,610,3432,455,2266,85,14666,48,0.497118
1339613,7520,0,3760,1,0,0,11281,4,0.999823
1339849,6518,7,3401,35,0,0,9961,10,0.999776
301,3712,408,2007,322,883,78,7410,38,0.527740
1341048,4276,0,2139,1,0,0,6416,4,0.999688
...,...,...,...,...,...,...,...,...,...
1367282,480,0,240,1,0,0,721,4,0.997218
16800,348,13,125,56,156,9,707,17,0.824019
16821,325,52,216,55,51,7,706,30,0.753363
8572,371,11,133,14,169,3,701,14,0.585267
