In [1]:
import networkx as nx
from scipy import sparse as sp
import numpy as np
import io
import pandas as pd
from glob import glob
from tqdm import tqdm_notebook
from DataRepresentation import Document
from collections import Counter

In [2]:
def read_texts(filename):
    with io.open(filename, newline='\n') as filin:
        return filin.readlines()

In [35]:
texts = read_texts('../../datasets/sentiment/3Label/vader_movie/texts.txt')
scores = read_texts('../../datasets/sentiment/3Label/vader_movie/score.txt')
scores = list(map(int, scores))

In [3]:
texts  = read_texts('../../datasets/topics/webkb/texts.txt')
scores = read_texts('../../datasets/topics/webkb/score.txt')
scores = list(map(int, scores))

In [20]:
texts = read_texts('../../datasets/sentiment/3Label/irony/texts.txt')
scores = read_texts('../../datasets/sentiment/3Label/irony/score.txt')
scores = list(map(int, scores))

# Representação dos Documentos
A representação Gs é a lista das representações dos documentos:

\begin{equation} Gs = [ ((LG, cooccur), (G, term) ) ] \end{equation}

onde
* $LG$ é a matriz do linegraph de co-ocorrência das arestas (co-ocorrência de co-ocorrências);
* $cooccur$ é a lista das co-ocorrências. Equivale a co-ocorrência respectiva a posição da linha e coluna na matriz $LG$;
* $G$ é a matriz do grafo de co-ocorrência dos termos;
* $term$ é a lista dos termos. Equivale ao termo respectivo a posição da linha e coluna na matriz $G$;

In [4]:
Gs = list(Document.build_sparse_matrices(texts, w=1, verbose=True))

Building documents: 100%|██████████| 8199/8199 [00:59<00:00, 137.83it/s]


In [5]:
Gs[0][0][1]

[('img', 'world'),
 ('click', 'img'),
 ('telegraph', 'world'),
 ('biggest', 'world'),
 ('time', 'world'),
 ('famou', 'world'),
 ('telegraph', 'time'),
 ('biggest', 'wast'),
 ('london', 'time'),
 ('time', 'york'),
 ('time', 'wast'),
 ('good', 'time'),
 ('time', 'work'),
 ('famou', 'rec'),
 ('click', 'pictur'),
 ('pictur', 'resum'),
 ('resum', 'work'),
 ('page', 'work'),
 ('place', 'work'),
 ('summer', 'work'),
 ('come', 'page'),
 ('page', 'web'),
 ('page', 'www'),
 ('mail', 'page'),
 ('issu', 'place'),
 ('corpor', 'place'),
 ('good', 'summer'),
 ('put', 'summer'),
 ('back', 'come'),
 ('check', 'web'),
 ('web', 'www'),
 ('mail', 'sparekh'),
 ('back', 'newspap'),
 ('net', 'newspap'),
 ('check', 'net'),
 ('check', 'london'),
 ('check', 'plug'),
 ('plug', 'put'),
 ('rec', 'sport'),
 ('cricket', 'sport'),
 ('cricket', 'forum'),
 ('cricket', 'out'),
 ('cricket', 'word'),
 ('depth', 'forum'),
 ('in', 'out'),
 ('observ', 'word'),
 ('depth', 'enlighten'),
 ('discuss', 'enlighten'),
 ('discuss', 

In [6]:
len(texts)

8199

In [7]:
df_term = Counter()
list(map(df_term.update, [ nodes_G for ((LG,nodes_LG),(G,nodes_G)) in Gs ]))
termid = { t:i for (i,t) in enumerate(df_term.keys()) }
idterm = list(zip(*sorted(termid.items(), key=lambda x: x[1])))[0]

df_term_co = Counter()
list(map(df_term_co.update, [ nodes_LG for ((LG,nodes_LG),(G,nodes_G)) in Gs ]))
termid_co = { t:i for (i,t) in enumerate(df_term_co.keys()) }
idterm_co = list(zip(*sorted(termid_co.items(), key=lambda x: x[1])))[0]
x_cord, y_cord = list(zip(*[ (termid[s], termid[t]) for s,t in idterm_co ]))

cf = Counter(scores)
idclass = list(cf.keys())
classid = { c:i for (i,c) in enumerate(idclass) }

In [8]:
print("Número de termos: ", len(termid))
print("Número de co-ocorrências: ", len(termid_co))
print("Número de classes: ", len(idclass))

Número de termos:  16887
Número de co-ocorrências:  582247
Número de classes:  7


In [9]:
M = [ sp.csr_matrix((len(termid),len(termid)), dtype=np.float64) for _ in idclass ]
Adj = [ sp.csr_matrix((len(termid_co),len(termid_co)), dtype=np.float64) for _ in idclass ]

Gs_resized  = []
LGs_resized = []

In [10]:
def refactor_matrix(m, nodelist, mapper):
    if len(nodelist) < 1 or len(m.nonzero()[0]) < 1:
        return sp.csr_matrix( (len(mapper), len(mapper)) )
    nodes_term_id = [ mapper[t] for t in nodelist ]
    data = np.array(m[m.nonzero()])[0]
    (row_ind, col_ind) = m.nonzero()
    row_ind = [ nodes_term_id[k] for k in row_ind ]
    col_ind = [ nodes_term_id[k] for k in col_ind ]

    return sp.csr_matrix((data, (row_ind, col_ind)), dtype=np.float64, shape=(len(mapper), len(mapper)))

In [11]:
for y, ((LG,nodes_LG),(G,nodes_G)) in tqdm_notebook(zip(scores, Gs), total=len(Gs)):
    y = classid[y]
    
    M_doc = refactor_matrix(G, nodes_G, termid)
    M[y] += M_doc
    
    Adj_doc = refactor_matrix(LG, nodes_LG, termid_co)
    Adj[y] += Adj_doc
    




In [12]:
def elementwise_div(X,Y):
    data = np.array(X[ Y.nonzero() ] / Y[ Y.nonzero() ])[0]
    return sp.csr_matrix( (data, Y.nonzero()) )
def norm_axis(X, axis=1):
    new_matrix = sp.csr_matrix( X.shape )
    new_matrix[ X.nonzero() ] = X[ X.nonzero() ] / X.sum(axis=axis)
    return new_matrix

In [13]:
sum_adj = np.sum(Adj)
sum_M   = np.sum(M)

In [14]:
sum_adj, sum_M

(<582247x582247 sparse matrix of type '<class 'numpy.float64'>'
 	with 14862916 stored elements in Compressed Sparse Row format>,
 <16887x16887 sparse matrix of type '<class 'numpy.float64'>'
 	with 1161113 stored elements in Compressed Sparse Row format>)

In [30]:
%%time
nsum_adj = elementwise_div(sum_adj, sum_adj)
nadj = []
for adj in Adj:
    nadj_ = elementwise_div(adj, sum_adj)
    #nadj_ = elementwise_div(nadj_, (nsum_adj - nadj_))
    nadj_ = nadj_ - (nsum_adj - nadj_)
    nadj.append(nadj_)

CPU times: user 10.3 s, sys: 3.07 s, total: 13.4 s
Wall time: 13.4 s


In [31]:
%%time
nM = []
nsum_M = elementwise_div(sum_M, sum_M)
for m in M:
    nM_ = elementwise_div(m, sum_M)
    nM_ = nM_ - (nsum_M - nM_)
    nM.append(nM_)

CPU times: user 989 ms, sys: 7.15 ms, total: 996 ms
Wall time: 995 ms


In [32]:
from sklearn.metrics.pairwise import paired_distances

In [33]:
for i in range(len(nadj)):
    adj_cosine = (1.-paired_distances(Adj_doc, nadj[i], metric='cosine'))
    adj_comp = sp.csr_matrix( (adj_cosine, (x_cord,y_cord)), shape=(len(termid), len(termid)) )
    adj_comp[y_cord,x_cord] = adj_cosine
    
    
    m_mean_sec   = (1.-paired_distances(adj_comp, nM[i], metric='cosine'))
    m_mean   = (1.-paired_distances(M_doc, nM[i], metric='cosine'))
    print( y == i, "\n\tcos(adj_doc, adj[i]) = %.5f;\n\tcos(M_doc, M[i]) = %.5f;\n\tcos^2(M_doc, M[i]) = %.5f" % (adj_comp.mean(), m_mean.mean(), m_mean_sec.mean()) )
    

  after removing the cwd from sys.path.


True 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49859;
	cos^2(M_doc, M[i]) = -0.74285
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49777;
	cos^2(M_doc, M[i]) = 0.14291
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49774;
	cos^2(M_doc, M[i]) = -0.84190
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49774;
	cos^2(M_doc, M[i]) = -0.97583
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49774;
	cos^2(M_doc, M[i]) = -0.88086
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49773;
	cos^2(M_doc, M[i]) = -0.74128
False 
	cos(adj_doc, adj[i]) = 0.00204;
	cos(M_doc, M[i]) = 0.49774;
	cos^2(M_doc, M[i]) = -0.98229


In [35]:
from sklearn.metrics.pairwise import paired_cosine_distances

In [37]:
for t, (y, ((LG,nodes_LG),(G,nodes_G))) in tqdm_notebook(enumerate(zip(scores, Gs)), total=len(Gs)):
    y = classid[y]
    
    print(t)
    M_doc = refactor_matrix(G, nodes_G, termid)
    Adj_doc = refactor_matrix(LG, nodes_LG, termid_co)
    
    for i in range(len(nadj)):
        adj_cosine = (1.-paired_distances(Adj_doc, nadj[i], metric='cosine'))
        adj_comp = sp.csr_matrix( (adj_cosine, (x_cord,y_cord)), shape=(len(termid), len(termid)) )
        adj_comp[y_cord,x_cord] = adj_cosine

        m_mean_sec   = (1.-paired_distances(adj_comp, nM[i], metric='cosine'))
        m_mean   = (1.-paired_distances(M_doc, nM[i], metric='cosine'))
        sim_means = (1.-paired_cosine_distances(m_mean, m_mean_sec))
        print('\t', idclass[i], y == i, "\n\t\tcos(adj_doc, adj[i]) = %.5f;\n\t\tcos(M_doc, M[i]) = %.5f;\n\t\tcos^2(M_doc, M[i]) = %.5f" % (adj_comp.mean(), m_mean.mean(), m_mean_sec.mean()) )
    

0


  # This is added back by InteractiveShellApp.init_path()


ValueError: Expected 2D array, got 1D array instead:
array=[ 0.03152909 -0.00410781 -0.00448678 ...  0.5         0.5
  0.5       ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [66]:
%%time
nadj = []
for adj in Adj:
    nadj.append(elementwise_div(adj, sum_adj))

  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 1min 4s, sys: 4.12 s, total: 1min 8s
Wall time: 1min 8s


In [31]:
sum_adj2 = sp.linalg.norm(sum_adj, axis=0, ord=0)

array([0.9216558 , 0.84767707, 1.        , ..., 1.        , 1.        ,
       1.        ])

In [97]:
ajd_0_norm = true_elementwise_div(Adj[0], sum_adj)
ajd_1_norm = true_elementwise_div(Adj[1], sum_adj)
ajd_2_norm = true_elementwise_div(Adj[2], sum_adj)

In [88]:
val = np.repeat(sum_adj, Adj[0].getnnz())
Adj[0].data /= val

TypeError: ufunc 'true_divide' output (typecode 'O') could not be coerced to provided output parameter (typecode 'd') according to the casting rule ''same_kind''

In [82]:
ajd_0_norm = sp.dok_matrix( sum_adj.shape )
ajd_0_norm[ sum_adj.nonzero() ] = Adj[0][sum_adj.nonzero()] / sum_adj[sum_adj.nonzero()]

ajd_1_norm = sp.dok_matrix( sum_adj.shape )
ajd_1_norm[ sum_adj.nonzero() ] = Adj[1][sum_adj.nonzero()] / sum_adj[sum_adj.nonzero()]

ajd_2_norm = sp.dok_matrix( sum_adj.shape )
ajd_2_norm[ sum_adj.nonzero() ] = Adj[2][sum_adj.nonzero()] / sum_adj[sum_adj.nonzero()]

In [76]:
classid

{-1: 1, 0: 2, 1: 0}

In [86]:
i = 1000

for i in range(len(scores)):
    p1 = (1.-paired_distances(ajd_0_norm, LGs_resized[i], metric='cosine')).sum()
    p0 = (1.-paired_distances(ajd_2_norm, LGs_resized[i], metric='cosine')).sum()
    pm1 = (1.-paired_distances(ajd_1_norm, LGs_resized[i], metric='cosine')).sum()
    
    print("%s 1: %.4f 0: %.4f -1: %.4f" % (scores[i], p1, p0, pm1))

1 1: 52113.8116 0: 70596.0000 -1: 52721.6325
1 1: 52114.4252 0: 70594.0000 -1: 52718.2518
-1 1: 52106.0000 0: 70602.5000 -1: 52728.5000
1 1: 52108.7071 0: 70601.0000 -1: 52725.0000
1 1: 52110.0972 0: 70599.5000 -1: 52723.5000
1 1: 52110.7887 0: 70599.0000 -1: 52723.0000
1 1: 52108.2845 0: 70601.5000 -1: 52725.5000
1 1: 52109.8594 0: 70599.5000 -1: 52723.9273
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52109.4082 0: 70600.5000 -1: 52724.5000
1 1: 52108.5774 0: 70601.5000 -1: 52725.5000
1 1: 52109.3162 0: 70600.5000 -1: 52724.5000
1 1: 52108.8536 0: 70601.0000 -1: 52725.0000
1 1: 52108.1547 0: 70601.0000 -1: 52725.0000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52107.4082 0: 70602.5000 -1: 52726.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52108.8742 0: 70601.0000 -1: 52725.8458
1 1: 52108.1543 0: 70601.5000 -1: 52725.5000
1 1: 52111.1876 0: 70597.0000 -1: 52721.0000
1 1: 5211

1 1: 52107.5774 0: 70602.5000 -1: 52726.5000
1 1: 52108.7132 0: 70601.0000 -1: 52725.0000
1 1: 52108.7071 0: 70601.5000 -1: 52725.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52108.1197 0: 70601.5000 -1: 52725.5000
1 1: 52113.8854 0: 70595.0000 -1: 52719.0000
1 1: 52110.5774 0: 70599.5000 -1: 52723.5000
1 1: 52110.4142 0: 70599.5000 -1: 52723.5000
1 1: 52109.3015 0: 70600.5000 -1: 52724.5000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
0 1: 52101.5000 0: 70609.0000 -1: 52722.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52112.0000 0: 70598.5000 -1: 52722.5000
1 1: 52109.8165 0: 70600.5000 -1: 52724.5000
1 1: 52111.0774 0: 70599.0000 -1: 52723.0000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52108.4009 0: 70601.5000 -1: 52725.5000
1 1: 52108.4000 0: 70601.5000 -1: 52725.5000
1 1: 52109.0000 0: 70601.5000 -1: 52725.5000
1 1: 52111.4650 0: 70598.5000 -1: 52722.5000
1 1: 52109.7556 0: 70600.0000 -1: 52725.1715
1 1: 52115.6325 0: 70594.5000 -1: 52718.5000
1 1: 52110

1 1: 52109.3938 0: 70600.5000 -1: 52724.5000
1 1: 52109.5919 0: 70599.0000 -1: 52723.2088
1 1: 52108.5345 0: 70601.5000 -1: 52725.5000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52108.7845 0: 70601.0000 -1: 52725.0000
1 1: 52110.5236 0: 70599.5000 -1: 52723.5000
1 1: 52107.8780 0: 70602.0000 -1: 52726.0000
1 1: 52111.0236 0: 70599.0000 -1: 52723.0000
1 1: 52110.3047 0: 70599.0000 -1: 52723.0000
1 1: 52112.6715 0: 70596.5000 -1: 52721.2634
1 1: 52110.6494 0: 70599.0000 -1: 52723.0000
1 1: 52108.9264 0: 70601.0000 -1: 52725.0000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52107.2425 0: 70602.5000 -1: 52726.5000
1 1: 52109.8165 0: 70600.5000 -1: 52724.5000
1 1: 52110.6332 0: 70599.0000 -1: 52723.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52109.3108 0: 70598.5000 -1: 52722.5000
1 1: 52113.6192 0: 70595.5000 -1: 52720.0123
1 1: 52109.1228 0: 70599.5000 -1: 52723.5523
1 1: 52109.2071 0: 70601.0000 -1: 52725.0000
1 1: 52112

1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52108.6547 0: 70601.5000 -1: 52725.5000
1 1: 52108.8165 0: 70601.5000 -1: 52725.5000
1 1: 52110.8165 0: 70599.5000 -1: 52723.5000
1 1: 52108.3355 0: 70601.0000 -1: 52725.0000
1 1: 52111.7071 0: 70598.5000 -1: 52722.5000
1 1: 52110.7747 0: 70599.0000 -1: 52723.0000
1 1: 52111.2071 0: 70599.0000 -1: 52723.0000
1 1: 52114.2071 0: 70596.0000 -1: 52720.0000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52111.9255 0: 70597.5000 -1: 52721.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52109.7637 0: 70600.0000 -1: 52724.0000
1 1: 52111.2071 0: 70599.0000 -1: 52723.0000
1 1: 52110.5000 0: 70599.5000 -1: 52723.5000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52108.7071 0: 70601.5000 -1: 52725.5000
1 1: 52112.3165 0: 70598.0000 -1: 52722.0000
1 1: 52108.9472 0: 70601.0000 -1: 52725.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52108.8165 0: 70601.5000 -1: 52725.5000
1 1: 52113

1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52111.0345 0: 70599.0000 -1: 52723.0000
1 1: 52114.6325 0: 70595.0000 -1: 52719.0000
1 1: 52107.5774 0: 70602.5000 -1: 52726.5000
1 1: 52112.3351 0: 70596.0000 -1: 52720.0000
1 1: 52111.7098 0: 70597.5000 -1: 52721.5000
1 1: 52108.5104 0: 70601.0000 -1: 52725.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52113.9142 0: 70596.0000 -1: 52720.0000
1 1: 52107.6786 0: 70601.5000 -1: 52725.5000
1 1: 52112.5774 0: 70597.5000 -1: 52721.5000
1 1: 52110.5000 0: 70599.5000 -1: 52723.5000
1 1: 52108.3288 0: 70601.5000 -1: 52725.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52111.3938 0: 70598.5000 -1: 52722.5000
1 1: 52111.6670 0: 70598.0000 -1: 52722.0000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52111.5585 0: 70597.0000 -1: 52721.3304
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52108.5407 0: 70601.0000 -1: 52725.0000
1 1: 52109

1 1: 52112.8165 0: 70597.5000 -1: 52721.5000
-1 1: 52101.0000 0: 70597.5000 -1: 52733.2071
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52111.4181 0: 70597.0000 -1: 52721.0000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52110.7071 0: 70599.5000 -1: 52723.5000
1 1: 52108.7071 0: 70601.5000 -1: 52725.5000
1 1: 52111.7921 0: 70596.5000 -1: 52720.5000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52108.3435 0: 70600.5000 -1: 52724.5000
1 1: 52109.9142 0: 70600.0000 -1: 52724.0000
1 1: 52109.0784 0: 70599.5000 -1: 52723.5000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52107.6317 0: 70602.0000 -1: 52726.5101
1 1: 52108.3536 0: 70601.5000 -1: 52725.5000
1 1: 52112.9472 0: 70597.0000 -1: 52721.0000
1 1: 52109.2582 0: 70600.5000 -1: 52724.5000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52109.8118 0: 70599.0000 -1: 52723.0000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
-1 1: 52102.5000 0: 70599.0000 -1: 52731.7746
1 1: 52111.4487 0: 70598.0000 -1: 52722.0000
1 1: 521

1 1: 52109.5679 0: 70600.0000 -1: 52724.0000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52109.9472 0: 70600.0000 -1: 52724.0000
1 1: 52110.6975 0: 70599.0000 -1: 52723.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52112.0000 0: 70598.5000 -1: 52722.5000
1 1: 52110.0714 0: 70598.5000 -1: 52723.1349
1 1: 52108.9916 0: 70600.5000 -1: 52724.5000
1 1: 52114.7071 0: 70595.5000 -1: 52719.5000
1 1: 52109.5000 0: 70600.5000 -1: 52724.5000
1 1: 52112.8938 0: 70596.5000 -1: 52720.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52109.7098 0: 70600.0000 -1: 52724.0000
1 1: 52111.0774 0: 70599.0000 -1: 52723.0000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52109.7845 0: 70599.5000 -1: 52723.5000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52112.0000 0: 70598.5000 -1: 52722.5000
1 1: 52107.5774 0: 70602.5000 -1: 52726.5000
1 1: 52107.5246 0: 70602.0000 -1: 52726.0000
1 1: 52113.6919 0: 70595.5000 -1: 52719.5000
1 1: 52112.1364 0: 70597.5000 -1: 52721.5000
1 1: 52110

1 1: 52108.8430 0: 70601.0000 -1: 52725.0000
1 1: 52109.2071 0: 70601.0000 -1: 52725.0000
1 1: 52112.2071 0: 70598.0000 -1: 52722.0000
1 1: 52108.7637 0: 70601.0000 -1: 52725.0000
1 1: 52110.5607 0: 70599.0000 -1: 52723.0000
-1 1: 52106.0000 0: 70602.5000 -1: 52728.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52114.5000 0: 70595.5000 -1: 52719.5000
1 1: 52110.7071 0: 70599.5000 -1: 52723.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
-1 1: 52102.5000 0: 70599.0000 -1: 52732.0000
1 1: 52109.0000 0: 70601.5000 -1: 52725.5000
1 1: 52109.1543 0: 70599.5000 -1: 52723.5000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52109.5038 0: 70600.0000 -1: 52724.0000
1 1: 52110.7572 0: 70598.5000 -1: 52722.5000
1 1: 52109.8396 0: 70600.0000 -1: 52724.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52107.7500 0: 70602.0000 -1: 52726.0000
1 1: 52109.0000 0: 70601.5000 -1: 52725.5000
1 1: 52111.5774 0: 70598.5000 -1: 52722.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 521

1 1: 52108.2071 0: 70602.0000 -1: 52726.0000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52112.1547 0: 70597.0000 -1: 52721.0000
1 1: 52111.4009 0: 70598.5000 -1: 52722.5000
1 1: 52108.4082 0: 70601.5000 -1: 52725.5000
1 1: 52110.2429 0: 70599.0000 -1: 52723.0000
1 1: 52108.5000 0: 70601.5000 -1: 52725.5000
1 1: 52110.2694 0: 70599.0000 -1: 52723.0000
1 1: 52112.8614 0: 70596.5000 -1: 52720.5000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52109.3780 0: 70600.5000 -1: 52724.5000
1 1: 52111.9609 0: 70596.0000 -1: 52720.0000
1 1: 52107.7845 0: 70602.0000 -1: 52726.0000
1 1: 52111.4597 0: 70597.0000 -1: 52722.7634
1 1: 52112.8922 0: 70597.0000 -1: 52721.0000
1 1: 52114.4184 0: 70594.5000 -1: 52718.5000
1 1: 52109.6042 0: 70599.5000 -1: 52723.5000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52110.3165 0: 70599.5000 -1: 52723.5000
1 1: 52109.7071 0: 70600.5000 -1: 52724.5000
1 1: 52110.9737 0: 70597.0000 -1: 52721.0000
1 1: 52109.1325 0: 70601.0000 -1: 52725.0000
1 1: 52111

1 1: 52107.0000 0: 70603.5000 -1: 52727.5000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52112.1213 0: 70597.5000 -1: 52721.5000
1 1: 52109.0000 0: 70601.5000 -1: 52725.5000
1 1: 52109.0000 0: 70601.5000 -1: 52725.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52107.0000 0: 70603.5000 -1: 52727.5000
1 1: 52109.6317 0: 70599.5000 -1: 52723.5000
1 1: 52110.1325 0: 70600.0000 -1: 52724.0000
1 1: 52108.3922 0: 70601.5000 -1: 52725.5000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52110.8165 0: 70599.5000 -1: 52723.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52111.5458 0: 70598.0000 -1: 52722.0000
1 1: 52114.1019 0: 70595.0000 -1: 52719.0000
1 1: 52107.5000 0: 70602.5000 -1: 52726.5000
1 1: 52113.0000 0: 70597.5000 -1: 52721.5000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52110.4558 0: 70598.5000 -1: 52722.5000
1 1: 52110.6180 0: 70599.0000 -1: 52723.0000
1 1: 52111.3165 0: 70599.0000 -1: 52723.0000
-1 1: 5210

1 1: 52113.1325 0: 70597.0000 -1: 52721.0000
1 1: 52112.0000 0: 70598.5000 -1: 52722.5000
1 1: 52109.0173 0: 70599.5000 -1: 52723.5000
1 1: 52110.2071 0: 70600.0000 -1: 52724.0000
1 1: 52107.8536 0: 70602.0000 -1: 52726.0000
1 1: 52107.6335 0: 70602.0000 -1: 52726.0000
1 1: 52108.1881 0: 70601.5000 -1: 52725.5000
1 1: 52112.0947 0: 70598.1387 -1: 52721.0000
1 1: 52112.3305 0: 70597.0000 -1: 52721.0000
1 1: 52112.0499 0: 70597.0000 -1: 52721.0000
1 1: 52116.0000 0: 70594.5000 -1: 52718.5000
1 1: 52114.1325 0: 70596.0000 -1: 52720.0000
1 1: 52113.7269 0: 70595.5000 -1: 52719.5000
0 1: 52104.5000 0: 70606.0000 -1: 52725.0000
1 1: 52109.1547 0: 70600.0000 -1: 52724.0000
1 1: 52114.2071 0: 70596.0000 -1: 52720.0000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52113.2071 0: 70597.0000 -1: 52721.0000
1 1: 52108.7071 0: 70601.5000 -1: 52725.5000
1 1: 52109.9472 0: 70600.0000 -1: 52724.0000
1 1: 52111.3243 0: 70598.5000 -1: 52722.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52113

1 1: 52113.0000 0: 70597.5000 -1: 52721.5000
1 1: 52108.6325 0: 70601.5000 -1: 52725.5000
1 1: 52108.7637 0: 70601.0000 -1: 52725.0000
1 1: 52108.8780 0: 70601.0000 -1: 52725.0000
1 1: 52109.0774 0: 70601.0000 -1: 52725.0000
1 1: 52109.8366 0: 70599.5000 -1: 52723.5000
1 1: 52112.0000 0: 70598.5000 -1: 52722.5000
1 1: 52109.7071 0: 70600.5000 -1: 52724.5000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52107.3789 0: 70602.0000 -1: 52726.0000
1 1: 52112.5263 0: 70597.0000 -1: 52721.0000
-1 1: 52105.5000 0: 70602.0000 -1: 52727.7968
1 1: 52109.8396 0: 70600.0000 -1: 52724.0000
1 1: 52112.4107 0: 70597.0000 -1: 52721.0000
1 1: 52108.5774 0: 70601.5000 -1: 52725.5000
1 1: 52110.5000 0: 70600.0000 -1: 52724.0000
1 1: 52109.3250 0: 70600.0000 -1: 52724.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
-1 1: 52103.5000 0: 70600.0000 -1: 52731.0000
1 1: 52109.8611 0: 70599.5000 -1: 52723.5000
1 1: 52112.7307 0: 70597.0000 -1: 52721.0000
1 1: 52107.3780 0: 70602.5000 -1: 52726.5000
1 1: 521

1 1: 52111.6995 0: 70598.0000 -1: 52722.0000
1 1: 52111.4142 0: 70598.5000 -1: 52722.5000
1 1: 52111.9553 0: 70597.5000 -1: 52721.5000
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52109.5774 0: 70600.5000 -1: 52724.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52110.1547 0: 70599.0000 -1: 52723.0000
1 1: 52113.5727 0: 70594.5000 -1: 52718.5000
1 1: 52111.5000 0: 70599.0000 -1: 52723.0000
1 1: 52112.3900 0: 70597.5000 -1: 52721.5000
1 1: 52110.5851 0: 70599.0000 -1: 52723.0000
1 1: 52112.5774 0: 70597.5000 -1: 52721.5000
1 1: 52108.9490 0: 70601.0000 -1: 52725.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52110.0000 0: 70600.5000 -1: 52724.5000
1 1: 52108.5774 0: 70601.5000 -1: 52725.5000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52112.1213 0: 70597.5000 -1: 52721.5000
1 1: 52111.7071 0: 70598.5000 -1: 52722.5000
1 1: 52113.7071 0: 70596.5000 -1: 52720.5000
1 1: 52110.4082 0: 70599.5000 -1: 52723.5000
1 1: 52108.8536 0: 70601.0000 -1: 52725.0000
1 1: 52107

1 1: 52113.0000 0: 70597.0000 -1: 52721.0000
1 1: 52107.0000 0: 70603.5000 -1: 52727.5000
1 1: 52108.0000 0: 70602.0000 -1: 52726.0000
1 1: 52110.5553 0: 70599.0000 -1: 52723.0000
1 1: 52112.3555 0: 70597.0000 -1: 52721.0000
1 1: 52114.9016 0: 70594.0000 -1: 52718.0000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
1 1: 52110.6767 0: 70598.5000 -1: 52722.5000
1 1: 52114.2071 0: 70596.0000 -1: 52720.0000
1 1: 52111.8165 0: 70598.5000 -1: 52722.5000
1 1: 52111.7105 0: 70597.5000 -1: 52721.5000
1 1: 52109.7462 0: 70600.0000 -1: 52724.0000
1 1: 52113.4151 0: 70596.0000 -1: 52720.0000
1 1: 52110.8165 0: 70599.5000 -1: 52723.5000
-1 1: 52103.5000 0: 70600.0000 -1: 52731.0000
1 1: 52111.0000 0: 70599.0000 -1: 52723.0000
1 1: 52112.6325 0: 70597.5000 -1: 52721.5000
1 1: 52108.5000 0: 70602.0000 -1: 52726.0000
1 1: 52108.0000 0: 70602.5000 -1: 52726.5000
1 1: 52108.3104 0: 70601.0000 -1: 52725.1854
1 1: 52108.7845 0: 70601.0000 -1: 52725.0000
1 1: 52108.1439 0: 70601.0000 -1: 52725.0000
1 1: 5210

1 1: 52109.4712 0: 70600.0000 -1: 52724.0000
1 1: 52111.4319 0: 70598.0000 -1: 52722.0000
1 1: 52112.4190 0: 70597.0000 -1: 52723.8186
1 1: 52112.9490 0: 70597.0000 -1: 52721.0000
1 1: 52108.3938 0: 70601.5000 -1: 52725.5000
1 1: 52109.8355 0: 70599.5000 -1: 52723.5000
1 1: 52107.5774 0: 70602.5000 -1: 52726.5000
1 1: 52113.0345 0: 70597.0000 -1: 52721.0000
1 1: 52108.5233 0: 70601.0000 -1: 52725.0000
1 1: 52107.0000 0: 70603.5000 -1: 52727.5000
1 1: 52110.0774 0: 70600.0000 -1: 52724.0000
1 1: 52109.4472 0: 70600.5000 -1: 52724.5000
1 1: 52111.0000 0: 70599.5000 -1: 52723.5000
-1 1: 52101.6187 0: 70597.5000 -1: 52733.4487
1 1: 52109.5000 0: 70601.0000 -1: 52725.0000
1 1: 52112.3081 0: 70597.0000 -1: 52721.0000
1 1: 52112.0774 0: 70598.0000 -1: 52722.0000
1 1: 52113.8165 0: 70596.5000 -1: 52720.5000
1 1: 52113.8165 0: 70596.5000 -1: 52720.5000
1 1: 52112.5000 0: 70598.0000 -1: 52722.0000
1 1: 52109.2951 0: 70600.0000 -1: 52724.0000
1 1: 52113.1561 0: 70596.5000 -1: 52720.5000
1 1: 5211

KeyboardInterrupt: 

In [21]:
for LG_ in LGs_resized:
    print((1.-paired_distances(LG_, sum_adj, metric='cosine')).sum()/LG_.shape[0])

0.5000842750152398
0.5001097723481658
0.5000141312795874
0.5000228270341504
0.5000420635163532
0.5000485624604685
0.5000155578835768
0.5000400707686462
0.5000494594785558
0.5000340316299083
0.5000222899776611
0.5000320393025071
0.500025665622356
0.5000150208270874
0.5000211969193811
0.5000494594785558
0.5000211969193811
0.5000057690707336
0.5000494594785558
0.5000295498233615
0.5000147027501523
0.5000551245585442
0.5000522074620778
0.5000353281989685
0.500019127441399
0.5000211969193811
0.5000265835432366
0.5000942833104909
0.5000665734608992
0.5000282625591748
0.5000565251183494
0.5000141312795874
0.5000453205225914
0.5000519342382929
0.5000141312795874
0.5000717494562169
0.5000423938387621
0.5000885985366824
0.5000858807358043
0.5000468663404356
0.5000453205225914
0.5000431018717088
0.5000406545957534
0.5000847876775242
0.5000241236032104
0.500017646081256
0.500019900350321
0.5000663139311926
0.5000282625591748
0.5000166293604932
0.500065074322257
0.5000777220377305
0.500084787677524

0.5000513312447118
0.5000494594785558
0.5000353281989685
0.5000423938387621
0.5000542728042829
0.5000321086063305
0.50008174177984
0.5000544556403673
0.5000396814867871
0.5000225656622059
0.5000256694210545
0.5000406693206106
0.5000665174419725
0.5000282625591748
0.5000353281989685
0.5000651716632941
0.5000386020016375
0.5001051327209476
0.5000172149776053
0.5000082678054716
0.5000407470796626
0.5000353281989685
0.5000387764705131
0.5000691563180967
0.5000634715442883
0.5000576181766295
0.5000012253373413
0.5
0.5000665230905122
0.5000847876775242
0.5000183363171065
0.5000293556174548
0.5000398007006419
0.5000495859064602
0.5000576181766295
0.5000519194185821
0.5000152243378674
0.5000517547025138
0.5000382548827977
0.5000363253687371
0.5000141312795874
0.5000494594785558
0.5000954275752335
0.5000252166614906
0.5000107922020268
0.5000211969193811
0.5000303177404813
0.5000282625591748
0.5000035328198968
0.5000453205225914
0.5000478372796735
0.5000412345829758
0.5000411316837776
0.50008771

0.5000645639962168
0.5000317953790716
0.500071315891958
0.5000765097655956
0.5000494594785558
0.5000581300905862
0.5000594518021788
0.5000464135808717
0.5000241236032104
0.5000686271774946
0.5000423938387621
0.5000173778235565
0.500034696739097
0.5000248884444599
0.5000211969193811
0.5000171306500197
0.5000185999825623
0.5000837476355926
0.5000312416944913
0.5000211969193811
0.5000348514883226
0.5000141312795874
0.5000604055206563
0.5000628663328791
0.5000445799553223
0.5000852755243952
0.5000152778750807
0.5000963258189914
0.5000382930234045
0.5000927819287483
0.5000866594436802
0.5000565251183494
0.5000435404342555
0.5000519820716736
0.5000908410843488
0.5000088050800676
0.5000353281989685
0.5000537202196648
0.5000668624018032
0.5000134998197159
0.5000530751299537
0.5000461404940925
0.5000654625242992
0.50007945937135
0.5000256694210545
0.5000494594785558
0.500075484173518
0.5000211969193811
0.5000442804806599
0.5000434148965288
0.500062067862224
0.500060544860459
0.5000494594785558


0.5000423938387621
0.5000523861623851
0.5001424058541537
0.5000576181766295
0.500039347941078
0.5000170579634168
0.5000735830817662
0.5000364212572485
0.5000778578716136
0.5
0.5000635907581431
0.5000372148408662
0.5000570129652204
0.5000157613943568
0.5000635907581431
0.5000278584684632
0.5000604728599455
0.5000523861623851
0.5000353281989685
0.5000306363747122
0.5000562149128883
0.5000569709548993
0.5000513312447118
0.5000370515699823
0.5000494594785558
0.5000211969193811
0.5000211969193811
0.5000944874614125
0.5000626576256145
0.50005981417056
0.5000252951679302
0.5000364212572485
0.5000736060036911
0.5000392475107485
0.500034582259528
0.5000312273836108
0.5000529041630154
0.5000307204362828
0.5000282625591748
0.5000473541873066
0.5000141312795874
0.5000343517792664
0.5000403773771284
0.5000404674126204
0.5000594518021788
0.5000364212572485
0.5000211969193811
0.5000099923236231
0.5000330610091602
0.5000553128462145
0.5000725692512668
0.5000282625591748
0.500036439094887
0.50004509133

0.5000852755243952
0.5000640786050141
0.500023173624182
0.5000357740355182
0.5000115343427686
0.5000089522816915
0.5000159868075017
0.5000783245343511
0.5000681989848802
0.5000686758162683
0.5001271815162863
0.5001007907232676
0.5000820867396707
0.5000236950002869
0.5000263345302433
0.500101845640941
0.5000211969193811
0.5000877143613536
0.5000241236032104
0.5000416478993217
0.5000491593756652
0.5000434868970421
0.5000918533173179
0.5000724961338371
0.5000989189571116
0.5000141312795874
0.5000453205225914
0.5000227778245321
0.5000706563979368
0.5000353281989685
0.5000264788164749
0.5000412698474985
0.500024353357512
0.5000434868970421
0.5
0.5000434868970421
0.5000660453187925
0.500040504629818
0.5000199846472461
0.5000256694210545
0.5000353281989685
0.5000406693206106
0.5000676105002527
0.5000453205225914
0.5001081369726211
0.5000707497801182
0.5000709899436462
0.5000424468551106
0.5000434868970421
0.500043826091291
0.5000759618697607
0.5
0.5000455697713755
0.5
0.5000282625591748
0.500

0.5000423938387621
0.500034582259528
0.5000632919755033
0.5000565251183494
0.5000377714356893
0.5000981945318476
0.5000738323305502
0.5000308730371804
0.5000553128462145
0.5000482472064208
0.5000392086012753
0.5000494594785558
0.5000353281989685
0.5000576181766295
0.5000081586980737
0.5000352685920411
0.5000493776882887
0.5000322823012843
0.5001095721859794
0.5000430117205623
0.5000353281989685
0.5000357936563714
0.5000422746249072
0.50011353808357
0.5000495812637005
0.5001059845969053
0.5000946013008399
0.5000441704924394
0.5
0.5000736112128126
0.5000401980005807
0.5000494594785558
0.500039347941078
0.5000800382462871
0.5000584117602472
0.5000951135468564
0.5000623784860082
0.5000565251183494
0.5000222899776611
0.5000211969193811
0.5000295821302702
0.5000827181995421
0.5000241236032104
0.5000863685826752
0.5000696163560052
0.5000287187854122
0.49999813953595884
0.5000777220377305
0.5000192387078057
0.5000467481503703
0.5000293556174548
0.5000081586980737
0.500044946269467
0.5000282625

0.500061252328975
0.5000594518021788
0.5000423938387621
0.500090641045183
0.5001109807587168
0.5000511738902503
0.5000256694210545
0.5000293556174548
0.5000452341991373
0.5000816865248192
0.500021941291337
0.500060997620023
0.49999958380757026
0.5001063181426145
0.5001089112807346
0.5000565251183494
0.500030304649401
0.5000286742992659
0.5000124067614359
0.5000694822664087
0.5000514097426831
0.5000423938387621
0.5000510403837068
0.5000783815317517
0.5000211969193811
0.5000806487215599
0.5000152243378674
0.5000141312795874
0.5000398007006419
0.5000501455152759
0.5000532346241925
0.5001106104750821
0.5000170579634168
0.5000211969193811
0.5000332830607551
0.5000353281989685
0.5000322823012843
0.5000364212572485
0.5000282625591748
0.5000774639451611
0.5000284478545843
0.5000959217282798
0.5000364212572485
0.5000511738902503
0.5000468663404356
0.5000646838164232
0.5000089540114598
0.5000341159268334
0.5000152243378674
0.5000725281640929
0.5000553128462145
0.5000244043322766
0.50001413127958

0.5000513312447118
0.5000725811804414
0.5000729724967663
0.5000443854931561
0.5000226528758861
0.500043019184101
0.5000461941881144
0.5000423938387621
0.5000544556403673
0.5001059845969053
0.5000735830817662
0.5000706563979368
0.5000717494562169
0.50006747116045
0.5000423938387621
0.500087738909676
0.5000040197421095
0.5000725281640929
0.5000336163585638
0.5000275166197343
0.5000464135808717
0.5000170579634168
0.5000282625591748
0.5000644121507234
0.5000544556403673
0.5000293556174548
0.5000621436946515
0.5000211969193811
0.5000988298856236
0.500054800600198
0.5000706563979368
0.5000433572750183
0.5000382548827977
0.5000434868970421
0.5000358823367819
0.5000629639123152
0.5000159811133853
0.5000282625591748
0.5000358160458394
0.500101845640941
0.5000419695784524
0.5000742730014275
0.5000434868970421
0.5000581060235005
0.5000375143155286
0.5000327350608482
0.5000835754053893
0.5000828348381201
0.5000922219503341
0.5000473900005736
0.5000490865088356
0.5000312273836108
0.5000437200124219

0.5000866632423789
0.5000222899776611
0.5000706563979368
0.5000168792631093
0.5000141312795874
0.5000458944845642
0.5000364212572485
0.5000646838164232
0.5000892601791977
0.5000273655410875
0.5001089112807346
0.5000765097655956
0.5000124067614359
0.5000565251183494
0.5000275509419377
0.5000958730594274
0.5000731551816306
0.5000319815594212
0.5000600742703237
0.5000353281989685
0.5000152243378674
0.5000607197655419
0.5000353281989685
0.5000696441649677
0.5000761991418113
0.500047188194256
0.5000270502870398
0.5000141312795874
0.500010842383126
0.5000494594785558


KeyboardInterrupt: 

In [346]:
import scipy

scipy.log2(2)

1.0

In [351]:
A2.data = np.log2(A2.data)

AttributeError: data not found

In [350]:
A2[A2.nonzero()]

<1x75438 sparse matrix of type '<class 'numpy.float64'>'
	with 75438 stored elements in Dictionary Of Keys format>

In [347]:
scipy.log2()

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [360]:
A2 = sp.dok_matrix(A.shape)
A2.data = A.data / sum_adj[A.nonzero()]

A2.data = np.multiply(A2.data,np.log2(A2.data))

In [361]:
A2

<70765x70765 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Dictionary Of Keys format>

In [339]:
%%time
(1.-paired_distances(M[2], sp.csr_matrix( (len(termid), len(termid)) ), metric='cosine')).sum()

CPU times: user 2.67 ms, sys: 73 µs, total: 2.74 ms
Wall time: 2.03 ms


11891.0

In [340]:
%%time
1.-paired_distances(A2, Adj_doc, metric='cosine')

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 15 ms


array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5])

In [272]:
%%time

A_norm = sp.dok_matrix(A.shape)
A_norm[A.nonzero()] = A[A.nonzero()] / A[A.nonzero()].sum(axis=0)

B_norm = sp.dok_matrix(B.shape)
B_norm[B.nonzero()] = B[B.nonzero()] / B[B.nonzero()].sum(axis=0)

AB = (A_norm*B_norm).sum(axis=0)
AA = np.sum(np.sqrt( A_norm*A_norm ), axis=0)
BB = np.sum(np.sqrt( B_norm*B_norm ), axis=0)
AA_BB = np.multiply(AA,BB)

cos = sp.dok_matrix(AB.shape)
cos[AA_BB.nonzero()] = AB[AA_BB.nonzero()] / AA_BB[AA_BB.nonzero()]

#del AA_BB
#del AB
#del A_norm
#del B_norm


CPU times: user 136 ms, sys: 0 ns, total: 136 ms
Wall time: 135 ms


In [273]:
cos.shape, AA.shape, BB.shape, AA_BB.shape, AB.shape

((1, 70765), (1, 70765), (1, 70765), (1, 70765), (1, 70765))

In [248]:
from scipy import spatial as spt

In [293]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import paired_cosine_distances

In [292]:
paired_distances(A_norm[0],B_norm[0],metric='cosine')

array([0.5])

In [301]:
sum_M[0]

<1x12046 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [307]:
paired_cosine_distances(M[0], M[1])

array([0.81229475, 0.76429774, 0.22436847, ..., 0.5       , 0.5       ,
       0.5       ])

In [7]:
df = Counter()
df_coo = Counter()
list(map(df.update, [ df.update(list(G.nodes)) for G in Gs ]))
termid = { t:i for (i,t) in enumerate(df.keys()) }
idterm = list(zip(*sorted(termid.items(), key=lambda x: x[1])))[0]

cf = Counter(scores)
idclass = list(cf.keys())
classid = { c:i for (i,c) in enumerate(idclass) }

M = [ sp.csr_matrix((len(termid),len(termid)), dtype=np.float64) for _ in idclass ]

In [8]:
all_sparse_M = []
idx_total = []

for i,G in tqdm_notebook(enumerate(Gs), total=len(Gs)):
    y = classid[scores[i]]
    nodelist = list(G.nodes)
    if len(nodelist) == 0:
        continue
    M_adj = nx.to_scipy_sparse_matrix(G, weight=1, nodelist=nodelist)
    normM_adj = M_adj / M_adj.sum(axis=0)
    
    # one hop distance
    normM_adj = (normM_adj + normM_adj.dot(normM_adj))/2
    normM_adj[range(len(normM_adj)), range(len(normM_adj))] = 0.
    normM_adj = normM_adj / normM_adj.sum(axis=0)

    # build mapper term on G to Class graph 
    doc_subtermid = np.array([ termid[t] for t in nodelist ])
    idx_x, idx_y = normM_adj.nonzero()
    
    M_to_append = sp.csr_matrix((len(termid),len(termid)), dtype=np.float64)
    M_to_append[doc_subtermid[idx_x],doc_subtermid[idx_y]] = normM_adj[normM_adj.nonzero()]
    
    # update cooccurence document frequency
    df_coo.update( zip(doc_subtermid[idx_x],doc_subtermid[idx_y]) )
    
    # append 
    all_sparse_M.append(M_to_append)
    idx_total.append( doc_subtermid )
    
    M[y] += M_to_append

  return np.true_divide(self.todense(), other)
  from ipykernel import kernelapp as app





In [9]:
cooid = {}
idcoo = []
df_coo_list = list(df_coo.keys())
for (i,(s,t)) in tqdm_notebook(enumerate(df_coo_list), total=len(df_coo_list)):
    if (s,t) not in cooid:
        cooid[(s,t)] = i
        cooid[(t,s)] = i
        idcoo.append( (s,t) )




In [10]:
COO = [ sp.csr_matrix((len(idcoo),len(idcoo)), dtype=np.float64) for _ in idclass ]

In [30]:
def graph2line_graph(cooid, N):
    def inner_g2lg(G):
        list_of_pairs = list(zip(*G.nonzero()))
        for (source,target) in list_of_pairs:
            line_graph_id = cooid[(source,target)]

            coo = sp.csr_matrix((N,N), dtype=np.float64)

            S,T = normM_adj[source,:].nonzero()
            S = np.array( [ source for _ in S ] )
            coo[ ([line_graph_id]*len(S), [ cooid[(s,t)] for (s,t) in zip(S,T) ]) ] = 1

            S,T = normM_adj[:,target].nonzero()
            T = np.array( [ target for _ in T ] )
            coo[ ([line_graph_id]*len(T), [ cooid[(s,t)] for (s,t) in zip(S,T) ]) ] = 1

        return coo
    return inner_g2lg
def inner_g2lg(G):
    list_of_pairs = list(zip(*G.nonzero()))
    for (source,target) in list_of_pairs:
        line_graph_id = cooid[(source,target)]

        coo = sp.csr_matrix((N,N), dtype=np.float64)

        S,T = normM_adj[source,:].nonzero()
        S = np.array( [ source for _ in S ] )
        coo[ ([line_graph_id]*len(S), [ cooid[(s,t)] for (s,t) in zip(S,T) ]) ] = 1

        S,T = normM_adj[:,target].nonzero()
        T = np.array( [ target for _ in T ] )
        coo[ ([line_graph_id]*len(T), [ cooid[(s,t)] for (s,t) in zip(S,T) ]) ] = 1

    return coo

In [28]:
from multiprocessing import Pool

In [31]:
N = len(idcoo)
with Pool(processes=8) as pool:
    for coo_m in pool.imap(inner_g2lg, tqdm_notebook(all_sparse_M, total=len(all_sparse_M), smoothing=0, position=0)):
        coo_all += coo_m

Process ForkPoolWorker-28:
Process ForkPoolWorker-32:
Process ForkPoolWorker-30:
Process ForkPoolWorker-31:
Process ForkPoolWorker-25:
Process ForkPoolWorker-29:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-26:
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib

KeyboardInterrupt: 

In [24]:
i = 0
normM_adj = all_sparse_M[i]
y = classid[scores[i]]

coo_all = sp.csr_matrix((len(idcoo),len(idcoo)), dtype=np.float64)

for coo_m in map(graph2line_graph(cooid, len(idcoo)), tqdm_notebook(all_sparse_M, total=len(all_sparse_M), smoothing=0, position=0)):
    coo_all += coo_m

  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


KeyboardInterrupt: 

In [70]:
([line_graph_id]*len(S), [ cooid[(s,t)] for (s,t) in zip(S,T) ])

([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

In [55]:
coo[ ([line_graph_id]*len(T), [ cooid[(s,t)] for (s,t) in zip(S,T) ]) ]

KeyError: (0, 0)

In [43]:
normM_adj[:,target].nonzero()

(array([57, 58, 60], dtype=int32), array([0, 0, 0], dtype=int32))

In [31]:
for i,normM_adj in tqdm_notebook(enumerate(all_sparse_M), total=len(all_sparse_M)):
    y = classid[scores[i]]
    id_toadd = list(zip(*normM_adj.nonzero()))
    
    COO[y][[cooid[xy] for xy in id_toadd]] += 1
    
    M_adj = nx.to_scipy_sparse_matrix(G, weight=1, nodelist=nodelist)
    normM_adj = M_adj / M_adj.sum(axis=0)
    
    # one hop distance
    normM_adj = (normM_adj + normM_adj.dot(normM_adj))/2
    normM_adj[range(len(normM_adj)), range(len(normM_adj))] = 0.
    normM_adj = normM_adj / normM_adj.sum(axis=0)

    # build mapper term on G to Class graph 
    doc_subtermid = np.array([ termid[t] for t in nodelist ])
    idx_x, idx_y = normM_adj.nonzero()
    
    M_to_append = sp.csr_matrix((len(termid),len(termid)), dtype=np.float64)
    M_to_append[doc_subtermid[idx_x],doc_subtermid[idx_y]] = normM_adj[normM_adj.nonzero()]
    
    # update cooccurence document frequency
    df_coo.update( zip(doc_subtermid[idx_x],doc_subtermid[idx_y]) )
    
    # append 
    all_sparse_M.append(M_to_append)
    idx_total.append( doc_subtermid )
    
    M[y] += M_to_append

[((0, 1), 0),
 ((0, 2), 1),
 ((0, 5), 2),
 ((0, 12), 3),
 ((0, 13), 4),
 ((0, 14), 5),
 ((0, 15), 6),
 ((0, 16), 7),
 ((0, 17), 8),
 ((0, 18), 9)]

In [104]:
all_sparse_M = []
idx_total = []

for G in tqdm(nGs, total=len(Gs)):
    nodelist = list(G.nodes)
    if len(nodelist) == 0:
        continue
    M_adj = nx.to_scipy_sparse_matrix(G.to_undirected(), weight=1, nodelist=nodelist)
    normM_adj = M_adj / M_adj.sum(axis=0)
    normM_adj = (normM_adj + normM_adj.dot(normM_adj))/2
    normM_adj[range(len(normM_adj)), range(len(normM_adj))] = 0.
    normM_adj = normM_adj / normM_adj.sum(axis=0)

    doc_subtermid = np.array([ termid[t] for t in nodelist ])
    idx_x, idx_y = normM_adj.nonzero()
    
    M_to_append = sp.csr_matrix((len(termid),len(termid)), dtype=np.float64)
    M_to_append[doc_subtermid[idx_x],doc_subtermid[idx_y]] = normM_adj[normM_adj.nonzero()]
    
    df_coo.update( zip(doc_subtermid[idx_x],doc_subtermid[idx_y]) )
    
    all_sparse_M.append(M_to_append)
    idx_total.append( doc_subtermid )













  0%|          | 16/8199 [00:00<00:52, 154.47it/s][A[A[A[A[A[A





  0%|          | 31/8199 [00:00<00:54, 149.34it/s][A[A[A[A[A[A





  1%|          | 49/8199 [00:00<00:52, 154.67it/s][A[A[A[A[A[A





  1%|          | 63/8199 [00:00<00:54, 149.04it/s][A[A[A[A[A[A





  1%|          | 77/8199 [00:00<00:55, 145.65it/s][A[A[A[A[A[A





  1%|          | 97/8199 [00:00<00:51, 158.24it/s][A[A[A[A[A[A





  1%|▏         | 117/8199 [00:00<00:49, 164.42it/s][A[A[A[A[A[A





  2%|▏         | 136/8199 [00:00<00:47, 170.19it/s][A[A[A[A[A[A





  2%|▏         | 157/8199 [00:00<00:44, 179.80it/s][A[A[A[A[A[A





  2%|▏         | 175/8199 [00:01<00:45, 177.76it/s][A[A[A[A[A[A





  2%|▏         | 193/8199 [00:01<00:48, 164.49it/s][A[A[A[A[A[A





  3%|▎         | 210/8199 [00:01<00:48, 164.54it/s][A[A[A[A[A[A





  3%|▎         | 230/8199 [00:01<00:45, 173.76it/s][A[A[A[A[A[A





  3%|▎         | 25

 38%|███▊      | 3114/8199 [00:28<01:05, 77.80it/s][A[A[A[A[A[A





 38%|███▊      | 3128/8199 [00:28<00:57, 87.97it/s][A[A[A[A[A[A





 38%|███▊      | 3139/8199 [00:28<00:57, 87.73it/s][A[A[A[A[A[A





 38%|███▊      | 3149/8199 [00:29<01:16, 66.13it/s][A[A[A[A[A[A





 39%|███▊      | 3161/8199 [00:29<01:09, 71.98it/s][A[A[A[A[A[A





 39%|███▊      | 3170/8199 [00:29<01:22, 61.22it/s][A[A[A[A[A[A





 39%|███▉      | 3178/8199 [00:29<01:25, 58.54it/s][A[A[A[A[A[A





 39%|███▉      | 3185/8199 [00:29<01:26, 57.71it/s][A[A[A[A[A[A





 39%|███▉      | 3193/8199 [00:29<01:20, 62.19it/s][A[A[A[A[A[A





 39%|███▉      | 3204/8199 [00:29<01:10, 70.52it/s][A[A[A[A[A[A





 39%|███▉      | 3212/8199 [00:30<01:41, 49.14it/s][A[A[A[A[A[A





 39%|███▉      | 3224/8199 [00:30<01:23, 59.48it/s][A[A[A[A[A[A





 39%|███▉      | 3235/8199 [00:30<01:12, 68.66it/s][A[A[A[A[A[A





 40%|███▉      | 3244/819

 70%|███████   | 5753/8199 [00:58<00:26, 94.02it/s] [A[A[A[A[A[A





 70%|███████   | 5768/8199 [00:58<00:23, 104.55it/s][A[A[A[A[A[A





 71%|███████   | 5783/8199 [00:58<00:21, 112.61it/s][A[A[A[A[A[A





 71%|███████   | 5795/8199 [00:59<00:24, 99.88it/s] [A[A[A[A[A[A





 71%|███████   | 5806/8199 [00:59<00:23, 100.90it/s][A[A[A[A[A[A





 71%|███████   | 5822/8199 [00:59<00:20, 113.39it/s][A[A[A[A[A[A





 71%|███████   | 5837/8199 [00:59<00:19, 121.71it/s][A[A[A[A[A[A





 71%|███████▏  | 5850/8199 [00:59<00:21, 110.38it/s][A[A[A[A[A[A





 71%|███████▏  | 5862/8199 [00:59<00:21, 106.71it/s][A[A[A[A[A[A





 72%|███████▏  | 5874/8199 [00:59<00:24, 95.32it/s] [A[A[A[A[A[A





 72%|███████▏  | 5885/8199 [00:59<00:23, 96.77it/s][A[A[A[A[A[A





 72%|███████▏  | 5896/8199 [01:00<00:23, 98.82it/s][A[A[A[A[A[A





 72%|███████▏  | 5912/8199 [01:00<00:20, 111.43it/s][A[A[A[A[A[A





 72%|███████▏ 

In [103]:
normM_adj[range(len(normM_adj)), range(len(normM_adj))] = 0.

In [99]:
[(idterm[i],idterm[j],v) for ((i,j),v) in sorted( df_coo.items(), key=lambda x: x[1], reverse=True )[:100] if i < j]

[('comput', 'scienc', 3997),
 ('depart', 'comput', 2586),
 ('depart', 'scienc', 2557),
 ('page', 'home', 2406),
 ('univers', 'scienc', 1993),
 ('univers', 'comput', 1504),
 ('interest', 'research', 1164),
 ('system', 'comput', 1106),
 ('depart', 'univers', 1049)]

In [107]:
for i in range(100000):
    normM_adj+normM_adj

In [26]:
M = sp.csr_matrix((sum([ v for v in termid.values() ]),len(termid)), dtype=np.float64)
for ((slide_doc, _), M_to_append) in tqdm(list(zip( idx_total, all_sparse_M )), total=len(all_sparse_M)):
    M[slide_doc[0]:slide_doc[1], :] = M_to_append









  0%|          | 1/8190 [00:01<2:41:54,  1.19s/it][A[A[A[A



  0%|          | 2/8190 [00:02<2:33:55,  1.13s/it][A[A[A[A



  0%|          | 3/8190 [00:03<2:29:37,  1.10s/it][A[A[A[A



  0%|          | 4/8190 [00:04<2:25:58,  1.07s/it][A[A[A[A

KeyboardInterrupt: 

In [31]:
M = sp.csr_matrix(np.array(all_sparse_M))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [17]:
M[idx_total[0][0][0]:idx_total[0][0][1], :] = all_sparse_M[0]

In [72]:



for i, G in tqdm(enumerate(Gs), total=len(Gs)):
    nodelist = list(G.nodes)
    if len(nodelist) < 1:
        continue
    M_adj = nx.to_scipy_sparse_matrix(G.to_undirected(), weight=1, nodelist=nodelist)
    normM_adj = M_adj / M_adj.sum(axis=0)
    normM_adj = (normM_adj + normM_adj.dot(normM_adj))/2

    doc_subtermid = [ termid[t] for t in nodelist ]
    slide_doc = (M.shape[0],(M.shape[0]+normM_adj.shape[0]))

    #M = sp.vstack( (M, sp.csr_matrix( (normM_adj.shape[0],len(termid)), dtype=np.float64)))
    M[slice(*slide_doc), doc_subtermid] = normM_adj


  0%|          | 0/8199 [00:00<?, ?it/s][A

ValueError: shape mismatch: objects cannot be broadcast to a single shape

In [65]:
M.shape

(67, 16887)

In [54]:
all_df = []
all_df_join = pd.DataFrame()
for i, G in tqdm(enumerate(Gs), total=len(Gs)):
    nodelist = G.nodes
    df_adj = nx.to_pandas_adjacency(G.to_undirected(), weight=1, nodelist=nodelist)
    values = df_adj.values
    values = values/values.sum(axis=1)
    df_adj = pd.DataFrame( (values + np.dot(values,values))/2, columns=df_adj.columns, index=df_adj.index ).T.to_sparse(fill_value=0.)
    df_adj['<docid>'] = int(i)
    df_adj['<label>'] = scores[i]
    df_adj['<term>'] = df_adj.index
    df_adj = df_adj.set_index(['<label>', '<docid>', '<term>'])
    all_df_join = pd.concat([all_df_join, df_adj], sort=False).fillna(0.).to_sparse(fill_value=0.)

  import sys
  2%|▏         | 210/10662 [02:23<4:01:47,  1.39s/it]

KeyboardInterrupt: 

In [52]:
all_df[10562]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,leg,fall,spoof,flat,freak
<label>,<docid>,<term>,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,10562,leg,0.25,0.25,0.0,0.0,0.5
-1,10562,fall,0.125,0.25,0.125,0.25,0.25
-1,10562,spoof,0.0,0.25,0.25,0.5,0.0
-1,10562,flat,0.0,0.25,0.25,0.375,0.125
-1,10562,freak,0.25,0.25,0.0,0.125,0.375


In [53]:
all_df_join = pd.concat(all_df, sort=False).fillna(0.).to_sparse(fill_value=0.)

KeyboardInterrupt: 

In [44]:
dir(df_adj)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '_

In [8]:
all_terms = {}
for docid,G in tqdm(enumerate(nGs), total=len(Gs)):
    for v in G.nodes:
        if v in all_terms:
            LG_v, doc_list, class_count = all_terms[v]
        else:
            doc_list = []
            LG_v = nx.Graph()
            class_count = Counter()
            
        doc_list.append(docid)
        class_count.update([scores[docid]])
        
        all_edges = list(G.edges(v, data=True))
        for i,(s1,t1, att1) in enumerate(all_edges):
            if (s1,t1) not in LG_v.nodes:
                LG_v.add_node( (s1,t1), count=0, labels=dict() )
            LG_v.node[(s1,t1)]['count'] += 1
            if scores[docid] not in LG_v.node[(s1,t1)]['labels']:
                LG_v.node[(s1,t1)]['labels'][scores[docid]] = set([docid])
            else:
                LG_v.node[(s1,t1)]['labels'][scores[docid]].add(docid)
            for (s2,t2, att2) in all_edges[(i+1):]:
                if (s2,t2) not in LG_v.nodes:
                    LG_v.add_node( (s2,t2), count=0, labels=dict() )
                if ( (s1,t1),(s2,t2) ) not in LG_v.edges:
                    LG_v.add_edge( (s1,t1),(s2,t2), count=0, sum=0 )
                LG_v.edges[(s1,t1),(s2,t2)]['count'] += 1
                LG_v.edges[(s1,t1),(s2,t2)]['sum']   += abs( att1['weight']-att2['weight'] )
        all_terms[v] = (LG_v, doc_list, class_count)
for v, (G, doc_list, class_count) in tqdm(all_terms.items(), total=len(all_terms)):
    to_remove = [ (s,t) for s,t,att in G.edges(data=True) if att['count'] < 2 ]
    for s,t in to_remove:
        G.remove_edge(s,t)
        
    to_remove = [ v for v in G.nodes if G.degree(v) < 1 ]
    for s,t in to_remove:
        G.remove_node((s,t))

100%|██████████| 10662/10662 [00:04<00:00, 2178.68it/s]
100%|██████████| 14156/14156 [00:00<00:00, 16980.07it/s]


In [9]:
for v, (G, doc_list, class_count) in tqdm(all_terms.items(), total=len(all_terms)):
    to_remove = [ (s,t) for s,t,att in G.edges(data=True) if att['count'] < 2 ]
    if len(to_remove) > 0:
        print(v) 

100%|██████████| 14156/14156 [00:00<00:00, 230687.45it/s]


In [10]:
biggest = max(all_terms.items(), key=lambda x: len(x[1][1]))[0]
smallest = min(all_terms.items(), key=lambda x: len(x[1][1]))[0]

In [11]:
sorted(list(all_terms[biggest][0].nodes(data=True)), key=lambda x: x[1]['count'], reverse=True)[:5]

[(('film', "n't"),
  {'count': 43,
   'labels': {-1: {5368,
     5858,
     6019,
     6039,
     6134,
     6137,
     6419,
     6646,
     6678,
     6738,
     7197,
     7249,
     7496,
     7497,
     8029,
     8896,
     9105,
     9190,
     9531,
     9551,
     9577,
     9622,
     9760,
     9864,
     10306,
     10615,
     10618},
    1: {515,
     969,
     1426,
     1740,
     2032,
     2138,
     2542,
     3367,
     3513,
     3646,
     4062,
     5109,
     5150,
     5169,
     5181,
     5282}}}),
 (('film', 'make'),
  {'count': 39,
   'labels': {-1: {5367,
     5671,
     5729,
     5740,
     6192,
     6542,
     6596,
     6791,
     7073,
     7222,
     7280,
     7316,
     7998,
     8097,
     8208,
     8251,
     8906,
     8938,
     9194,
     9415,
     9588,
     9741,
     10068},
    1: {44,
     272,
     574,
     939,
     1377,
     1482,
     1546,
     1747,
     2014,
     2132,
     2138,
     3248,
     3276,
     3657,
     4072,
 

In [12]:
sorted(list(all_terms[biggest][0].edges(data=True)), key=lambda x: x[2]['count'], reverse=True)[:10]

[(('film', 'sundanc'), ('film', 'festiv'), {'count': 3, 'sum': 0.0}),
 (('film', 'famili'), ('film', 'enjoy'), {'count': 2, 'sum': 0.0}),
 (('film', 'win'), ('film', 'credit'), {'count': 2, 'sum': 0.0}),
 (('film', 'special'), ('film', 'make'), {'count': 2, 'sum': 0.0}),
 (('film', 'bad'), ('film', "n't"), {'count': 2, 'sum': 0.0}),
 (('film', 'worst'), ('film', 'year'), {'count': 2, 'sum': 0.0}),
 (('film', 'debut'), ('film', 'director'), {'count': 2, 'sum': 0.0}),
 (('film', "n't"), ('film', 'fact'), {'count': 2, 'sum': 0.0}),
 (('film', 'cann'), ('film', 'festiv'), {'count': 2, 'sum': 0.0}),
 (('film', 'french'), ('film', 'industri'), {'count': 2, 'sum': 0.0})]

In [93]:
def MutualInformation(v1_obj, v2_obj):
    union_docs = set.union(*v1_obj['labels'].values(), *v2_obj['labels'].values())
    ndocs = len(union_docs)

    # entropy phase
    ndocs_v1 = v1_obj['count']
    #probs_v1 = { c: 1.*len(v1_obj['labels'][c])/ndocs_v1 for c in v1_obj['labels'] }
    probs_v1 = { c: 1.*len(v1_obj['labels'][c])/ndocs for c in v1_obj['labels'] }
    entropy_v1 = -sum( [ v*np.log2(v) for v in probs_v1.values() ] )

    ndocs_v2 = v2_obj['count']
    #probs_v2 = { c: 1.*len(v2_obj['labels'][c])/ndocs_v2 for c in v2_obj['labels'] }
    probs_v2 = { c: 1.*len(v2_obj['labels'][c])/ndocs for c in v2_obj['labels'] }
    entropy_v2 = -sum( [ v*np.log2(v) for v in probs_v2.values() ] )

    # mutual info phase
    union_class = set(v1_obj['labels']).union(v2_obj['labels'])

    conj_entr = 0.
    values = []
    for c1 in union_class:
        labels_v1 = v1_obj['labels'][c1] if c1 in v1_obj['labels'] else set()
        for c2 in union_class:
            labels_v2 = v2_obj['labels'][c2] if c2 in v2_obj['labels'] else set()
            if c1 != c2:
                prob = (len(labels_v1)/ndocs) * (len(labels_v2)/ndocs)
            else:
                intersection_docs = labels_v1.intersection(labels_v2)
                #prob = (len(labels_v1) + len(labels_v2) - len(intersection_docs))/ndocs

                #P(t1,t2) = P(t1)                   + P(t2)                 - P(t1 cap t2)
                #prob     = len(labels_v1)/ndocs_v1 + len(labels_v2)/ndocs_v2 - len(intersection_docs)/ndocs

                #P(t1,t2) = P(t1)                   * P(t2|t1)
                prob      = len(labels_v1)/ndocs * len(intersection_docs)/ndocs_v1

                #P(t1,t2) = P(t2)                   * P(t1|t2)
                #prob      = len(labels_v2)/ndocs * len(intersection_docs)/ndocs_v2

                #P(t1,t2) = P(t2 cap t1)
                #prob      = len(intersection_docs)/ndocs
            if prob > 0.:
                conj_entr -= prob*np.log2(prob)
    MI = entropy_v1 + entropy_v2 - conj_entr
    if abs(entropy_v1+entropy_v2) < 0.000001 or abs(entropy_v1*entropy_v2) < 0.000001:
        return 0., MI, entropy_v1, entropy_v2, conj_entr
    nMI = 2.*MI/(entropy_v1+entropy_v2)
    #nMI = MI/np.sqrt(entropy_v1*entropy_v2)
    return nMI, MI, entropy_v1, entropy_v2, conj_entr

In [94]:
def sig(z):
    return 1. / (1. + np.exp( -z ))

In [101]:
G = all_terms[biggest][0]
v1 = ('film', 'movi')
v2 = ('film', "perform")
nMI_max = 0.
for v1,v2,_ in sorted(list(G.edges(data=True)), key=lambda x: x[2]['count'], reverse=False)[:10]:
    v1_obj = G.node[v1]
    v2_obj = G.node[v2]
    
    nMI, MI, entropy_v1, entropy_v2, conj_entr = MutualInformation(v1_obj, v2_obj)
    nMI_max = max(abs(MI), nMI_max)
    all_terms[biggest][0].edges[(v1, v2)]['sigNMI'] = sig(nMI)
    print(v1,v2)
    print('\t sigNMI=%.4f NMI=%.4f MI=%.4f H(t1)=%.4f H(t2)=%.4f H(t1,t2)=%.4f' % (sig(nMI), nMI, MI, entropy_v1, entropy_v2, conj_entr))

('comput', 'address') ('comput', 'affili')
	 sigNMI=0.7626 NMI=1.1668 MI=1.4900 H(t1)=1.6558 H(t2)=0.8981 H(t1,t2)=1.0640
('comput', 'address') ('comput', 'linguist')
	 sigNMI=0.7730 NMI=1.2251 MI=1.9134 H(t1)=1.2049 H(t2)=1.9188 H(t1,t2)=1.2103
('comput', 'address') ('comput', 'scientist')
	 sigNMI=0.7808 NMI=1.2703 MI=1.9667 H(t1)=1.0850 H(t2)=2.0115 H(t1,t2)=1.1297
('comput', 'address') ('comput', 'vision')
	 sigNMI=0.8139 NMI=1.4754 MI=2.1096 H(t1)=0.7045 H(t2)=2.1552 H(t1,t2)=0.7501
('comput', 'address') ('comput', 'societi')
	 sigNMI=0.7952 NMI=1.3566 MI=1.7943 H(t1)=0.8836 H(t2)=1.7617 H(t1,t2)=0.8510
('comput', 'address') ('comput', 'electr')
	 sigNMI=0.7850 NMI=1.2948 MI=1.8621 H(t1)=1.0386 H(t2)=1.8378 H(t1,t2)=1.0142
('comput', 'address') ('comput', 'transact')
	 sigNMI=0.7721 NMI=1.2199 MI=1.6210 H(t1)=1.2111 H(t2)=1.4465 H(t1,t2)=1.0365
('comput', 'address') ('comput', 'parallel')
	 sigNMI=0.8340 NMI=1.6142 MI=2.2003 H(t1)=0.4938 H(t2)=2.2323 H(t1,t2)=0.5258
('comput', 'ad

In [44]:
G = all_terms[biggest][0]
nodelist = list(G.nodes)
M_biggest = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight='sigNMI')
M_biggest.A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
sum_matrix = M_biggest.sum(axis=1)
sum_matrix

matrix([[ 22.39275003],
        [  0.8802576 ],
        [  7.5       ],
        ...,
        [  0.5       ],
        [100.48860071],
        [  1.5       ]])

In [46]:
normM_biggest = M_biggest/sum_matrix
normM_biggest.A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
one_hop = np.dot(normM_biggest,normM_biggest)

In [79]:
G = all_terms[biggest][0]
nodelist = list(G.nodes)
M_biggest = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight='sigNMI')
old = M_biggest.copy()
for it in range(10):
    normM_biggest = M_biggest/M_biggest.sum()
    one_hop = np.dot(normM_biggest,normM_biggest)
    list_aval = list(zip(*one_hop.nonzero()))
    M_biggest[one_hop.nonzero()] = [ one_hop[s,t]*sig(MutualInformation(G.node[nodelist[s]], G.node[nodelist[t]])[0]) for (s,t) in tqdm(list_aval, total=len(list_aval), smoothing=0) ]
    diff_sum = np.sum( np.abs( M_biggest - old ) )
    if diff_sum < 0.1:
        break
    old = M_biggest.copy()
normM_biggest = M_biggest/M_biggest.sum()

100%|██████████| 2254376/2254376 [03:19<00:00, 11279.42it/s]
100%|██████████| 2669936/2669936 [03:50<00:00, 11563.02it/s]
100%|██████████| 2669960/2669960 [03:50<00:00, 11601.79it/s]


In [21]:
M_biggest_2 = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight='sigNMI')
for it in range(100):
    M_biggest_2 = (M_biggest_2/M_biggest_2.sum(axis=1) + M_biggest_2/M_biggest_2.sum(axis=0))/2.

In [86]:
normM_biggest = M_biggest/M_biggest.max(axis=1)/2 + M_biggest/M_biggest.max(axis=0)/2
normM_biggest.A

ValueError: inconsistent shapes

In [92]:
M_biggest/M_biggest.max(axis=0).A + M_biggest/M_biggest.max(axis=1).A

matrix([[6.17709377e-09, 2.50920817e-01, 1.56593250e-08, ...,
         8.55954133e-04, 3.55138632e-08, 1.54070866e-08],
        [2.50831964e-01, 1.18893460e+00, 9.64181847e-01, ...,
         5.49580825e-08, 9.74636995e-01, 9.64181846e-01],
        [1.56633432e-08, 9.64195593e-01, 1.51120082e-09, ...,
         1.13498486e-02, 7.17228349e-08, 9.86339899e-10],
        ...,
        [8.55079812e-04, 5.49628078e-08, 1.13496868e-02, ...,
         9.38717929e-01, 3.01706865e-03, 6.64186055e-03],
        [3.54992267e-08, 9.74828434e-01, 7.17003490e-08, ...,
         3.01934241e-03, 8.82094469e-08, 7.09289915e-08],
        [1.54109264e-08, 9.64195593e-01, 9.86346898e-10, ...,
         6.64195524e-03, 7.09527657e-08, 5.00491120e-10]])

In [76]:
Counter(normM_biggest.nonzero()[0])

Counter({0: 1634,
         1: 1634,
         2: 1634,
         3: 1634,
         4: 1634,
         5: 1634,
         6: 1634,
         7: 1634,
         8: 1634,
         9: 1634,
         10: 1634,
         11: 1634,
         12: 1634,
         13: 1634,
         14: 1634,
         15: 1634,
         16: 1634,
         17: 1634,
         18: 1634,
         19: 1634,
         20: 1634,
         21: 1634,
         22: 1634,
         23: 1634,
         24: 1634,
         25: 1634,
         26: 1634,
         27: 1634,
         28: 1634,
         29: 1634,
         30: 1634,
         31: 1634,
         32: 1634,
         33: 1634,
         34: 1634,
         35: 1634,
         36: 1634,
         37: 1634,
         38: 1634,
         39: 1634,
         40: 1634,
         41: 1634,
         42: 1634,
         43: 1634,
         44: 1634,
         45: 1634,
         46: 1634,
         47: 1634,
         48: 1634,
         49: 1634,
         50: 1634,
         51: 1634,
         52: 1634,
   

In [23]:
for (s,t) in zip(*one_hop.nonzero()):
    v1 = nodelist[s]
    v2 = nodelist[t]
    v1_obj = G.node[v1]
    v2_obj = G.node[v2]
    
    nMI, MI, entropy_v1, entropy_v2, conj_entr = MutualInformation(v1_obj, v2_obj)
    M_biggest[s,t] = sig(nMI)

In [None]:
normM_biggest

In [None]:
probs_v1

In [None]:
values = np.array(values)
values_2 = values/values.sum()
values, values_2

In [None]:
conj_entr = -sum([ prob*np.log2(prob) for prob in values_2 ])

In [None]:
1.*len(doc_v1_v2)/len(union_docs)

In [None]:
list_of_valids = [ (g,idx, term) for (term, (g,idx)) in list(all_terms.items()) if len(idx) > 1 ]
sorted_list_of_valids = sorted(list_of_valids, key=lambda x: len(x[0].edges), reverse=True )
#sampled_G = sorted(list_of_valids, key=lambda x: len(x[0].nodes), reverse=True )[0][0]
len(list(list_of_valids))

In [None]:
def compare_jac(subg1, subg2):
    set1 = set(subg1.keys())
    set2 = set(subg2.keys())
    t_intersection = len(set1.intersection(set2))
    t_union = len(set1.union(set2))
    return 1. * t_intersection/(t_union+1)

In [None]:
def build_paths(g):
    seen = dict()
    for node_source,att in g.nodes(data=True):
        list_of_edges = g.edges(node_source, data=True)
        for _,node_target,att_target in list_of_edges:
            if node_source != node_target:
                for _,node_to_compare,att_to_compare in g.edges(node_target, data=True):
                    if node_source != node_to_compare :
                        if (node_source,node_to_compare) not in seen:
                            seen[(node_source,node_to_compare)] = att_target['weight'] + att_to_compare['weight']
                        else:
                            seen[(node_source,node_to_compare)] += att_target['weight'] + att_to_compare['weight']
    return seen

In [None]:
def build_paths_simple(g):
    seen = dict()
    for node_center,att in g.nodes(data=True):
        list_of_edges = list(g.edges(node_center, data=True))
        for i, (_, node_source, att_source) in enumerate(list_of_edges):
            for _, node_target, att_target in list_of_edges[i+1:]:
                if (node_source,node_target) not in seen:
                    seen[(node_source,node_target)] = att_target['weight'] + att_target['weight']
                else:
                    seen[(node_source,node_target)] += att_target['weight'] + att_target['weight']
    return seen

In [None]:
def get_one_hop(x):
    g, term = x
    seen = dict()
    list_of_edges = list(g.edges(term, data=True))
    for i, (_, node_source, att_source) in enumerate(list_of_edges):
        for _, node_target, att_target in list_of_edges[i+1:]:
            if (node_source,node_target) not in seen:
                seen[(node_source,node_target)] = att_target['weight'] + att_target['weight']
            else:
                seen[(node_source,node_target)] += att_target['weight'] + att_target['weight']
    return seen

In [None]:
def build_paths_parallel(g):
    seen = dict()
    with pool.Pool(12) as p:
        result = list(p.imap( get_one_hop, [ (g,term) for term in g.nodes ] ))
    for seen_atual in result:
        for ((node_source,node_target), weight) in seen_atual.items():
            if (node_source,node_target) not in seen:
                seen[(node_source,node_target)] = weight
            else:
                seen[(node_source,node_target)] += weight
            
    return seen

In [None]:
from multiprocessing import pool
from itertools import repeat
from multiprocessing import pool
from multiprocessing.pool import ThreadPool

def co_app(x):
    (((source, target),weight), g) = x
    return ((source, target), weight*compare_jac(g[source],g[target]))

def co_app2(x):
    (((source, neigh_source), (target, neigh_target)),weight) = x
    return ((source, target), weight*compare_jac(neigh_source,neigh_target))

In [None]:
eps = .8
def build_representation_graph(x):
    (g, idx, term) = x
    for s,t,att in g.edges(data=True):
        g.edges[ (s,t) ]['weight'] = att['count']/( g.degree(s) + g.degree(t) + 1 )
    seen = build_paths(g)
    mmax = max(1,int(eps*len(seen)))
    top_seen = sorted( seen.items(), key=lambda x: x[1], reverse=True )[:mmax]

    top_seen = [ (((source,g[source]), (target,g[target])),weight) for ((source, target),weight) in top_seen ]

    best_repr = list(map( co_app2, top_seen))
    final_graph = nx.Graph()
    for ((s,t), weight) in best_repr:
        final_graph.add_edge( s,t, weight=1./(weight+1.) )
    nodeslist=final_graph.nodes
    if len(nodeslist) == 0 or len(final_graph.edges) == 0:
        return term,{0: g}
    M = nx.to_scipy_sparse_matrix(final_graph, weight='weight', nodelist=nodeslist)
    if M.shape[0] == 0 or M.shape[1] == 0:
        return term,{0: g}
    nclusters = max(1,int(np.log(M.shape[0])))
    clustering = KMeans(n_clusters=nclusters, n_jobs=12)
    labels = clustering.fit_predict(M)
    clustered_graph = {}
    for (l, (s,t)) in list(zip(labels,nodeslist)):
        if l not in clustered_graph:
            clustered_graph[l] = nx.Graph()
        clustered_graph[l].add_edge(s,t)
    return term, clustered_graph

In [None]:
import random
shuffled_list_of_valids = sorted_list_of_valids.copy()
random.shuffle(shuffled_list_of_valids)
sorted_list_of_valids = sorted(list_of_valids, key=lambda x: len(x[0].edges))

In [None]:
results = list(map( build_representation_graph, tqdm(shuffled_list_of_valids, smoothing=0) ))

In [None]:
k = np.power(10, range(8))
r = 1.-np.power(np.e, np.log(k) )/ (1.+np.power(np.e, np.log(k) ))
r = (k*np.log(k))/(k*np.sqrt(k))
list(zip(k,k*r, k*0.8, 100.*r/k))

In [None]:
int(1.00000000e+05)

In [None]:
with ThreadPool(12) as p:
    results = list(p.imap( build_representation_graph, tqdm(shuffled_list_of_valids, smoothing=0) ))

In [None]:
def convert_to_mapper(old_dict):
    k = 0
    new_dict = {}
    for (term, rep) in old_dict:
        new_dict[term] = {}
        for idx_cluster, cluster_repr in rep.items():
            new_dict[term][k] = cluster_repr
            k += 1
    return new_dict, k

In [None]:
result_dict = dict(convert_to_mapper(results)[0])

In [None]:
result_dict['good'][3401].nodes

In [None]:
[ list(g.nodes) for g in result_dict['good'].values() ]

In [None]:
list_of_clusters = sorted([ (term,len(cls)) for (term,cls) in results ], key=lambda x: x[1], reverse=True)[:100]

for term,count in list_of_clusters:
    print("###########",term,"###########")
    for c in result_dict[term]:
        print('\t', c)
        print('\t\t',', '.join(sorted(result_dict[term][c].nodes)))

In [None]:
sum([ len(cls) for (term,cls) in results ])

In [None]:
len(list_of_valids)
k = plt.hist( [ len(x[0].nodes) for x in list_of_valids ], log=True, bins=100 )

In [None]:
len(list_of_valids)
k = plt.hist( [ len(x[0].edges) for x in list_of_valids ], log=True, bins=100 )

In [None]:
sampled_G = sorted(list_of_valids, key=lambda x: len(x[0].nodes), reverse=True )[0][0]
for s,t,att in sampled_G.edges(data=True):
    sampled_G.edges[ (s,t) ]['weight'] = 1./att['count']

In [None]:

term_mapper = {}
with pool.Pool(12) as p:
    for (g, idx, term) in sorted_list_of_valids:
        for s,t,att in g.edges(data=True):
            g.edges[ (s,t) ]['weight'] = att['count']/( g.degree(s) + g.degree(t) )
        seen = build_paths(g)
        
        top_seen = sorted( seen.items(), key=lambda x: x[1], reverse=True )[:int((1.-eps)*len(seen))]
        
        top_seen = [ (((source,g[source]), (target,g[target])),weight) for ((source, target),weight) in top_seen ]
        
        best_repr = list(p.imap( co_app2, tqdm_notebook(top_seen, desc='Building representatition (%s)' % term, position=1, total=len(top_seen)) ))
        final_graph = nx.Graph()
        for ((s,t), weight) in tqdm_notebook(best_repr, desc='Building final graph (%s)' % term):
            final_graph.add_edge( s,t, weight=1./(weight+1.) )
        nodeslist=final_graph.nodes
        M = nx.to_scipy_sparse_matrix(final_graph, weight='weight', nodelist=nodeslist)
        clustering = KMeans(n_clusters=int(np.log(M.shape[0])), n_jobs=14)
        labels = clustering.fit_predict(M)
        clustered_graph = {}
        for (l, (s,t)) in list(zip(labels,nodeslist)):
            if l not in clustered_graph:
                clustered_graph[l] = nx.Graph()
            clustered_graph[l].add_edge(s,t)
        term_mapper[term] = clustered_graph

In [None]:
def apply_comp(x):
    return (x, compare_jac(sampled_G[x[0]],sampled_G[x[1]]))

In [None]:
seen = build_paths(sampled_G)
nseen = sorted(seen, key=lambda x: seen[(x[0],x[1])], reverse=True)
with pool.Pool(12) as p:
    best_repr = list(p.imap( apply_comp, tqdm_notebook(nseen, position=1, total=len(nseen)) ))

In [None]:
eps = 0.5
BigG = nx.Graph()
seen = dict()
to_add = []
nodes_sorted = sorted( tqdm_notebook(sampled_G.nodes(data=True)), key=lambda x: np.log(sampled_G.degree(x[0]))/(1+abs(sampled_G.degree(x[0])-np.mean( [ int(sampled_G.degree(neigh)) for _,neigh in sampled_G.edges(x[0]) ] ))), reverse=True )
#nodes_sorted = nodes_sorted[:20]
#nodes_sorted = nodes_sorted[:int((1.-eps)*len(nodes_sorted))]
for node_source,att in tqdm_notebook(nodes_sorted, position=1, total=len(nodes_sorted)):
    list_of_edges = sampled_G.edges(node_source, data=True)
    for _,node_target,att_target in tqdm_notebook(list_of_edges, position=2, total=len(list_of_edges), disable=True):
        if node_source != node_target:
            for _,node_to_compare,att_to_compare in sampled_G.edges(node_target, data=True):
                if node_source != node_to_compare :
                    if (node_source,node_to_compare) not in seen:
                        seen[(node_source,node_to_compare)] = 1.
                    else:
                        seen[(node_source,node_to_compare)] += 1.

In [None]:
final_graph = nx.Graph()
for ((s,t), weight) in tqdm_notebook(best_repr):
    final_graph.add_edge( s,t, weight=1.-weight )

In [None]:
nodeslist=final_graph.nodes
M = nx.to_scipy_sparse_matrix(final_graph, weight='weight', nodelist=nodeslist)

In [None]:
clustering = KMeans(n_clusters=int(np.log(M.shape[0])), n_jobs=14)
labels = clustering.fit_predict(M)
Counter(labels)

In [None]:
clustered_graph = {}
for (l, (s,t)) in list(zip(labels,nodeslist)):
    if l not in clustered_graph:
        clustered_graph[l] = nx.Graph()
    clustered_graph[l].add_edge(s,t)

In [None]:
nodeslist = [ v for v in sampled_G.nodes ] 

In [None]:
M = nx.to_scipy_sparse_matrix(sampled_G, weight='weight', nodelist=nodeslist)

In [None]:
clustered_graph[1].edges

In [None]:
clustering = DBSCAN(min_samples=5, eps=.5, metric='precomputed')
labels = clustering.fit_predict(M)
Counter(labels)

In [None]:
co_G = nx.Graph()
for docid,G in tqdm_notebook(enumerate(Gs), total=len(Gs)):
    for s,t,att in G.edges(data=True):
        if (s,t) not in co_G.edges:
            co_G.add_edge(s,t,count=0, weight=0)
        co_G.edges[ s,t ]['weight'] += att['weight']
        co_G.edges[ s,t ]['count'] += 1
to_remove = []
for s,t,att in co_G.edges(data=True):
    if att['count'] < 2:
        to_remove.append((s,t))
for s,t in to_remove:
    co_G.remove_edge( s,t )
remove_vertex = []
for v in co_G.nodes:
    if not co_G.degree(v):
        remove_vertex.append(v)
for v in remove_vertex:
    co_G.remove_node(v)
len(co_G.edges), len(co_G.nodes)

In [None]:
k = dict()
k['3'] = 1
k['2'] = 2
k['1'] = 3

k2 = dict()
k2['5'] = 1
k2['4'] = 2
k2['3'] = 3

In [None]:
len(set(k2.keys()).intersection(k.keys()))

In [None]:
eps = 0.5
BigG = nx.Graph()
seen = dict()
to_add = []
with pool.Pool(12) as p:
    nodes_sorted = sorted( tqdm_notebook(co_G.nodes(data=True)), key=lambda x: np.log(co_G.degree(x[0]))/(1+abs(co_G.degree(x[0])-np.mean( [ int(co_G.degree(neigh)) for _,neigh in co_G.edges(x[0]) ] ))), reverse=True )
    #nodes_sorted = nodes_sorted[:20]
    nodes_sorted = nodes_sorted[:int((1.-eps)*len(nodes_sorted))]
    for node_source,att in tqdm_notebook(nodes_sorted, position=1, total=len(nodes_sorted)):
        list_of_edges = co_G.edges(node_source, data=True)
        for _,node_target,att_target in tqdm_notebook(list_of_edges, position=2, total=len(list_of_edges), disable=True):
            if node_source != node_target:
                for _,node_to_compare,att_to_compare in co_G.edges(node_target, data=True):
                    if node_to_compare != node_source :
                        if (node_source,node_to_compare) not in seen:
                            seen[(node_source,node_to_compare)] = 1.
                        else:
                            seen[(node_source,node_to_compare)] += 1.

In [None]:

nseen = sorted(seen, key=lambda x: seen[(x[0],x[1])], reverse=True)[:int((1.-.9)*len(seen))]

with pool.Pool(12) as p:
    best_repr = list(p.imap( apply_comp, tqdm_notebook(nseen, position=1, total=len(nseen)) ))
    


In [None]:
len(best_repr)

In [None]:
BigG = nx.Graph()
sorted_list_final = sorted(best_repr, key=lambda x: x[1]/seen[x[0]], reverse=True)[:int((1.-.9)*len(best_repr))]
for ((s,t), w) in tqdm_notebook(sorted_list_final, total=len(sorted_list_final)):
    BigG.add_edge(s,t,weight=1.-w/seen[(s,t)])

In [None]:
LG = nx.Graph()
eps = 0.1
for s,t,att in tqdm_notebook(BigG.edges(data=True), total=len(BigG.edges(data=True))):
    weight1 = BigG.edges[ s,t ]['weight']
    
    to_add_edge = []
    for p_s,p_t,p_att in BigG.edges(s, data=True):
        if (p_s,p_t) != (s,t):
            weight2 = BigG.edges[ p_s,p_t ]['weight']
            to_add_edge.append( (abs(weight1-weight2), (p_s,p_t)) )
            
    for p_s,p_t,p_att in BigG.edges(t, data=True):
        if (p_s,p_t) != (s,t):
            weight2 = BigG.edges[ p_s,p_t ]['weight']
            to_add_edge.append( (abs(weight1-weight2), (p_s,p_t)) )
    for w, (p_s,p_t) in sorted(to_add_edge, key=lambda x: x[0])[:int(eps*len(to_add_edge))]:
        LG.add_edge( (s,t), (p_s,p_t), weight=w)
len(LG.edges), len(LG.nodes)

In [None]:
mmean = np.mean( [ att['weight'] for _,_,att in tqdm_notebook(BigG.edges(data=True), total=len(BigG.edges(data=True))) ] )
for s,t,att in tqdm_notebook(BigG.edges(data=True), total=len(BigG.edges(data=True))):
    BigG.edges[ p_s,p_t ]['weight'] /= mmean

In [None]:
list_of_nodes = BigG.nodes
BigG_matrix = nx.to_scipy_sparse_matrix(BigG, weight='weight', nodelist=list_of_nodes)
len(BigG_matrix.nonzero()[0]), BigG_matrix.shape

In [None]:
from sklearn.cluster import KMeans, DBSCAN, OPTICS, SpectralClustering, MeanShift
from sklearn.cluster import AgglomerativeClustering, Birch
from collections import Counter
clustering = DBSCAN(min_samples=2, eps=1.2, metric='precomputed')
labels = clustering.fit_predict(BigG_matrix)
Counter(labels)


In [None]:
clusters = {}
for idcluster, v_source in zip(labels, list(list_of_nodes)):
    if idcluster not in clusters:
        clusters[idcluster] = set()
    clusters[idcluster].add(v_source)
biggest = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)[:100]

In [None]:
biggest[0]

In [None]:
clusters = {}
for idcluster, (v_source, v_target) in zip(labels, list(list_of_nodes)):
    if idcluster not in clusters:
        clusters[idcluster] = nx.Graph()
    clusters[idcluster].add_edge(v_source, v_target)
biggest = sorted(clusters.values(), key=lambda x: len(x), reverse=True)[:100]

In [None]:
biggest[5].edges

In [None]:
sorted(best_repr, key=lambda x: x[1]/seen[x[0]], reverse=True)[:100]

In [None]:
sorted(seen.items(), key=lambda x: x[1], reverse=True)[:20]

In [None]:
best_repr = list(p.imap( apply_comp, tqdm_notebook(seen, position=1, total=len(seen)) ))
#best_repr = [  for (node_source,node_to_compare) in seen ]
#best_repr = list(filter(lambda x: x[1] > eps, best_repr))
sorted_list = sorted(best_repr, key=lambda x: x[1], reverse=True)
to_add.extend(sorted_list)
    #for ((node_to_compare,node_source), weight) in sorted_list:
    #    BigG.add_edge(node_to_compare,node_source, weight=weight)

In [None]:
sum([co_G.degree(node_source), co_G.degree(node_source), co_G.degree(node_source)])

In [None]:
LG = nx.Graph()
eps = 0.1
for s,t,att in tqdm_notebook(BigG.edges(data=True), total=len(BigG.edges(data=True))):
    weight1 = BigG.edges[ s,t ]['weight']
    
    to_add_edge = []
    for p_s,p_t,p_att in BigG.edges(s, data=True):
        if (p_s,p_t) != (s,t):
            weight2 = BigG.edges[ p_s,p_t ]['weight']
            to_add_edge.append( (abs(weight1-weight2), (p_s,p_t)) )
            
    for p_s,p_t,p_att in BigG.edges(t, data=True):
        if (p_s,p_t) != (s,t):
            weight2 = BigG.edges[ p_s,p_t ]['weight']
            to_add_edge.append( (abs(weight1-weight2), (p_s,p_t)) )
    for w, (p_s,p_t) in sorted(to_add_edge, key=lambda x: x[0])[:int(eps*len(to_add_edge))]:
        LG.add_edge( (s,t), (p_s,p_t), weight=w)
len(LG.edges), len(LG.nodes)

In [None]:
BigG.edges(data=True)

In [None]:
clustering = DBSCAN(min_samples=10, eps=0.1, metric='precomputed')
labels = clustering.fit_predict(LG_matrix)
Counter(labels)

In [None]:
clustering = KMeans(n_clusters=500, n_jobs=12)
labels = clustering.fit_predict(LG_matrix)
Counter(labels)

In [None]:
terms = {}
for idc,g in clusters.items():
    if idc > -1:
        for t in g.nodes:
            if t not in terms:
                terms[t] = set()
            terms[t].add(idc)
terms_occ = sorted(terms.items(), key=lambda x: len(x[1]), reverse=True)[:100]
[ (x,len(y)) for x,y in terms_occ ]

In [None]:
clusters[5].edges

In [None]:
word = 'good'
for idc, g in clusters.items():
    if word in g and idc > -1 and len(g) < 1000:
        print(idc, list(g.edges), len(g))

In [None]:
ss = sorted(co_G.nodes, key=lambda x: co_G.degree(x), reverse=True )

In [None]:
ss[100:110], co_G.node['american']

In [None]:
relation_index = {}
ortogonal_graphs = []
for docid,G in tqdm_notebook(enumerate(Gs), total=len(Gs)):
    ortogonal_graph = nx.Graph()
    for s,t,att in G.edges(data=True):
        if (s,t) not in relation_index:
            relation_index[(s,t)] = []
        relation_index[(s,t)].append(docid)
        
        if (s,t) not in ortogonal_graph.nodes:
            ortogonal_graph.add_node( (s,t) )
        else:
            ortogonal_graph.node[(s,t)]['weight'] = att['weight']
            
        for p_s,p_t,p_att in G.out_edges(s, data=True):
            if (p_s,p_t) != (s,t):
                ortogonal_graph.add_edge( (s,t), (p_s,p_t), weight=abs( p_att['weight'] - att['weight'] ) )
        for p_s,p_t,p_att in G.in_edges(s, data=True):
            if (p_s,p_t) != (s,t):
                ortogonal_graph.add_edge( (s,t), (p_s,p_t), weight=abs( p_att['weight'] - att['weight'] ) )
                
        for p_s,p_t,p_att in G.out_edges(t, data=True):
            if (p_s,p_t) != (s,t):
                ortogonal_graph.add_edge( (s,t), (p_s,p_t), weight=abs( p_att['weight'] - att['weight'] ) )
        for p_s,p_t,p_att in G.in_edges(t, data=True):
            if (p_s,p_t) != (s,t):
                ortogonal_graph.add_edge( (s,t), (p_s,p_t), weight=abs( p_att['weight'] - att['weight'] ) )
    ortogonal_graphs.append(ortogonal_graph)

In [None]:
final_graph = nx.Graph()
for G_ortogonal in tqdm_notebook(ortogonal_graphs, total=len(ortogonal_graphs)):
    for ( e_source, e_target, att ) in G_ortogonal.edges(data=True):
        if final_graph.has_edge( e_source, e_target ):
            final_graph.edges[(e_source,e_target)]['count'] += 1
            final_graph.edges[(e_source,e_target)]['sum'] += att['weight']
        else:
            final_graph.add_edge(e_source, e_target)
            final_graph.edges[(e_source,e_target)]['count'] = 1
            final_graph.edges[(e_source,e_target)]['sum'] = att['weight']

In [None]:
len(final_graph.nodes), len(final_graph.edges)

In [None]:
all_degree = []
for v in final_graph.nodes:
    all_degree.append( final_graph.degree(v) )
all_degree = np.array(all_degree)
plt.xscale('log')
k = plt.hist(all_degree, log=True, bins=1000)

In [None]:
k = plt.hist(all_degree, log=True, bins=1000)

In [None]:
all_degree = []
for v_s,v_t in final_graph.edges:
    all_degree.append( final_graph.edges[v_s,v_t]['sum']/final_graph.edges[v_s,v_t]['count'] )
all_degree = np.array(all_degree)
k = plt.hist(all_degree, log=True, bins=100)

In [None]:
mmax = 0.
for node_s, node_t, att in final_graph.edges(data=True):
    #weight = np.log((final_graph.degree(node_s) + final_graph.degree(node_t))/(att['sum']+1) + 1)
    #weight = np.log(final_graph.degree(node_s) + final_graph.degree(node_t)) * att['sum']/att['count']
    #weight = 1./np.log(att['count']+1)
    weight = att['sum']/np.log(att['count']+1)
    final_graph.edges[node_s, node_t]['weight'] = weight
    mmax = max(mmax, weight)
for node_s, node_t, att in final_graph.edges(data=True):
    final_graph.edges[node_s, node_t]['weight'] = att['weight']/mmax

In [None]:
list_of_nodes = final_graph.nodes
LG_matrix = nx.to_scipy_sparse_matrix(final_graph, weight='weight', nodelist=list_of_nodes)
len(LG_matrix.nonzero()[0]), LG_matrix.shape

In [None]:
from sklearn.cluster import KMeans, DBSCAN, OPTICS, SpectralClustering, MeanShift
from sklearn.cluster import AgglomerativeClustering, Birch
from collections import Counter

In [None]:
LG_matrix[LG_matrix.nonzero()].mean()

In [None]:
clustering = Birch(n_clusters=1000)
labels = clustering.fit_predict(LG_matrix)
Counter(labels)

In [None]:
clustering = DBSCAN(min_samples=5, eps=LG_matrix[LG_matrix.nonzero()].mean(), metric='precomputed')
labels = clustering.fit_predict(LG_matrix)
Counter(labels)

In [None]:
clustering = KMeans(n_clusters=2500, n_jobs=12)
labels = clustering.fit_predict(LG_matrix)

In [None]:
cc = Counter(labels)
len(cc),cc

In [None]:
clusters = {}
for idcluster, (v_source, v_target) in zip(labels, list(list_of_nodes)):
    if idcluster not in clusters:
        clusters[idcluster] = nx.DiGraph()
    clusters[idcluster].add_edge(v_source, v_target)

In [None]:
clusters = {}
for idcluster, (v_source, v_target) in zip(labels, list(list_of_nodes)):
    if idcluster not in clusters:
        clusters[idcluster] = nx.DiGraph()
    clusters[idcluster].add_edge(v_source, v_target)
biggest = sorted(clusters.values(), key=lambda x: len(x), reverse=True)[:100]

In [None]:
[ len(g) for g in biggest ]

In [None]:
biggest[2].nodes

In [None]:
terms = {}
for idc,g in clusters.items():
    if idc > -1:
        for t in g.nodes:
            if t not in terms:
                terms[t] = set()
            terms[t].add(idc)
terms_occ = sorted(terms.items(), key=lambda x: len(x[1]), reverse=True)[:100]
[ (x,len(y)) for x,y in terms_occ ]

In [None]:
import sklearn

In [None]:
sklearn.__version__

In [None]:
labels = clustering.fit_predict(LG_matrix)

In [None]:
Counter(labels)

In [None]:
cc = Counter()
cc.update({1:10})
cc.update({1:5})
cc

In [None]:
for k in comm:
    print(k)
    break

In [None]:
comm = nx_cmm.girvan_newman(final_graph)

In [None]:
clustering = Birch(n_clusters=1000)
labels = clustering.fit_predict(LG_matrix)
Counter(labels)

In [None]:
c = next(comm)

In [None]:
spc_mtx = nx.to_scipy_sparse_matrix(final_graph, weight='count')

In [None]:
spc_mtx_2 = spc_mtx / spc_mtx.max()
spc_mtx

In [None]:
spc_mtx[ spc_mtx.nonzero() ].mean()

In [None]:
final_G = nx.Graph()
for (k,v) in tqdm_notebook(relation_index.items(), total=len(relation_index)):
    for docid in v:
        final_G.add_node( (docid, k), weigth=ortogonal_graphs[docid].node[k]['weight'])

In [None]:
final_G.edges

In [None]:
for (k,v) in tqdm_notebook(relation_index.items(), total=len(relation_index)):
    for i,docid_source in enumerate(v):
        for docid_target in v[i:]:
            weight = abs( ortogonal_graphs[docid_source].node[k]['weight']-ortogonal_graphs[docid_target].node[k]['weight'] )
            final_G.add_edge((docid_source, k),(docid_target, k), weight=weight )

In [None]:
from networkx.utils import groups
from networkx.utils import not_implemented_for
from networkx.utils import py_random_state


@py_random_state(2)
def asyn_lpa_communities(G, weight=None, seed=None):

    labels = {n: i for i, n in enumerate(G)}
    cont = True
    while cont:
        cont = False
        nodes = list(G)
        seed.shuffle(nodes)
        # Calculate the label for each node
        for node in nodes:
            if len(G[node]) < 1:
                continue

            # Get label frequencies. Depending on the order they are processed
            # in some nodes with be in t and others in t-1, making the
            # algorithm asynchronous.
            label_freq = Counter()
            for v in G[node]:
                label_freq.update({labels[v]: G.edges[v, node][weight]
                                   if weight else 1})
            # Choose the label with the highest frecuency. If more than 1 label
            # has the highest frecuency choose one randomly.
            max_freq = max(label_freq.values())
            best_labels = [label for label, freq in label_freq.items()
                           if freq == max_freq]
            new_label = seed.choice(best_labels)
            labels[node] = new_label
            # Continue until all nodes have a label that is better than other
            # neighbour labels (only one label has max_freq for each node).
            cont = cont or len(best_labels) > 1

    # TODO In Python 3.3 or later, this should be `yield from ...`.
    return iter(groups(labels).values())