In [40]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import random 
import networkx as nx 
import itertools 
import pickle 
from scipy.special import digamma

# load data

In [8]:
# cora dataset
G_cora=nx.read_adjlist('data/Cora_enrich/idx_adjlist.txt',nodetype=int,create_using=nx.DiGraph)
# subgraph from G ,with nodes {0-99}
G_cora_mini=G_cora.subgraph(list(range(100)))
# cora texts
texts_cora=np.loadtxt('data/Cora_enrich/BOW_texts.txt',dtype=np.int)
texts_cora_mini=texts_cora[:100,:]

# Block_PLSA

## utils

In [34]:
def index2ij(index,K):
    row=int(index/K) 
    col=index%K
    return row,col 

## input tranformation

In [113]:
# transfer input to observed variables
# PLSA part
ii,jj=np.nonzero(texts_cora_mini)
WS=np.repeat(jj,texts_cora_mini[ii,jj])
DS=np.repeat(ii,texts_cora_mini[ii,jj])
# blockmodel part
SS=[]
RS=[]
for e in G_cora_mini.edges:
    SS.append(e[0])
    RS.append(e[1])
SS=np.array(SS,dtype=np.int)
RS=np.array(RS,dtype=np.int)

## initialization

In [160]:
# model hyperparameters 
alpha=1e-2
K=3
D=texts_cora_mini.shape[0]
V=texts_cora_mini.shape[1]
L=len(SS)
N=len(WS)
# runtime parameters
n_iter_EM=10
n_iter_VI=10

In [159]:
# Initialize EM parameters
omega=np.zeros((K,D))
omega[:]=1/D
phi=np.zeros((K,V))
phi[:]=1/V
pi=np.zeros(K)
pi[:]=1/K 
    
# initialize VI parameters
gamma=np.zeros(K**2)
gamma[:]=1e-1
delta=np.zeros((L,K**2))
delta[:]=1/K**2
epsilon=np.zeros((N,K))
epsilon[:]=1/K 

In [161]:
# Initialize EM parameters randomly
beta=1e-1

omega=np.zeros((K,D))
for k in range(K):
    omega[k,:]=stats.dirichlet.rvs(np.repeat(beta,D))
    
phi=np.zeros((K,V))
for k in range(K):
    phi[k,:]=stats.dirichlet.rvs(np.repeat(beta,V))
    
pi=stats.dirichlet.rvs(np.repeat(beta,K)).flatten()
    
# initialize VI parameters
gamma=stats.dirichlet.rvs(np.repeat(beta,K**2))

delta=np.zeros((L,K**2))
for l in range(L):
    delta[l,:]=stats.dirichlet.rvs(np.repeat(beta,K**2))

epsilon=np.zeros((N,K))
for n in range(N):
    epsilon[n,:]=stats.dirichlet.rvs(np.repeat(beta,K))

In [162]:
# variational-EM
for it_em in range(n_iter_EM):
    # E-step
    for it_vi in range(n_iter_VI):
        # solve gamma&delta
        gamma=np.array([alpha+delta[:,k].sum() for k in range(K**2)])
        delta=np.array([[omega[index2ij(k,K)[0],SS[l]]*omega[index2ij(k,K)[1],RS[l]]*np.exp(digamma(gamma[k])) for k in range(K**2)]\
                        for l in range(L)])
        delta=delta/delta.sum(axis=1)[:,np.newaxis]
        print('gamma:'+str(gamma))
    # solve epsilon  
    epsilon=np.array([[omega[k,DS[n]]*phi[k,WS[n]]*pi[k] for k in range(K)] for n in range(N)])
    epsilon=epsilon/epsilon.sum(axis=1)[:,np.newaxis]
    # M-step
    # omega
    S_dist=np.zeros((L,K))
    R_dist=np.zeros((L,K))
    for l in range(L):
        S_dist[l,:]=delta[l,:].reshape(K,K).sum(axis=1)
        R_dist[l,:]=delta[l,:].reshape(K,K).sum(axis=0)
    term_1=term_2=term_3=0
    for d in range(D):
        ep_ids=np.where(DS==d)[0]
        term_1=epsilon[ep_ids,:].sum(axis=0)
        S_ids=np.where(SS==d)[0]
        term_2=S_dist[S_ids,:].sum(axis=0)
        R_ids=np.where(RS==d)[0]
        term_3=R_dist[R_ids,:].sum(axis=0)
        omega[:,d]=term_1+term_2+term_3
    omega=omega/omega.sum(axis=1)[:,np.newaxis]
    # phi
    for w in range(V):
        ep_ids=np.where(WS==w)[0]
        phi[:,w]=epsilon[ep_ids,:].sum(axis=0)
    phi=phi/phi.sum(axis=1)[:,np.newaxis]
    # pi
    pi=epsilon.sum(axis=0)
    pi=pi/pi.sum()
    print('%d iter...'%(it_em))

gamma:[1.65162226 2.46266767 4.64564961 6.70627343 2.45756535 4.70440935
 3.93725333 2.10447683 2.42008217]
gamma:[1.9618268  2.64602403 1.15224164 5.12737217 3.8803186  3.74150142
 5.39141353 2.72141092 4.46789089]
gamma:[2.03610283 2.61406946 0.62997006 4.83443842 4.30796448 3.94889259
 5.19647809 2.68831798 4.83376608]
gamma:[2.06434538 2.58431664 0.32361963 4.7484433  4.40419011 4.26582801
 5.10214947 2.664512   4.93259546]
gamma:[2.07645865 2.57579245 0.07310623 4.70101874 4.42238368 4.55043098
 5.06901757 2.65804701 4.96374468]
gamma:[2.08164115 2.57279626 0.01000114 4.66674487 4.4208501  4.65124924
 5.05758687 2.65656168 4.97256869]
gamma:[2.08419278 2.57176258 0.01       4.65213645 4.41926845 4.66703859
 5.05394851 2.6564702  4.97518243]
gamma:[2.08533006 2.57147764 0.01       4.64828089 4.41911832 4.67034723
 5.05280174 2.65655297 4.97609114]
gamma:[2.08574243 2.57132783 0.01       4.64729823 4.41914006 4.67110343
 5.05240474 2.65658366 4.97639961]
gamma:[2.08588373 2.57124776

6 iter...
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.25029263e+01 1.00000000e-02 8.51707370e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.26917044e+01 1.00000000e-02 8.32829564e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.27757319e+01 1.00000000e-02 8.24426808e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.28133389e+01 1.00000000e-02 8.20666108e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.28302123e+01 1.00000000e-02 8.18978769e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.28377916e+01 1.00000000e-02 8.18220841e+00 1.00000000e-02
 1.00000000e-02]
gamma:[1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 2.28411978e+01 1.00000000e-02 8.17880217e+00 1.00000

In [169]:
# derive expectation from posterior (variational) distribution
theta=gamma/gamma.sum()
theta=theta.reshape(K,K)

# evaluation 

## utils

In [176]:
with open('data/Cora_enrich/tokens.pickle','rb') as f:
    tokens=pickle.load(f)

In [186]:
labels=[]
with open('data/Cora_enrich/labels.txt') as f:
    for line in f:
        labels.append(line.strip())
labels=np.array(labels)
labels_mini=labels[:100]

In [177]:
def get_top_tokens(phi,tokens,top=10):
    results=[]
    for i in range(phi.shape[0]):
        results.append(tokens[np.argsort(-phi[i,:])[:top]])
    return results 

In [188]:
def get_top_docs(omega,labels,top=10):
    results=[]
    for i in range(omega.shape[0]):
        results.append(labels[np.argsort(-omega[i,:])[:top]])
    return results 

## evaluate 

In [170]:
theta 

array([[3.21646832e-04, 3.21646832e-04, 3.21646832e-04],
       [3.21646832e-04, 7.50639648e-01, 3.21646832e-04],
       [2.47108824e-01, 3.21646832e-04, 3.21646832e-04]])

In [178]:
get_top_tokens(phi,tokens,10)

[array(['merg', 'learn', 'exampl', 'use', 'set', 'model', 'algorithm',
        'problem', 'system', 'attribut'], dtype='<U63'),
 array(['use', 'algorithm', 'case', 'system', 'problem', 'learn',
        'function', 'base', 'gener', 'model'], dtype='<U63'),
 array(['distribut', 'case', 'probabl', 'paper', 'result', 'sequenc',
        'see', 'al', 'use', 'theori'], dtype='<U63')]

In [189]:
get_top_docs(omega,labels_mini,10)

[array(['Rule_Learning', 'Case_Based', 'Probabilistic_Methods',
        'Genetic_Algorithms', 'Probabilistic_Methods', 'Theory',
        'Case_Based', 'Theory', 'Rule_Learning', 'Theory'], dtype='<U22'),
 array(['Genetic_Algorithms', 'Probabilistic_Methods',
        'Genetic_Algorithms', 'Case_Based', 'Neural_Networks', 'Theory',
        'Probabilistic_Methods', 'Neural_Networks', 'Neural_Networks',
        'Genetic_Algorithms'], dtype='<U22'),
 array(['Theory', 'Neural_Networks', 'Probabilistic_Methods',
        'Genetic_Algorithms', 'Genetic_Algorithms', 'Theory',
        'Neural_Networks', 'Case_Based', 'Neural_Networks',
        'Neural_Networks'], dtype='<U22')]

In [166]:
pi 

array([0.4188047 , 0.46921381, 0.11198149])

In [167]:
phi 

array([[6.85477601e-05, 1.41776384e-03, 1.50295957e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.26849799e-05, 6.45454743e-13, 2.16534675e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.84756161e-14, 2.90765742e-05, 3.20254078e-13, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [168]:
omega

array([[2.39722539e-02, 8.62296576e-03, 5.00249942e-03, 1.52731947e-02,
        6.98345762e-03, 6.33534337e-04, 2.00734222e-02, 1.08789499e-02,
        6.79183417e-03, 9.61248611e-02, 1.76413378e-02, 1.23799961e-02,
        1.13520390e-02, 1.90177004e-02, 2.48976693e-02, 2.12050833e-02,
        2.71926139e-02, 1.79132626e-04, 3.72982908e-02, 1.21968237e-02,
        9.62137619e-04, 1.54398556e-03, 1.24723975e-02, 1.06835250e-05,
        2.59936060e-07, 1.33241811e-03, 4.71725467e-03, 4.54675288e-03,
        8.44966701e-03, 1.86091919e-04, 1.80768890e-02, 2.88099465e-02,
        7.95367776e-03, 1.55694675e-02, 3.69280889e-03, 3.23292090e-03,
        3.40846176e-03, 1.01909166e-03, 8.11685264e-04, 4.86853025e-03,
        2.25772848e-02, 1.34992704e-02, 1.09129521e-04, 2.02019698e-02,
        7.85155343e-03, 9.05570083e-04, 9.95694754e-03, 5.36198902e-03,
        4.33044470e-06, 4.06472380e-03, 3.16722891e-03, 2.61145991e-03,
        2.37915841e-02, 1.51951649e-02, 2.17037815e-02, 6.331719

In [110]:
    S_dist=np.zeros((L,K))
    R_dist=np.zeros((L,K))
    for l in range(L):
        S_dist[l,:]=delta[l,:].reshape(K,K).sum(axis=1)
        R_dist[l,:]=delta[l,:].reshape(K,K).sum(axis=0)
    term_1=term_2=term_3=0
    for d in range(D):
        ep_ids=np.where(DS==d)[0]
        term_1=epsilon[ep_ids,:].sum(axis=0)
        S_ids=np.where(SS==d)[0]
        term_2=S_dist[S_ids,:].sum(axis=0)
        R_ids=np.where(RS==d)[0]
        term_3=R_dist[R_ids,:].sum(axis=0)
        omega[:,d]=term_1+term_2+term_3

In [109]:
omega.shape  

(7, 100)

In [102]:
term_1 

array([200.71428571, 200.71428571, 200.71428571, 200.71428571,
       200.71428571, 200.71428571, 200.71428571])

In [107]:
S_dist[[],:].sum(axis=0)

array([0., 0., 0., 0., 0., 0., 0.])

In [105]:
S_ids

array([], dtype=int64)

In [103]:
term_2

array([0., 0., 0., 0., 0., 0., 0.])

In [104]:
term_3

array([0., 0., 0., 0., 0., 0., 0.])

In [101]:
omega[:,-1]

array([200.71428571, 200.71428571, 200.71428571, 200.71428571,
       200.71428571, 200.71428571, 200.71428571])

In [96]:
S_ids 

array([], dtype=int64)

In [97]:
epsilon[ep_ids,:].shape 

(1405, 7)

In [93]:
np.repeat(0.1,7)

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

In [94]:
term_2+np.repeat(0.1,7)

array([], shape=(0, 7), dtype=float64)

In [77]:
omega.shape 

(7, 100)

In [72]:
l=1 
delta[l,:].reshape(K,K).sum(axis=1)

array([0.14285714, 0.14285714, 0.14285714, 0.14285714, 0.14285714,
       0.14285714, 0.14285714])

In [63]:
np.array([[omega[k,DS[n]]*phi[k,WS[n]]*pi[k] for k in range(K)] for n in range(N)])

(82074, 7)

In [52]:
[[omega[index2ij(k,K)[0],SS[l]]*omega[index2ij(k,K)[1],RS[l]]*np.exp(digamma(gamma[k])) for k in range(K**2)] for l in range(L)]

[[5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.6145948356688514e-05,
  5.61459483

In [36]:
[[i+j for j in range(3)] for i in range(3)]

[[0, 1, 2], [1, 2, 3], [2, 3, 4]]

In [14]:
WS

array([  21,   22,   22, ..., 3934, 3934, 3957], dtype=int64)

In [3]:
# variational-EM (VBEM) for Block_PLSA