In [8]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import random 
import networkx as nx 
import itertools 
import pickle 
from scipy.special import digamma
import timeit
from Block_PLSA.beta import Block_PLSA 
from Block_PLSA.utils_beta import draw_image_matrix,get_normalized_theta,get_top_docs,get_top_tokens,get_sub_input
import sklearn.metrics as metrics 
import lda 

In [9]:
# cora dataset
G_cora=nx.read_adjlist('data/Cora_enrich/idx_adjlist.txt',nodetype=int,create_using=nx.DiGraph)
# cora texts
texts_cora=np.loadtxt('data/Cora_enrich/BOW_texts_3876.txt',dtype=np.int)

In [10]:
with open('data/Cora_enrich/tokens_3876.pickle','rb') as f:
    tokens=pickle.load(f)
tokens=np.array(tokens)

labels=[]
with open('data/Cora_enrich/labels.txt') as f:
    for line in f:
        labels.append(line.strip())
labels=np.array(labels)

In [49]:
classes=['Neural_Networks','Genetic_Algorithms','Case_Based']

In [50]:
G_sub,labels_sub,texts_sub,_=get_sub_input(classes,G_cora,labels,texts_cora)

In [51]:
model = lda.LDA(n_topics=3, n_iter=1500, random_state=1)

In [52]:
model.fit(texts_sub)  # model.fit_transform(X) is also available

INFO:lda:n_documents: 1493
INFO:lda:vocab_size: 3876
INFO:lda:n_words: 1019235
INFO:lda:n_topics: 3
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -8481727
INFO:lda:<10> log likelihood: -7805033
INFO:lda:<20> log likelihood: -7616898
INFO:lda:<30> log likelihood: -7581490
INFO:lda:<40> log likelihood: -7566715
INFO:lda:<50> log likelihood: -7557193
INFO:lda:<60> log likelihood: -7552441
INFO:lda:<70> log likelihood: -7548674
INFO:lda:<80> log likelihood: -7543948
INFO:lda:<90> log likelihood: -7540901
INFO:lda:<100> log likelihood: -7539250
INFO:lda:<110> log likelihood: -7535990
INFO:lda:<120> log likelihood: -7535080
INFO:lda:<130> log likelihood: -7534002
INFO:lda:<140> log likelihood: -7531427
INFO:lda:<150> log likelihood: -7530999
INFO:lda:<160> log likelihood: -7531381
INFO:lda:<170> log likelihood: -7530064
INFO:lda:<180> log likelihood: -7528523
INFO:lda:<190> log likelihood: -7528281
INFO:lda:<200> log likelihood: -7528668
INFO:lda:<210> log likelihood: -7528503
INFO:lda:

<lda.lda.LDA at 0x26da1fbe470>

In [53]:
topic_word = model.topic_word_

In [54]:
doc_topic = model.doc_topic_

In [55]:
get_top_tokens(topic_word,tokens)

[[('case', 0.012759877197234917),
  ('genet', 0.011164245054113012),
  ('gen', 0.009912207434469234),
  ('program', 0.009131310790200766),
  ('ga', 0.008811143166050694),
  ('search', 0.008189028839450148),
  ('design', 0.007345660463640202),
  ('process', 0.007087964571031608),
  ('knowledg', 0.0069031523652214035),
  ('popul', 0.0067131341817827424)],
 [('data', 0.009429243880441685),
  ('linear', 0.007117420690470497),
  ('case', 0.0065296690320032465),
  ('time', 0.00616022513239526),
  ('train', 0.006129438140761261),
  ('valu', 0.006011887809067811),
  ('approxim', 0.005589266378455643),
  ('weight', 0.005578071108770553),
  ('point', 0.005566875839085462),
  ('error', 0.005541686482294009)],
 [('network', 0.03358962035060991),
  ('et', 0.012001457012097433),
  ('neural', 0.011623601177701065),
  ('al', 0.011612805296718312),
  ('input', 0.008539577843627855),
  ('train', 0.008280476700041773),
  ('neuron', 0.007495976015295029),
  ('unit', 0.007474384253329522),
  ('sequenc', 0.

In [56]:
y_pred=[]
for i in range(doc_topic.shape[0]):
    y_pred.append(np.argmax(doc_topic[i,:])) 

In [57]:
metrics.adjusted_mutual_info_score(labels_sub,y_pred)

0.39843862298937416

In [45]:
idx_class_dict={0:'Genetic_Algorithms',1:'Neural_Networks',2:'Theory'}

In [46]:
labels_pred=[idx_class_dict[i] for i in y_pred]

In [47]:
metrics.f1_score(labels_sub,labels_pred,average='macro')

0.8313276348622765

In [48]:
metrics.accuracy_score(labels_sub,labels_pred)

0.8350582147477361