In [1]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import random 
import networkx as nx 
import itertools 
import pickle 
from scipy.special import digamma
import timeit
from Block_PLSA.beta import Block_PLSA 
from Block_PLSA.utils_beta import draw_image_matrix,get_normalized_theta,get_top_docs,get_top_tokens,get_sub_input
import sklearn.metrics as metrics 
import lda 

In [2]:
# Cora citation graph: read the adjacency list as a directed graph, parsing
# node ids as integers.
G_cora=nx.read_adjlist('data/Cora_enrich/idx_adjlist.txt',nodetype=int,create_using=nx.DiGraph)
# Cora texts, loaded as an integer matrix. Presumably one row per document and
# one column per vocabulary token (bag-of-words counts, going by the file
# name) -- TODO confirm against the data preparation code.
# `np.int` was only an alias for the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used to keep this cell
# runnable on current NumPy while producing the same dtype.
texts_cora=np.loadtxt('data/Cora_enrich/BOW_texts_3876.txt',dtype=int)

In [3]:
# Vocabulary: unpickle the token sequence and keep it as a NumPy array.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle from a trusted source.
with open('data/Cora_enrich/tokens_3876.pickle','rb') as token_file:
    tokens=np.array(pickle.load(token_file))

# Labels: one entry per line of the file, with surrounding whitespace stripped.
with open('data/Cora_enrich/labels.txt') as label_file:
    labels=np.array([raw_line.strip() for raw_line in label_file])

In [4]:
# Display the distinct label values present in `labels`.
set(labels)

{'Case_Based',
 'Genetic_Algorithms',
 'Neural_Networks',
 'Probabilistic_Methods',
 'Reinforcement_Learning',
 'Rule_Learning',
 'Theory'}

In [5]:
# Class names handed to get_sub_input to pick the subset of the data to model.
classes=['Neural_Networks','Genetic_Algorithms','Reinforcement_Learning']

In [6]:
# Derive the sub-graph, sub-labels and sub-texts for the chosen classes; the
# fourth return value is discarded.
# NOTE(review): get_sub_input is project code not shown here -- presumably it
# keeps only the nodes whose label is in `classes`; confirm in utils_beta.
G_sub,labels_sub,texts_sub,_=get_sub_input(classes,G_cora,labels,texts_cora)

In [7]:
# LDA baseline with one topic per selected class, so each topic can later be
# paired with exactly one class name. The topic count is derived from
# `classes` instead of being hard-coded, so it cannot drift if the class
# subset is changed. random_state is fixed so the sampler is reproducible.
model = lda.LDA(n_topics=len(classes), n_iter=1500, random_state=1)

In [8]:
# Fit the topic model on the selected sub-corpus; the library's INFO log of
# the run is emitted as this cell's output.
model.fit(texts_sub)  # model.fit_transform(X) is also available

INFO:lda:n_documents: 1405
INFO:lda:vocab_size: 3876
INFO:lda:n_words: 963123
INFO:lda:n_topics: 3
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -8007433
INFO:lda:<10> log likelihood: -7368719
INFO:lda:<20> log likelihood: -7192415
INFO:lda:<30> log likelihood: -7164994
INFO:lda:<40> log likelihood: -7150654
INFO:lda:<50> log likelihood: -7144616
INFO:lda:<60> log likelihood: -7140266
INFO:lda:<70> log likelihood: -7133214
INFO:lda:<80> log likelihood: -7128634
INFO:lda:<90> log likelihood: -7122945
INFO:lda:<100> log likelihood: -7119518
INFO:lda:<110> log likelihood: -7115283
INFO:lda:<120> log likelihood: -7115296
INFO:lda:<130> log likelihood: -7113622
INFO:lda:<140> log likelihood: -7109108
INFO:lda:<150> log likelihood: -7106522
INFO:lda:<160> log likelihood: -7105707
INFO:lda:<170> log likelihood: -7105368
INFO:lda:<180> log likelihood: -7103392
INFO:lda:<190> log likelihood: -7105616
INFO:lda:<200> log likelihood: -7103282
INFO:lda:<210> log likelihood: -7102610
INFO:lda:<

<lda.lda.LDA at 0x2066bd44198>

In [9]:
# Topic-word matrix of the fitted model (used for the top-token listing).
# Presumably shaped (n_topics, vocab_size) -- TODO confirm.
topic_word = model.topic_word_

In [10]:
# Document-topic matrix of the fitted model; rows are indexed per document
# when the hard topic assignment is computed.
doc_topic = model.doc_topic_

In [11]:
# Show, for each topic, its highest-weighted tokens as (token, weight) pairs.
get_top_tokens(topic_word,tokens)

[[('network', 0.021122224436848306),
  ('train', 0.009111688719769516),
  ('data', 0.00799321031177673),
  ('neural', 0.007936412580120848),
  ('input', 0.006820118700268671),
  ('weight', 0.006381028544005878),
  ('linear', 0.005841450093274983),
  ('time', 0.005793390474181543),
  ('case', 0.005194829763654154),
  ('error', 0.005026621096827113)],
 [('genet', 0.020014080809131196),
  ('gen', 0.01753883054611857),
  ('ga', 0.015842569658243995),
  ('program', 0.014846938267535007),
  ('popul', 0.01240395383662869),
  ('search', 0.011717152553222953),
  ('fit', 0.010716911757927348),
  ('gp', 0.008343068395820266),
  ('evolv', 0.007988144242650859),
  ('select', 0.007923612578438238)],
 [('al', 0.009059852888515437),
  ('et', 0.00843945982688053),
  ('state', 0.00825230214348229),
  ('network', 0.00815872330178317),
  ('task', 0.00815872330178317),
  ('control', 0.007105094861911596),
  ('time', 0.006751574793270475),
  ('action', 0.006519360630535621),
  ('environ', 0.0057152757685283

In [12]:
# Hard-assign every document to its highest-weight topic: a single row-wise
# argmax over the document-topic matrix, kept as a plain Python list of
# NumPy integers.
y_pred=list(np.argmax(doc_topic,axis=1))

In [13]:
# Adjusted mutual information between the true class labels and the topic
# assignments. This score does not depend on how cluster ids are named, so
# the raw topic indices can be used directly.
metrics.adjusted_mutual_info_score(labels_sub,y_pred)

0.3402862335871631

In [14]:
# Map each topic index to a class name so predictions can be compared with
# the string labels.
# NOTE(review): this pairing is hard-coded; presumably it was chosen by
# reading the top tokens of each topic. LDA topic numbering is arbitrary, so
# re-check the mapping whenever the model, seed or data changes.
idx_class_dict={0:'Neural_Networks',1:'Genetic_Algorithms',2:'Reinforcement_Learning'}

In [15]:
# Translate each predicted topic index into its class name.
labels_pred=[idx_class_dict[i] for i in y_pred]

In [16]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores) of the
# predicted class names against the true labels.
metrics.f1_score(labels_sub,labels_pred,average='macro')

0.7036739620317342

In [17]:
# Fraction of documents whose predicted class name equals the true label.
metrics.accuracy_score(labels_sub,labels_pred)

0.7274021352313167