[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Sohrabbeig/TopicModeling/blob/master/src/NPB.ipynb)

# Topic Modeling using NPB

## Import needed libraries

In [12]:
import os
import pandas as pd
from gensim import models
from gensim import corpora
import pyLDAvis as pyldavis

## Define needed functions

In [2]:
def sparse_bbow(words):
    """Convert a binary bag-of-words mapping into a sparse list of pairs.

    Args:
        words: Any object exposing ``.items()`` that yields
            ``(index, value)`` pairs (e.g. a dict or a pandas Series).

    Returns:
        A list of ``(index, value)`` tuples, in iteration order, holding only
        the entries whose value compares equal to 1. Entries with any other
        value (zeros, strings, ...) are dropped.
    """
    present = []
    for index, value in words.items():
        if value == 1:
            present.append((index, value))
    return present

## Load Dataset

### cora.cites

In [3]:
# Directory holding the Cora files, relative to this notebook.
data_dir = os.path.expanduser("../Datasets/cora")

# cora.cites is a headerless, tab-separated two-column file; the columns are
# labelled "target" and "source" here.
edgelist = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    names=["target", "source"],
    header=None,
    sep="\t",
)
edgelist.head()

Unnamed: 0,target,source
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960


### cora.content

In [4]:
# Integer labels 0..1432 for the 1433 feature columns, followed by the
# class label column.
feature_names = list(range(1433))
column_names = [*feature_names, "subject"]

# cora.content is headerless and tab-separated. It carries one more column
# than `column_names`, so pandas uses the leading column as the row index
# (visible as the row labels in the preview below).
node_data = pd.read_csv(
    os.path.join(data_dir, "cora.content"),
    names=column_names,
    header=None,
    sep="\t",
)
node_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,...,1363,1364,1365,1366,1367,1368,1369,1370,1371,1372,1373,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421,1422,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432,subject
31336,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,Neural_Networks
1061127,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,Rule_Learning
1106406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
13195,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
37879,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Probabilistic_Methods


## Transform the dataset into a sparse bag-of-words corpus

In [14]:
# One sparse binary bag-of-words per row of `node_data`: a list of
# (feature_index, 1) pairs. The "subject" label never equals 1, so
# `sparse_bbow` drops it automatically.
bbow_corpus = [sparse_bbow(row) for _, row in node_data.iterrows()]

# The same documents as lists of string tokens (feature index rendered as text).
texts = [[str(index) for index, _ in doc] for doc in bbow_corpus]

# Build the dictionary FROM THE CORPUS so that its ids are exactly the feature
# indices used in `bbow_corpus` (id -> str(id)). `corpora.Dictionary(texts)`
# would instead hand out fresh ids in first-seen order, so dictionary ids would
# no longer match the corpus ids and any consumer that receives both (e.g. the
# pyLDAvis visualisation) would label terms incorrectly.
dictionary = corpora.Dictionary.from_corpus(bbow_corpus)

In [22]:
# Fit TF-IDF weights on the sparse binary bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bbow_corpus)

# Sanity check: show the TF-IDF weights of the first row of `node_data`.
print(
    tfidf[
        sparse_bbow(node_data.iloc[0])
    ]
)

[(118, 0.23814769376007022), (125, 0.20535402833474178), (176, 0.2220479218781867), (252, 0.2819939177104131), (351, 0.27571578988483514), (456, 0.23923943345356208), (507, 0.08560014618353899), (521, 0.2106632550168035), (619, 0.14671066466242966), (648, 0.1912097976554182), (698, 0.18496429700987266), (702, 0.2072918321734374), (734, 0.2569841476824823), (845, 0.3455025756073674), (902, 0.31651196101521617), (1205, 0.2072918321734374), (1209, 0.09462368861676156), (1236, 0.23814769376007022), (1352, 0.14018728131527128), (1426, 0.1953932095032192)]


## Train LDA

In [16]:
# Train a 7-topic LDA model on the binary bag-of-words corpus.
# NOTE(review): num_topics=7 presumably mirrors the number of Cora subject
# classes -- confirm.
# LDA training is stochastic; a fixed `random_state` makes the topics below
# reproducible across "Restart Kernel -> Run All".
lda = models.LdaModel(bbow_corpus, num_topics=7, random_state=42)

# Persist the trained model next to the notebook.
lda.save("model1.gensim")

# Show the four highest-weighted terms (feature ids) of every topic.
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.023*"1209" + 0.017*"19" + 0.014*"774" + 0.013*"1263"')
(1, '0.023*"1177" + 0.019*"507" + 0.013*"1263" + 0.011*"495"')
(2, '0.025*"1177" + 0.020*"1263" + 0.016*"507" + 0.015*"1209"')
(3, '0.018*"774" + 0.017*"1177" + 0.017*"19" + 0.014*"1263"')
(4, '0.023*"19" + 0.022*"1263" + 0.020*"1177" + 0.016*"507"')
(5, '0.026*"1177" + 0.017*"1263" + 0.013*"1209" + 0.013*"507"')
(6, '0.029*"1263" + 0.020*"1177" + 0.013*"1075" + 0.012*"1381"')


In [18]:
# `import pyLDAvis` alone does not load the gensim adapter submodule, so
# `pyldavis.gensim` raises AttributeError on a fresh kernel. Import the adapter
# explicitly; it is named `gensim_models` in pyLDAvis >= 3.3 and `gensim` in
# earlier releases.
try:
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    import pyLDAvis.gensim as pyldavis_gensim

# NOTE: the term ids in `dictionary` must be the same ids used in
# `bbow_corpus`, otherwise the terms shown in the visualisation are mislabelled.
lda_display = pyldavis_gensim.prepare(lda, bbow_corpus, dictionary)
pyldavis.display(lda_display)