In [1]:
import logging
import numpy as np
import pandas as pd
from collections import defaultdict
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import PorterStemmer

# Emit timestamped INFO-level log records so library progress messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# The CSV is read without a header row, so the column names are supplied explicitly.
node_info = pd.read_csv(
    '../data/node_information.csv',
    header=None,
    names=['id', 'year', 'title', 'authors', 'journal', 'abstract'],
)
node_info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [3]:
stemmer = PorterStemmer()

def tokenizer(line):
    """Turn one line of text into a list of stemmed tokens.

    The line is split on whitespace. A raw token is dropped when it is a
    single character or a stopword; digit characters are then removed from
    the surviving tokens, each is passed through the Porter stemmer, and the
    same length/stopword filter is applied once more to the stemmed form.
    """
    kept = []
    for raw in line.strip().split():
        if len(raw) <= 1 or raw in STOPWORDS:
            continue
        # Drop digit characters before stemming.
        without_digits = ''.join(ch for ch in raw if not ch.isdigit())
        stemmed = stemmer.stem(without_digits)
        if len(stemmed) > 1 and stemmed not in STOPWORDS:
            kept.append(stemmed)
    return kept

In [4]:
# Tokenize every abstract, in row order.
texts = [tokenizer(abstract) for abstract in node_info['abstract']]

In [5]:
# Corpus-wide occurrence count of every token.
frequency = defaultdict(int)
for token in (tok for doc_tokens in texts for tok in doc_tokens):
    frequency[token] += 1

# Keep only tokens that occur more than twice overall.
# NOTE: `texts` is rebound here, so re-running this cell counts the already-filtered tokens.
texts = [[tok for tok in doc_tokens if frequency[tok] > 2] for doc_tokens in texts]

In [6]:
# Build the token -> id mapping, then prune it: per the log output of this cell,
# tokens present in fewer than 5 documents or in more than 60% of them are discarded.
dictionary = gensim.corpora.Dictionary(documents=texts)
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.save('gensim/word2id.dict')

2018-03-11 00:36:12,470 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-03-11 00:36:13,323 : INFO : adding document #10000 to Dictionary(8576 unique tokens: ['benefit', 'tehran', 'post-big', 'travel', 'bremsstrahlung']...)
2018-03-11 00:36:14,210 : INFO : adding document #20000 to Dictionary(10040 unique tokens: ['benefit', 'tehran', 'post-big', 'travel', 'bremsstrahlung']...)
2018-03-11 00:36:14,714 : INFO : built Dictionary(10313 unique tokens: ['benefit', 'tehran', 'post-big', 'travel', 'bremsstrahlung']...) from 27770 documents (total 1448276 corpus positions)
2018-03-11 00:36:14,729 : INFO : discarding 3632 tokens: [('kxt', 3), ('poor', 4), ('maxima', 4), ('ivanov', 4), ('disclin', 4), ('chromofield', 2), ('sin-gordon', 3), ('-soliton', 4), ('vein', 3), ("manifold'", 3)]...
2018-03-11 00:36:14,730 : INFO : keeping 6681 tokens which were in no less than 5 and no more than 16662 (=60.0%) documents
2018-03-11 00:36:14,748 : INFO : resulting dictionary: Dictionary(

In [7]:
# Convert each token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [8]:
# Show a summary of the pruned dictionary (number of unique tokens and a few samples).
print(dictionary)

Dictionary(6681 unique tokens: ['benefit', 'tehran', 'travel', 'bremsstrahlung', 'modifi']...)


In [9]:
# Fit the TF-IDF model on the bag-of-words corpus and persist it.
tfidf = gensim.models.TfidfModel(corpus=corpus, id2word=dictionary)
tfidf.save('gensim/model.tfidf')

# Apply the fitted model to the corpus and store the result in Matrix Market format.
corpus_tfidf = tfidf[corpus]
gensim.corpora.MmCorpus.serialize(fname='gensim/corpus_tfidf.mm', corpus=corpus_tfidf)

2018-03-11 00:36:16,209 : INFO : collecting document frequencies
2018-03-11 00:36:16,212 : INFO : PROGRESS: processing document #0
2018-03-11 00:36:16,314 : INFO : PROGRESS: processing document #10000
2018-03-11 00:36:16,392 : INFO : PROGRESS: processing document #20000
2018-03-11 00:36:16,461 : INFO : calculating IDF weights for 27770 documents and 6680 features (1085999 matrix non-zeros)
2018-03-11 00:36:16,465 : INFO : saving TfidfModel object under gensim/model.tfidf, separately None
2018-03-11 00:36:16,469 : INFO : saved gensim/model.tfidf
2018-03-11 00:36:16,470 : INFO : storing corpus in Matrix Market format to gensim/corpus_tfidf.mm
2018-03-11 00:36:16,471 : INFO : saving sparse matrix to gensim/corpus_tfidf.mm
2018-03-11 00:36:16,471 : INFO : PROGRESS: saving document #0
2018-03-11 00:36:16,624 : INFO : PROGRESS: saving document #1000
2018-03-11 00:36:16,806 : INFO : PROGRESS: saving document #2000
2018-03-11 00:36:16,946 : INFO : PROGRESS: saving document #3000
2018-03-11 00:

In [10]:
# Train an LSI model with 4200 topics on the TF-IDF corpus and persist it.
lsi = gensim.models.LsiModel(corpus=corpus_tfidf, num_topics=4200, id2word=dictionary)
lsi.save('gensim/model_4200.lsi')

2018-03-11 00:36:20,304 : INFO : using serial LSI version on this node
2018-03-11 00:36:20,306 : INFO : updating model with new documents
2018-03-11 00:36:21,381 : INFO : preparing a new chunk of documents
2018-03-11 00:36:21,575 : INFO : using 100 extra samples and 2 power iterations
2018-03-11 00:36:21,576 : INFO : 1st phase: constructing (6681, 4300) action matrix
2018-03-11 00:36:29,888 : INFO : orthonormalizing (6681, 4300) action matrix
2018-03-11 00:37:55,358 : INFO : 2nd phase: running dense svd on (4300, 20000) matrix
2018-03-11 00:40:30,541 : INFO : computing the final decomposition
2018-03-11 00:40:30,542 : INFO : keeping 4200 factors (discarding 0.209% of energy spectrum)
2018-03-11 00:40:36,564 : INFO : processed documents up to #20000
2018-03-11 00:40:36,568 : INFO : topic #0(23.890): 0.157*"string" + 0.151*"model" + 0.141*"gaug" + 0.140*"field" + 0.139*"theori" + 0.129*"solut" + 0.114*"algebra" + 0.108*"quantum" + 0.103*"gener" + 0.102*"equat"
2018-03-11 00:40:36,569 : I

In [11]:
# Apply the trained LSI model to the TF-IDF corpus. The densifying loop later in the
# notebook reads each item as a sequence of (column index, value) pairs for one document.
vecs_tuple = lsi[corpus_tfidf]

In [12]:
# Densify the LSI output into a (documents x topics) array; entries that a document's
# sparse vector does not mention stay at 0.
# The width is read from the trained model rather than repeated as a literal, so it
# cannot drift from the `num_topics` value the model was built with.
num_topics = lsi.num_topics
docs_vec = np.zeros((len(vecs_tuple), num_topics))
for r, tuples in enumerate(vecs_tuple):
    for topic_id, value in tuples:
        docs_vec[r, topic_id] = value
    # Lightweight progress report every 2000 documents.
    if r % 2000 == 0:
        print('{} rows treated'.format(r))

0 rows treated
2000 rows treated
4000 rows treated
6000 rows treated
8000 rows treated
10000 rows treated
12000 rows treated
14000 rows treated
16000 rows treated
18000 rows treated
20000 rows treated
22000 rows treated
24000 rows treated
26000 rows treated


In [13]:
# Write the dense document matrix as space-separated text, one matrix row per line.
np.savetxt(fname='gensim/docs_vec.txt', X=docs_vec, delimiter=' ')

## Playing: exploratory SVD of the TF-IDF matrix

In [None]:
# Inspect the TF-IDF representation of the document at position 13456.
corpus_tfidf[13456]

In [None]:
from scipy.sparse import csr_matrix

# Flatten the TF-IDF corpus into coordinate triplets: (document row, term column, weight).
row_idx, col_idx, weights = [], [], []
for doc_no, doc in enumerate(corpus_tfidf):
    for term_id, weight in doc:
        row_idx.append(doc_no)
        col_idx.append(term_id)
        weights.append(weight)
row = np.array(row_idx)
col = np.array(col_idx)
data = np.array(weights)

In [None]:
# Assemble the sparse (documents x terms) matrix; the column count is the largest
# observed term id plus one.
n_docs = len(corpus_tfidf)
n_terms = col.max() + 1
tfidf_matrix = csr_matrix((data, (row, col)), shape=(n_docs, n_terms))

In [None]:
# Spot-check a single stored weight. The column index has to be a term id that exists
# in the matrix: the dictionary printed earlier in this notebook holds 6681 tokens, so
# a hard-coded column such as 9578 is out of range and raises IndexError.
doc_id = 13456
doc_weights = corpus_tfidf[doc_id]
# Use the document's first listed term; fall back to column 0 for an empty document.
term_id = doc_weights[0][0] if doc_weights else 0
tfidf_matrix[doc_id, term_id]

In [None]:
from scipy.sparse.linalg import svds

# Ask for min(shape) - 1 singular triplets of the TF-IDF matrix.
# NOTE(review): SciPy documents the third return value as `vh` (right singular vectors
# as rows), so `v` here is the transposed factor — confirm before using it.
n_singular = min(tfidf_matrix.shape) - 1
u, s, v = svds(tfidf_matrix, k=n_singular)

In [None]:
# Display the singular values returned by svds.
s

In [None]:
# Squared singular values.
s_square = np.square(s)
s_square

In [None]:
# Squared singular values arranged from largest to smallest.
descending_order = np.argsort(s_square)[::-1]
s_sort = np.take(s_square, descending_order)
s_sort

In [None]:
# Running share, in percent, of the total of the sorted squared singular values.
spectrum_total = sum(s_sort)
s_cum = np.cumsum(s_sort) / spectrum_total * 100
s_cum

In [None]:
# Cumulative percentage reached at 0-based position 4190 of the descending spectrum.
s_cum[4190]

In [None]:
# Total of the squared singular values.
np.sum(s_sort)

In [None]:
# Persist the raw, sorted and cumulative spectra as space-separated text files.
spectrum_dumps = (
    ('gensim/tmp/s.txt', s),
    ('gensim/tmp/s_sort.txt', s_sort),
    ('gensim/tmp/s_cum.txt', s_cum),
)
for out_path, values in spectrum_dumps:
    np.savetxt(out_path, values, delimiter=' ')

In [None]:
from scipy.sparse import save_npz

# Store the sparse TF-IDF matrix in SciPy's .npz format.
save_npz(file='gensim/tmp/tfidf_matrix.npz', matrix=tfidf_matrix)