In [1]:
import logging
import numpy as np
import pandas as pd
from collections import defaultdict
import gensim
from gensim.parsing.preprocessing import STOPWORDS

# Show INFO-level log records, each stamped with time and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# The CSV is read with header=None, so the six columns are named explicitly after loading.
node_info_columns = ['id', 'year', 'title', 'authors', 'journal', 'abstract']
node_info = pd.read_csv('data/node_information.csv', header=None)
node_info.columns = node_info_columns
node_info.head()

Unnamed: 0,id,year,title,authors,journal,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


In [3]:
def tokenizer(line):
    """Split whitespace-delimited text into content tokens.

    Tokens of length 1 or less and tokens contained in ``STOPWORDS`` are
    dropped. No lowercasing or punctuation stripping is performed here.

    Parameters
    ----------
    line : str
        Text to tokenize. ``None`` and float values (pandas represents an
        empty CSV field as the float NaN) are treated as empty text instead
        of raising ``AttributeError``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Missing values have no .split(); map them to an empty document so the
    # caller keeps one token list per input row.
    if line is None or isinstance(line, float):
        return []
    return [token for token in line.split() if len(token) > 1 and token not in STOPWORDS]

In [5]:
# Tokenize every abstract, one token list per row of node_info, in row order.
# Iterating the column directly avoids a label-based .loc lookup for each row.
texts = [tokenizer(abstract) for abstract in node_info['abstract']]

In [6]:
# Count how often each token occurs across the whole collection of documents.
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

# Keep only the tokens that occur more than twice overall.
texts = [
    [tok for tok in doc if frequency[tok] > 2]
    for doc in texts
]

In [7]:
# Build the token-to-id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(documents=texts)
# Prune the vocabulary with the no_below / no_above thresholds, then persist it.
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.save('gensim/word2id.dict')

In [8]:
# Convert each token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

In [9]:
# Print a summary of the pruned vocabulary (number of unique tokens plus a few samples).
print(dictionary)

Dictionary(10009 unique tokens: ['linear', 'rigidly', 'suggests', 'proportional', 'nlie']...)


In [10]:
# Fit TF-IDF weights on the bag-of-words corpus and keep the fitted model on disk.
tfidf = gensim.models.TfidfModel(corpus=corpus, id2word=dictionary)
tfidf.save('gensim/model.tfidf')

# Re-weight the corpus and store the result in Matrix Market format.
corpus_tfidf = tfidf[corpus]
gensim.corpora.MmCorpus.serialize(
    'gensim/corpus_tfidf.mm',
    corpus_tfidf,
)

In [11]:
# Train a 4200-topic LSI model on the TF-IDF corpus and persist it.
lsi = gensim.models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=4200,
    id2word=dictionary,
)
lsi.save('gensim/model_4200.lsi')

In [12]:
# Project every TF-IDF document into the LSI topic space.
# NOTE(review): gensim normally returns a lazily evaluated wrapper for model[corpus],
# so the projection would be recomputed on every iteration — confirm before reusing it.
vecs_tuple = lsi[corpus_tfidf]

(27770, 4200)

In [27]:
# Take the matrix width from the trained model instead of repeating the literal,
# so it cannot drift from the value `lsi` was actually built with.
num_topics = lsi.num_topics

# Dense (documents x topics) matrix; a topic absent from a document's sparse
# vector simply keeps its initial value of 0.
docs_vec = np.zeros((len(vecs_tuple), num_topics))
for r, tuples in enumerate(vecs_tuple):
    for topic_id, weight in tuples:
        docs_vec[r, topic_id] = weight
    # Lightweight progress report every 2000 documents.
    if r % 2000 == 0:
        print('{} rows treated'.format(r))

0 rows treated
2000 rows treated
4000 rows treated
6000 rows treated
8000 rows treated
10000 rows treated
12000 rows treated
14000 rows treated
16000 rows treated
18000 rows treated
20000 rows treated
22000 rows treated
24000 rows treated
26000 rows treated


In [32]:
# Dump the dense document-topic matrix as space-delimited text, one document per line.
np.savetxt('gensim/docs_vec.txt', docs_vec, delimiter=' ')

## Playing around: sparse TF-IDF matrix and its singular-value spectrum

In [44]:
# Inspect the sparse TF-IDF vector of one arbitrarily chosen document,
# shown as (term id, weight) pairs.
corpus_tfidf[13456]

[(8, 0.060659075903626285),
 (12, 0.17703207586624298),
 (23, 0.04827692459504797),
 (32, 0.02782021526642745),
 (33, 0.25496337310366146),
 (36, 0.09968650576031687),
 (66, 0.05106095320939458),
 (146, 0.06178743760211393),
 (148, 0.029563155984323176),
 (151, 0.06653842690455128),
 (182, 0.054663498054397984),
 (186, 0.04545119925514492),
 (203, 0.03717496795443163),
 (210, 0.13238297108258432),
 (230, 0.03364115374497458),
 (263, 0.02171137615292644),
 (330, 0.032150427140223946),
 (336, 0.042938448511736546),
 (403, 0.050843468606784105),
 (406, 0.04781133813894132),
 (411, 0.04677091482282372),
 (414, 0.13242485430119386),
 (448, 0.05216519949618291),
 (456, 0.047766624430755224),
 (535, 0.08214432506684421),
 (541, 0.04475070193644054),
 (553, 0.06058493441261363),
 (556, 0.06181401965186967),
 (557, 0.0514207238272),
 (573, 0.05818336483019008),
 (614, 0.047777791275269155),
 (634, 0.0627166726475041),
 (638, 0.05212130483251284),
 (653, 0.05753752194806081),
 (685, 0.0576811198

In [39]:
from scipy.sparse import csr_matrix
# Collect (document index, term id, weight) triplets for every non-zero TF-IDF entry.
row, col, data = [], [], []
for cnt, doc in enumerate(corpus_tfidf):
    for term_id, weight in doc:
        row.append(cnt)
        col.append(term_id)
        data.append(weight)

# Turn the three Python lists into numpy arrays for the sparse-matrix constructor.
row, col, data = np.array(row), np.array(col), np.array(data)

In [42]:
# One row per document; the column count is the highest term id seen plus one.
n_docs = len(corpus_tfidf)
n_terms = col.max() + 1
tfidf_matrix = csr_matrix((data, (row, col)), shape=(n_docs, n_terms))

In [45]:
# Spot-check a single entry of the sparse matrix (row = document 13456, column = term id 9578).
tfidf_matrix[13456, 9578]

0.1299592223490542

In [49]:
from scipy.sparse.linalg import svds
# Request as many singular triplets as this call allows: one fewer than the
# smaller matrix dimension. svds returns (U, s, Vh); the third value is bound to `v`.
max_rank = min(tfidf_matrix.shape) - 1
u, s, v = svds(tfidf_matrix, k=max_rank)

In [50]:
# Raw singular values exactly as returned by svds (not re-sorted here).
s

array([0.06334587, 0.09883528, 0.10469659, ..., 0.        , 0.        ,
       0.        ])

In [84]:
# Squared singular values.
s_square = np.square(s)
s_square

array([0.0040127 , 0.00976841, 0.01096138, ..., 0.        , 0.        ,
       0.        ])

In [85]:
# Squared singular values ordered from largest to smallest.
s_sort = np.sort(s_square)[::-1]
s_sort

array([551.79804102, 146.29609222, 110.43499595, ...,   0.        ,
         0.        ,   0.        ])

In [87]:
# Running share (in percent) of the total carried by the k largest squared singular values.
total_squared = sum(s_sort)
running_squared = s_sort.cumsum()
s_cum = (running_squared / total_squared) * 100
s_cum

array([  1.98702932,   2.51384276,   2.91152009, ..., 100.        ,
       100.        , 100.        ])

In [115]:
# Cumulative percentage reached by the 4191 largest squared singular values (index 4190).
s_cum[4190]

90.0118202597853

In [83]:
# Total of the squared singular values, i.e. the quantity s_cum is normalised by.
s_sort.sum()

12819.48841024983

In [103]:
# Write the raw, sorted-squared and cumulative spectra to space-delimited text files.
spectrum_dumps = [
    ('gensim/tmp/s.txt', s),
    ('gensim/tmp/s_sort.txt', s_sort),
    ('gensim/tmp/s_cum.txt', s_cum),
]
for dump_path, dump_values in spectrum_dumps:
    np.savetxt(dump_path, dump_values, delimiter=' ')

In [123]:
from scipy.sparse import save_npz
# Persist the sparse TF-IDF matrix in SciPy's .npz format.
save_npz('gensim/tmp/tfidf_matrix.npz', tfidf_matrix)