In [1]:
import os
import logging
import pickle
import pandas as pd
import spacy
from spacy.vectors import Vectors
from spacy.strings import StringStore
from nltk.corpus import stopwords

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

%matplotlib inline

In [2]:
# Show the kernel's working directory; the relative 'serialized_data/...' paths
# used in the following cells resolve against it.
os.getcwd()

'/home/ryohayama/Python/b13logy/nlp'

In [3]:
# Serialized corpus in Matrix Market format (presumably bag-of-words counts,
# going by the file name -- confirm against the notebook that wrote it).
corpus = corpora.MmCorpus('serialized_data/nlp09_bow_corpus.mm')
# gensim Dictionary saved alongside the corpus; used below as `id2word` so the
# topic printouts show tokens instead of integer ids.
dictionary = corpora.Dictionary.load('serialized_data/nlp09_lemmatized_corpus.dict')

In [4]:
# Sanity check: summary of the loaded corpus (document, feature and non-zero counts).
print(corpus)

MmCorpus(13434 documents, 27077 features, 1270334 non-zero entries)


In [5]:
# Fit a tf-idf weighting model on the loaded corpus.
tfidf_model = models.TfidfModel(corpus=corpus)

In [6]:
# Sanity check: the model summary should report the same document and non-zero
# counts as the corpus it was fitted on.
print(tfidf_model)

TfidfModel(num_docs=13434, num_nnz=1270334)


In [7]:
# Transform corpus into tf-idf space.
# Indexing a gensim model with a whole corpus returns a wrapper around it; the
# tf-idf vectors are produced as that wrapper is iterated, not all up front.
corpus_tfidf = tfidf_model[corpus]

In [8]:
# The transformed corpus should still hold one entry per input document.
len(corpus_tfidf)

13434

In [9]:
# Number of latent topics; shared by the LSI and LDA models trained below.
num_topics = 7

In [10]:
%%time
# Fit a Latent Semantic Indexing (LSI / LSA) model on the tf-idf weighted corpus.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=num_topics,
)

CPU times: user 9.22 s, sys: 3.77 s, total: 13 s
Wall time: 4.44 s


In [11]:
# Show the first five LSI topics (ids 0-4) with their signed term weights.
lsi.print_topics(num_topics=5)

[(0,
  '0.166*"ms" + 0.156*"mm" + 0.154*"min" + 0.143*"search" + 0.134*"peptide" + 0.120*"set" + 0.110*"ion" + 0.108*"sample" + 0.107*"modification" + 0.105*"database"'),
 (1,
  '-0.223*"search" + 0.199*"mm" + 0.166*"min" + -0.162*"database" + -0.159*"modification" + -0.135*"tolerance" + -0.128*"maxquant" + -0.126*"set" + -0.119*"mascot" + -0.119*"version"'),
 (2,
  '0.224*"cell" + 0.124*"-pron-" + 0.120*"study" + 0.118*"proteomic" + 0.108*"identify" + 0.099*"cancer" + 0.096*"change" + -0.096*"mascot" + 0.095*"expression" + 0.094*"protein"'),
 (3,
  '-0.250*"maxquant" + -0.245*"mm" + 0.200*"ms" + -0.144*"°" + 0.139*"mascot" + 0.133*"ion" + 0.124*"scan" + -0.118*"c" + -0.117*"andromeda" + 0.115*"orbitrap"'),
 (4,
  '-0.364*"maxquant" + 0.316*"mascot" + -0.158*"andromeda" + 0.144*"discoverer" + -0.137*"lfq" + 0.133*"mm" + 0.130*"science" + -0.127*"quantification" + 0.125*"matrix" + -0.115*"perseus"')]

In [12]:
# Project every tf-idf document into the LSI topic space.
corpus_lsi = lsi[corpus_tfidf]

In [13]:
# The LSI-space corpus should still hold one entry per input document.
len(corpus_lsi)

13434

In [14]:
# Peek at the first three documents as lists of (topic_id, weight) pairs.
for lsi_vector in corpus_lsi[:3]:
    print(lsi_vector)

[(0, 0.2863634515116126), (1, 0.24171032600217704), (2, -0.030296234728777783), (3, -0.09261507952954949), (4, 0.06626770528972953), (5, 0.008721577056012847), (6, 0.0014781850908041408)]
[(0, 0.0730448113608419), (1, 0.02054618707884547), (2, 0.08762624888768519), (3, 0.04950189318669456), (4, 0.016338666018110827), (5, 0.09400272918582958), (6, -0.031015548784552045)]
[(0, 0.28498212278810114), (1, 0.23202019086935002), (2, -0.04583892369276718), (3, 0.042124672463871006), (4, -0.01181207677150549), (5, 0.12442584635941627), (6, -0.038407258929755085)]


In [15]:
# Persist the fitted LSI model to disk.
# NOTE(review): unlike the LDA model saved later in this notebook, this file
# name does not encode num_topics, so re-running with a different num_topics
# overwrites the previous model.
lsi.save('serialized_data/nlp10_tfidf.lsi')

In [16]:
%%time
# Fit a Latent Dirichlet Allocation (LDA) topic model on the tf-idf corpus.
# LDA training is stochastic, so the seed is fixed to keep the learned topics
# (and every output derived from them) reproducible across kernel restarts.
# NOTE(review): LDA is normally fitted on raw bag-of-words counts rather than
# tf-idf weights; confirm that training on corpus_tfidf (not corpus) is intended.
lda = models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
)

CPU times: user 7.58 s, sys: 57.6 ms, total: 7.64 s
Wall time: 7.65 s


In [24]:
# Show every LDA topic. The count is read from the fitted model instead of a
# hard-coded 7, so this cell stays in sync when num_topics is changed.
lda.print_topics(num_topics=lda.num_topics)

[(0,
  '0.003*"cho" + 0.001*"ir" + 0.001*"podocyte" + 0.001*"gp120" + 0.001*"ppi" + 0.001*"synaptic" + 0.001*"giardia" + 0.001*"gpi" + 0.001*"akt1" + 0.001*"il-33"'),
 (1,
  '0.004*"cell" + 0.003*"stress" + 0.003*"host" + 0.003*"response" + 0.003*"plant" + 0.003*"reveal" + 0.002*"resistance" + 0.002*"complex" + 0.002*"change" + 0.002*"mutant"'),
 (2,
  '0.003*"novel" + 0.003*"assembly" + 0.002*"metabolism" + 0.002*"reveal" + 0.002*"inflammatory" + 0.002*"transport" + 0.002*"mitochondrial" + 0.002*"gene" + 0.002*"disease" + 0.002*"methylation"'),
 (3,
  '0.002*"cell" + 0.002*"glycosylation" + 0.002*"redox" + 0.002*"glycopeptide" + 0.002*"glycan" + 0.002*"understand" + 0.002*"proteomic" + 0.002*"temporal" + 0.002*"tissue" + 0.002*"ovarian"'),
 (4,
  '0.004*"search" + 0.003*"protein" + 0.003*"identify" + 0.003*"modification" + 0.003*"-pron-" + 0.003*"database" + 0.003*"cell" + 0.003*"ms" + 0.003*"proteome" + 0.003*"peptide"'),
 (5,
  '0.003*"cll" + 0.003*"platelet" + 0.002*"hsp90" + 0.002

In [18]:
# Infer the LDA topic distribution of every tf-idf document.
corpus_lda = lda[corpus_tfidf]

In [19]:
# The LDA-space corpus should still hold one entry per input document.
len(corpus_lda)

13434

In [20]:
# Peek at the first three documents as lists of (topic_id, probability) pairs.
for lda_vector in corpus_lda[:3]:
    print(lda_vector)

[(0, 0.0122563355), (1, 0.012730888), (2, 0.012258478), (3, 0.012326433), (4, 0.5642195), (5, 0.37391886), (6, 0.012289526)]
[(0, 0.025604108), (1, 0.025692), (2, 0.4645122), (3, 0.40713558), (4, 0.025745993), (5, 0.025656058), (6, 0.025654022)]
[(0, 0.016169365), (1, 0.016245287), (2, 0.016243411), (3, 0.016236937), (4, 0.016260434), (5, 0.9026532), (6, 0.016191347)]


In [21]:
# Persist the fitted LDA model; the topic count is part of the file name so
# runs with different num_topics do not overwrite each other.
lda_model_path = 'serialized_data/nlp10_tfidf_%d.lda' % num_topics
lda.save(lda_model_path)

In [22]:
# pyLDAvis provides the interactive topic-model visualisation used in the next
# cell; enable_notebook() turns on inline rendering in the notebook.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [23]:
# Build the pyLDAvis view of the LDA model from the first 1000 tf-idf documents
# only (a slice of the corpus, not the full 13k-document set).
pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=corpus_tfidf[:1000],
    dictionary=dictionary,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
