In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [16]:
data = pd.read_csv("data.tsv", delimiter= "\t")
title = list(data["title"])

In [18]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk import PorterStemmer
import numpy as np
np.random.seed(2018)
import nltk

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [20]:
def lemmatize_stemming(text):
    return PorterStemmer().stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [21]:
sample = title[0]
print('original document: ')
words = []
for word in sample.split():
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(sample))

original document: 
['Novel', 'Hybrid', 'Propulsion', 'System', 'for', 'Sample', 'Return', 'Missions,', 'Phase', 'I']


 tokenized and lemmatized document: 
['novel', 'hybrid', 'propuls', 'sampl', 'return', 'mission', 'phase']


In [22]:
processed_docs = data['title'].map(preprocess)
processed_docs[:10]

0    [novel, hybrid, propuls, sampl, return, missio...
1      [phoenix, mar, meteorolog, pressur, temperatur]
2              [flash, lidaral, weathr, safeti, phase]
3    [level, almucantar, invers, product, phase, fu...
4    [collabor, outbound, taxi, meter, environment,...
5        [method, produc, launch, land, pad, structur]
6    [high, perform, miniatur, bandpass, filter, ph...
7                [comet, surfac, sampl, return, phase]
8    [regener, sorbent, combin, water, trace, conta...
9     [quantum, calorimet, base, hgcdte, alloy, phase]
Name: title, dtype: object

In [23]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 hybrid
1 mission
2 novel
3 phase
4 propuls
5 return
6 sampl
7 mar
8 meteorolog
9 phoenix
10 pressur


In [24]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [26]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]

In [30]:
bow_doc_4310 = bow_corpus[100]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

A Hardware/Software Design Environment for Reconfigurable Communication Systems, Phase I
Word 3 ("phase") appears 1 time.
Word 173 ("softwar") appears 1 time.
Word 195 ("reconfigur") appears 1 time.
Word 203 ("commun") appears 1 time.
Word 227 ("system") appears 1 time.
Word 257 ("environ") appears 1 time.
Word 306 ("design") appears 1 time.
Word 333 ("hardwar") appears 1 time.


In [31]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.40808197922233364),
 (1, 0.3539634277828854),
 (2, 0.3735843786208337),
 (3, 0.08770722615074826),
 (4, 0.36307211939500383),
 (5, 0.5102451453796853),
 (6, 0.41079151925530766)]


In [32]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [33]:
# using lda directly with word corpus
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.073*"phase" + 0.028*"modi" + 0.027*"time" + 0.018*"precipit" + 0.017*"real" + 0.015*"radiomet" + 0.014*"analysi" + 0.013*"base" + 0.010*"cloud" + 0.010*"data"
Topic: 1 
Words: 0.050*"phase" + 0.034*"level" + 0.028*"imag" + 0.020*"planetari" + 0.015*"gazett" + 0.015*"nomenclatur" + 0.013*"product" + 0.013*"measur" + 0.011*"radianc" + 0.011*"data"
Topic: 2 
Words: 0.078*"data" + 0.052*"orbit" + 0.036*"disc" + 0.031*"rosetta" + 0.025*"degre" + 0.021*"aura" + 0.020*"profil" + 0.019*"observ" + 0.017*"mean" + 0.017*"imag"
Topic: 3 
Words: 0.066*"phase" + 0.019*"space" + 0.018*"high" + 0.016*"technolog" + 0.013*"life" + 0.011*"laser" + 0.011*"develop" + 0.010*"encount" + 0.010*"power" + 0.009*"explor"
Topic: 4 
Words: 0.027*"aqua" + 0.024*"disc" + 0.023*"phase" + 0.021*"model" + 0.021*"degre" + 0.020*"grid" + 0.019*"global" + 0.019*"temperatur" + 0.018*"radiat" + 0.017*"daili"
Topic: 5 
Words: 0.110*"phase" + 0.027*"sensor" + 0.015*"optic" + 0.015*"lunar" + 0.014*"measur" +

In [34]:
# using lda with tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.026*"phase" + 0.013*"high" + 0.009*"power" + 0.009*"control" + 0.008*"lightweight" + 0.008*"manag" + 0.007*"solar" + 0.007*"base" + 0.007*"advanc" + 0.007*"batteri"
Topic: 1 Word: 0.017*"data" + 0.016*"planetari" + 0.012*"nomenclatur" + 0.012*"gazett" + 0.012*"phase" + 0.009*"explor" + 0.008*"mar" + 0.008*"catalog" + 0.008*"dictionari" + 0.007*"releas"
Topic: 2 Word: 0.015*"phase" + 0.013*"high" + 0.013*"surfac" + 0.012*"modi" + 0.011*"global" + 0.010*"grid" + 0.009*"daili" + 0.009*"aqua" + 0.009*"soil" + 0.009*"terra"
Topic: 3 Word: 0.095*"podaac" + 0.019*"phase" + 0.013*"model" + 0.012*"imag" + 0.012*"nasa" + 0.007*"monitor" + 0.007*"space" + 0.007*"manufactur" + 0.006*"data" + 0.005*"tempel"
Topic: 4 Word: 0.018*"degre" + 0.015*"disc" + 0.014*"data" + 0.010*"phase" + 0.009*"grind" + 0.009*"time" + 0.008*"level" + 0.008*"mean" + 0.008*"valid" + 0.008*"scienc"
Topic: 5 Word: 0.016*"data" + 0.014*"phase" + 0.011*"aura" + 0.010*"level" + 0.008*"borea" + 0.007*"calipso" 

In [39]:
print(title[10])
for index, score in sorted(lda_model[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

Behavioral Monitoring and Evaluation for the Delivery of Interactive Cognitive Behavioral Therapy (B-MEDIC), Phase I

Score: 0.5008535385131836	 
Topic: 0.050*"phase" + 0.034*"level" + 0.028*"imag" + 0.020*"planetari" + 0.015*"gazett" + 0.015*"nomenclatur" + 0.013*"product" + 0.013*"measur" + 0.011*"radianc" + 0.011*"data"

Score: 0.41910529136657715	 
Topic: 0.066*"phase" + 0.019*"space" + 0.018*"high" + 0.016*"technolog" + 0.013*"life" + 0.011*"laser" + 0.011*"develop" + 0.010*"encount" + 0.010*"power" + 0.009*"explor"

Score: 0.010009195655584335	 
Topic: 0.110*"phase" + 0.027*"sensor" + 0.015*"optic" + 0.015*"lunar" + 0.014*"measur" + 0.011*"environ" + 0.009*"data" + 0.008*"magnet" + 0.008*"space" + 0.008*"carbon"

Score: 0.010006414726376534	 
Topic: 0.052*"podaac" + 0.036*"model" + 0.030*"grind" + 0.027*"version" + 0.026*"data" + 0.022*"valid" + 0.022*"nasa" + 0.018*"satellit" + 0.017*"test" + 0.016*"turbul"

Score: 0.010005177929997444	 
Topic: 0.119*"phase" + 0.039*"high" + 0.0

In [38]:
print(title[0])
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

Novel Hybrid Propulsion System for Sample Return Missions, Phase I

Score: 0.8874763250350952	 
Topic: 0.073*"phase" + 0.028*"modi" + 0.027*"time" + 0.018*"precipit" + 0.017*"real" + 0.015*"radiomet" + 0.014*"analysi" + 0.013*"base" + 0.010*"cloud" + 0.010*"data"

Score: 0.0125048803165555	 
Topic: 0.069*"phase" + 0.018*"array" + 0.016*"imag" + 0.015*"infrar" + 0.014*"flight" + 0.013*"deriv" + 0.013*"data" + 0.009*"model" + 0.009*"robot" + 0.009*"near"

Score: 0.012503189034759998	 
Topic: 0.050*"phase" + 0.034*"level" + 0.028*"imag" + 0.020*"planetari" + 0.015*"gazett" + 0.015*"nomenclatur" + 0.013*"product" + 0.013*"measur" + 0.011*"radianc" + 0.011*"data"

Score: 0.012502888217568398	 
Topic: 0.078*"data" + 0.052*"orbit" + 0.036*"disc" + 0.031*"rosetta" + 0.025*"degre" + 0.021*"aura" + 0.020*"profil" + 0.019*"observ" + 0.017*"mean" + 0.017*"imag"

Score: 0.01250247098505497	 
Topic: 0.044*"data" + 0.037*"global" + 0.026*"surfac" + 0.023*"project" + 0.021*"version" + 0.020*"terra" + 