In [1]:
import pandas as pd

data = pd.read_csv('title_abstract.csv', error_bad_lines=False);
data_text = data[['text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
len(documents)

3982

In [3]:
documents[:5]

Unnamed: 0,text,index
0,A strategy for managing content complexity in ...,0
1,Efficient passage ranking for document databas...,1
2,The aditi deductive database system:Deductive ...,2
3,Housekeeping for prefix coding:We consider the...,3
4,Memory efficient ranking:Fast and effective ra...,4


### Data Preprocessing

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2019)

In [5]:
import nltk
# nltk.download('wordnet')

#### Lemmatize example

In [6]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [7]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [8]:
def lemmatize_stemming(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 200].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Using', 'emerging', 'patterns', 'and', 'decision', 'trees', 'in', 'rare-class', 'classification:The', 'problem', 'of', 'classifying', 'rarely', 'occurring', 'cases', 'is', 'faced', 'in', 'many', 'real', 'life', 'applications.', 'The', 'scarcity', 'of', 'the', 'rare', 'cases', 'makes', 'it', 'difficult', 'to', 'classify', 'them', 'correctly', 'using', 'traditional', 'classifiers.', 'In', 'this', 'paper,', 'we', 'propose', 'a', 'new', 'approach', 'to', 'use', 'emerging', 'patterns', '(EPs)', 'and', 'decision', 'trees', '(DTs)', 'in', 'rare-class', 'classification', '(EPDT).', 'EPs', 'are', 'those', 'itemsets', 'whose', 'supports', 'in', 'one', 'class', 'are', 'significantly', 'higher', 'than', 'their', 'supports', 'in', 'the', 'other', 'classes.', 'EPDT', 'employs', 'the', 'power', 'of', 'EPs', 'to', 'improve', 'the', 'quality', 'of', 'rare-case', 'classification.', 'To', 'achieve', 'this', 'aim,', 'we', 'first', 'introduce', 'the', 'idea', 'of', 'generating', 'new'

In [10]:
processed_docs = documents['text'].map(preprocess)

In [43]:
processed_docs[10:20]

10    [efficient, consumer, response, survey, austra...
11    [binary, interpolative, cod, effective, index,...
12    [empirical, evaluation, cod, methods, multi, s...
13    [fast, algorithm, meld, splay, tree, springer,...
14    [efficient, object, orient, program, prolog, d...
15    [optimal, dynamic, multi, attribute, hash, ran...
16    [determinism, functional, languages, introduct...
17    [efficient, computation, query, stratify, data...
18    [share, groundness, dependencies, logic, progr...
19    [linear, arboricity, linear, arboricity, regul...
Name: text, dtype: object

### Bag of words on the dataset

In [11]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [45]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 accompany
1 action
2 address
3 advantage
4 algorithm
5 algorithms
6 allow
7 animation
8 availability
9 call
10 capture
11 complexity
12 content
13 control
14 coordinate
15 correspond
16 correspondingly
17 data
18 description
19 detail
20 different


In [12]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 3),
 (5, 2),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 3),
 (11, 2),
 (12, 2),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 2),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 2),
 (30, 2),
 (31, 1),
 (32, 5),
 (33, 1),
 (34, 3),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 2),
 (49, 2),
 (50, 2),
 (51, 1),
 (52, 1),
 (53, 2),
 (54, 1),
 (55, 2),
 (56, 1),
 (57, 1),
 (58, 2)]

In [16]:
bow_doc_200 = bow_corpus[200]

for i in range(len(bow_doc_200)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_200[i][0], 
                                                     dictionary[bow_doc_200[i][0]], 
                                                     bow_doc_200[i][1]))

Word 52 ("support") appears 2 time.
Word 82 ("improv") appears 1 time.
Word 89 ("method") appears 1 time.
Word 125 ("power") appears 1 time.
Word 134 ("applic") appears 1 time.
Word 143 ("ieee") appears 1 time.
Word 153 ("problem") appears 1 time.
Word 154 ("propos") appears 1 time.
Word 169 ("experi") appears 1 time.
Word 179 ("make") appears 1 time.
Word 192 ("achiev") appears 1 time.
Word 225 ("generat") appears 1 time.
Word 227 ("idea") appears 1 time.
Word 244 ("signific") appears 1 time.
Word 313 ("difficult") appears 1 time.
Word 368 ("import") appears 1 time.
Word 370 ("introduc") appears 1 time.
Word 382 ("class") appears 6 time.
Word 407 ("emerg") appears 2 time.
Word 414 ("tree") appears 2 time.
Word 416 ("correct") appears 1 time.
Word 460 ("approach") appears 1 time.
Word 473 ("decis") appears 2 time.
Word 557 ("occur") appears 1 time.
Word 582 ("pattern") appears 2 time.
Word 624 ("case") appears 3 time.
Word 630 ("qualiti") appears 1 time.
Word 644 ("instanc") appears 2 

### TF-IDF

In [14]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [15]:
corpus_tfidf = tfidf[bow_corpus]

In [16]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.13902024201089283),
 (1, 0.08738507861257963),
 (2, 0.05560774368588772),
 (3, 0.0791816638102688),
 (4, 0.14352583658639353),
 (5, 0.10910353492379238),
 (6, 0.05162349826212706),
 (7, 0.0936195163175166),
 (8, 0.06745830242255361),
 (9, 0.07298609187290764),
 (10, 0.2379756189349876),
 (11, 0.16574354387081996),
 (12, 0.12768481932381914),
 (13, 0.10646707892303645),
 (14, 0.09491313051487732),
 (15, 0.05240028255162777),
 (16, 0.11365800280661265),
 (17, 0.0879846057230066),
 (18, 0.041019228725238055),
 (19, 0.06588179592755974),
 (20, 0.12541250036660845),
 (21, 0.06026441641299527),
 (22, 0.13131851031809763),
 (23, 0.22622010416689314),
 (24, 0.27512783682281844),
 (25, 0.117218959126369),
 (26, 0.1281947592721971),
 (27, 0.12723318377348622),
 (28, 0.08965594094958554),
 (29, 0.1071803839263706),
 (30, 0.11865616974829668),
 (31, 0.05823551306319199),
 (32, 0.2763476276719661),
 (33, 0.0791816638102688),
 (34, 0.24861531580622995),
 (35, 0.11597584787241458),
 (36, 0.068

### Running LDA using Bag of Words

In [27]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [19]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"data" + 0.012*"cloud" + 0.011*"security" + 0.009*"service" + 0.008*"base" + 0.007*"model" + 0.007*"time" + 0.007*"propose" + 0.007*"compute" + 0.006*"approach"
Topic: 1 
Words: 0.015*"data" + 0.008*"design" + 0.008*"model" + 0.008*"social" + 0.007*"study" + 0.007*"research" + 0.007*"network" + 0.006*"public" + 0.006*"provide" + 0.006*"information"
Topic: 2 
Words: 0.019*"model" + 0.015*"base" + 0.013*"approach" + 0.012*"process" + 0.011*"data" + 0.008*"user" + 0.007*"propose" + 0.007*"users" + 0.007*"information" + 0.007*"structure"
Topic: 3 
Words: 0.012*"network" + 0.010*"design" + 0.008*"base" + 0.008*"model" + 0.007*"result" + 0.007*"data" + 0.007*"propose" + 0.007*"cloud" + 0.007*"systems" + 0.006*"information"
Topic: 4 
Words: 0.019*"model" + 0.011*"process" + 0.010*"base" + 0.009*"data" + 0.008*"analysis" + 0.008*"information" + 0.008*"result" + 0.007*"study" + 0.007*"approach" + 0.006*"present"
Topic: 5 
Words: 0.013*"model" + 0.009*"result" + 0.008*"cod

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [33]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=15, id2word=dictionary, passes=2, workers=4)

In [41]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"vote" + 0.004*"peer" + 0.004*"document" + 0.003*"attack" + 0.003*"network" + 0.003*"query" + 0.003*"data" + 0.003*"security" + 0.003*"information" + 0.003*"design"
Topic: 1 Word: 0.006*"health" + 0.004*"data" + 0.003*"privacy" + 0.003*"process" + 0.003*"service" + 0.003*"model" + 0.003*"information" + 0.003*"sequence" + 0.003*"compression" + 0.003*"network"
Topic: 2 Word: 0.003*"sensor" + 0.003*"document" + 0.003*"data" + 0.003*"score" + 0.003*"tree" + 0.003*"network" + 0.003*"sort" + 0.003*"query" + 0.003*"sequence" + 0.003*"information"
Topic: 3 Word: 0.007*"cloud" + 0.004*"data" + 0.004*"service" + 0.004*"energy" + 0.003*"cluster" + 0.003*"model" + 0.003*"text" + 0.003*"compute" + 0.003*"query" + 0.003*"network"
Topic: 4 Word: 0.004*"model" + 0.004*"process" + 0.004*"data" + 0.004*"pattern" + 0.003*"document" + 0.003*"protein" + 0.003*"test" + 0.003*"mine" + 0.003*"methods" + 0.003*"agent"
Topic: 5 Word: 0.007*"query" + 0.004*"data" + 0.004*"cloud" + 0.003*"clu

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [39]:
i = 4

def docText(documents,index):
    text = documents[documents['index'] == index].values[0][0]
    return text
print(bow_corpus[i])
print(docText(documents,i))
for index, score in sorted(lda_model_tfidf[bow_corpus[i]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 20)))


[(2, 1), (6, 1), (21, 1), (42, 1), (48, 2), (62, 1), (63, 1), (64, 1), (72, 6), (73, 1), (74, 1), (86, 1), (90, 5), (93, 2), (94, 5), (96, 2), (99, 1), (104, 1), (110, 1), (129, 1), (166, 1), (167, 1), (168, 1), (169, 1), (170, 1), (171, 3), (172, 1), (173, 1), (174, 1), (175, 1), (176, 2), (177, 1), (178, 1), (179, 2), (180, 1), (181, 1), (182, 1), (183, 1), (184, 1), (185, 1), (186, 2), (187, 1), (188, 3), (189, 1), (190, 1), (191, 2), (192, 1), (193, 1), (194, 1), (195, 1), (196, 1), (197, 2), (198, 1), (199, 1), (200, 1), (201, 1), (202, 1)]
Memory efficient ranking:Fast and effective ranking of a collection of documents with respect to a query requires several structures, including a vocabulary, inverted file entries, arrays of term weights and document lengths, a set of partial similarity accumulators, and address tables for inverted file entries and documents. Of all of these structures, the array of document lengths and the set of accumulators are the components accessed most f

In [66]:
print(bow_corpus[100])

[(15, 1), (42, 2), (48, 2), (67, 1), (84, 1), (96, 1), (103, 1), (143, 1), (160, 1), (193, 1), (199, 1), (204, 2), (262, 3), (312, 1), (317, 1), (348, 2), (349, 1), (414, 1), (435, 4), (436, 1), (456, 1), (483, 1), (545, 1), (678, 1), (864, 2), (918, 1), (939, 1), (940, 1), (968, 1), (970, 2), (1081, 1), (1210, 1), (1211, 1), (1212, 1), (1213, 1)]


In [67]:
for index, score in sorted(lda_model[bow_corpus[100]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4900451898574829	 
Topic: 0.020*"queri" + 0.019*"model" + 0.013*"method" + 0.010*"propos" + 0.010*"base" + 0.009*"network" + 0.009*"result" + 0.008*"data" + 0.007*"user" + 0.007*"time"

Score: 0.3876887261867523	 
Topic: 0.016*"applic" + 0.015*"resourc" + 0.011*"comput" + 0.011*"model" + 0.010*"cloud" + 0.009*"perform" + 0.009*"base" + 0.008*"process" + 0.008*"propos" + 0.008*"data"

Score: 0.10735906660556793	 
Topic: 0.019*"cloud" + 0.017*"servic" + 0.010*"resourc" + 0.009*"model" + 0.009*"user" + 0.009*"research" + 0.009*"provid" + 0.008*"comput" + 0.008*"data" + 0.008*"inform"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [24]:
for index, score in sorted(lda_model_tfidf[bow_corpus[3981]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.42658671736717224	 
Topic: 0.006*"cloud" + 0.005*"process" + 0.005*"data" + 0.004*"model" + 0.004*"document"

Score: 0.3463570177555084	 
Topic: 0.004*"data" + 0.003*"model" + 0.003*"network" + 0.003*"query" + 0.003*"process"

Score: 0.1227799579501152	 
Topic: 0.006*"cod" + 0.004*"cloud" + 0.004*"function" + 0.004*"offload" + 0.004*"outlier"

Score: 0.05606284365057945	 
Topic: 0.010*"cloud" + 0.005*"social" + 0.004*"data" + 0.004*"service" + 0.004*"resource"

Score: 0.039035771042108536	 
Topic: 0.005*"cluster" + 0.004*"cloud" + 0.004*"algorithm" + 0.004*"service" + 0.004*"performance"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [29]:
# unseen_document = "machine learning"
unseen_document = "The use of randomness in the designing of the digital devices has been discussed. Qualities of randomness such as unpredictability, indeterminacy and unexpectedness have been used as a creative resource to generate innovative , output. Randomness is a creative tool to inspire and generate innovative outputs that is a means to an end. The growth of digital interactivity has been accompanied by a increasing amount of interactive that express certain qualities of randomness during use. An emergent approach toward randomness is to allow users to interact directly with the randomness. Shuffle listening, which is an alternative listening mode offered by digital music players, is a more sophisticated approach, whereby application of randomness has publicly captured by imagination of many people. Considerations, in determining where a random feature can be used, should include the types of content, the domain and contexts where these digital devices are used"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.3733212351799011	 Topic: 0.006*"mobile" + 0.005*"data" + 0.004*"social" + 0.004*"privacy" + 0.003*"research"
Score: 0.26307183504104614	 Topic: 0.006*"cloud" + 0.005*"process" + 0.005*"data" + 0.004*"model" + 0.004*"document"
Score: 0.16387544572353363	 Topic: 0.006*"game" + 0.005*"data" + 0.004*"design" + 0.003*"network" + 0.003*"mutations"
Score: 0.11993960291147232	 Topic: 0.005*"cluster" + 0.004*"model" + 0.004*"data" + 0.004*"program" + 0.003*"network"
Score: 0.06462649255990982	 Topic: 0.004*"data" + 0.003*"model" + 0.003*"network" + 0.003*"query" + 0.003*"process"
