In [1]:
import pandas as pd

data = pd.read_csv('title_abstract.csv', error_bad_lines=False);
data_text = data[['text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
len(documents)

3982

In [3]:
documents[:5]

Unnamed: 0,text,index
0,A strategy for managing content complexity in ...,0
1,Efficient passage ranking for document databas...,1
2,The aditi deductive database system:Deductive ...,2
3,Housekeeping for prefix coding:We consider the...,3
4,Memory efficient ranking:Fast and effective ra...,4


### Data Preprocessing

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2019)

In [5]:
import nltk
# nltk.download('wordnet')

#### Lemmatize example

In [6]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [7]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [55]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 200].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Using', 'emerging', 'patterns', 'and', 'decision', 'trees', 'in', 'rare-class', 'classification:The', 'problem', 'of', 'classifying', 'rarely', 'occurring', 'cases', 'is', 'faced', 'in', 'many', 'real', 'life', 'applications.', 'The', 'scarcity', 'of', 'the', 'rare', 'cases', 'makes', 'it', 'difficult', 'to', 'classify', 'them', 'correctly', 'using', 'traditional', 'classifiers.', 'In', 'this', 'paper,', 'we', 'propose', 'a', 'new', 'approach', 'to', 'use', 'emerging', 'patterns', '(EPs)', 'and', 'decision', 'trees', '(DTs)', 'in', 'rare-class', 'classification', '(EPDT).', 'EPs', 'are', 'those', 'itemsets', 'whose', 'supports', 'in', 'one', 'class', 'are', 'significantly', 'higher', 'than', 'their', 'supports', 'in', 'the', 'other', 'classes.', 'EPDT', 'employs', 'the', 'power', 'of', 'EPs', 'to', 'improve', 'the', 'quality', 'of', 'rare-case', 'classification.', 'To', 'achieve', 'this', 'aim,', 'we', 'first', 'introduce', 'the', 'idea', 'of', 'generating', 'new'

In [56]:
processed_docs = documents['text'].map(preprocess)

In [43]:
processed_docs[10:20]

10    [efficient, consumer, response, survey, austra...
11    [binary, interpolative, cod, effective, index,...
12    [empirical, evaluation, cod, methods, multi, s...
13    [fast, algorithm, meld, splay, tree, springer,...
14    [efficient, object, orient, program, prolog, d...
15    [optimal, dynamic, multi, attribute, hash, ran...
16    [determinism, functional, languages, introduct...
17    [efficient, computation, query, stratify, data...
18    [share, groundness, dependencies, logic, progr...
19    [linear, arboricity, linear, arboricity, regul...
Name: text, dtype: object

### Bag of words on the dataset

In [57]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [58]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 accompani
1 action
2 address
3 advantag
4 algorithm
5 allow
6 anim
7 avail
8 call
9 captur
10 complex
11 content
12 control
13 coordin
14 correspond
15 data
16 descript
17 detail
18 differ
19 dofferem
20 dynam


In [59]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [60]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 5),
 (5, 1),
 (6, 4),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 3),
 (11, 2),
 (12, 2),
 (13, 1),
 (14, 2),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 2),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 2),
 (30, 2),
 (31, 1),
 (32, 5),
 (33, 1),
 (34, 3),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 2),
 (50, 4),
 (51, 1),
 (52, 1),
 (53, 2),
 (54, 1),
 (55, 2),
 (56, 1),
 (57, 1),
 (58, 2)]

In [16]:
bow_doc_200 = bow_corpus[200]

for i in range(len(bow_doc_200)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_200[i][0], 
                                                     dictionary[bow_doc_200[i][0]], 
                                                     bow_doc_200[i][1]))

Word 52 ("support") appears 2 time.
Word 82 ("improv") appears 1 time.
Word 89 ("method") appears 1 time.
Word 125 ("power") appears 1 time.
Word 134 ("applic") appears 1 time.
Word 143 ("ieee") appears 1 time.
Word 153 ("problem") appears 1 time.
Word 154 ("propos") appears 1 time.
Word 169 ("experi") appears 1 time.
Word 179 ("make") appears 1 time.
Word 192 ("achiev") appears 1 time.
Word 225 ("generat") appears 1 time.
Word 227 ("idea") appears 1 time.
Word 244 ("signific") appears 1 time.
Word 313 ("difficult") appears 1 time.
Word 368 ("import") appears 1 time.
Word 370 ("introduc") appears 1 time.
Word 382 ("class") appears 6 time.
Word 407 ("emerg") appears 2 time.
Word 414 ("tree") appears 2 time.
Word 416 ("correct") appears 1 time.
Word 460 ("approach") appears 1 time.
Word 473 ("decis") appears 2 time.
Word 557 ("occur") appears 1 time.
Word 582 ("pattern") appears 2 time.
Word 624 ("case") appears 3 time.
Word 630 ("qualiti") appears 1 time.
Word 644 ("instanc") appears 2 

### TF-IDF

In [61]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [62]:
corpus_tfidf = tfidf[bow_corpus]

In [63]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.126376808266365),
 (1, 0.07787565055114672),
 (2, 0.0505504023030399),
 (3, 0.07096209915928124),
 (4, 0.1748251902363385),
 (5, 0.04688443567726373),
 (6, 0.4359943602628918),
 (7, 0.05340127524193944),
 (8, 0.061323191701547604),
 (9, 0.06634824688343606),
 (10, 0.15057628619104355),
 (11, 0.14691563847129344),
 (12, 0.11439273585975188),
 (13, 0.09021621346066433),
 (14, 0.16542145878952677),
 (15, 0.04763464920893705),
 (16, 0.09262477331378463),
 (17, 0.07998269523208633),
 (18, 0.032805413275128516),
 (19, 0.0522628878867586),
 (20, 0.09326819024973004),
 (21, 0.03370998865714734),
 (22, 0.11400664596594824),
 (23, 0.20564612977768015),
 (24, 0.23875104747687836),
 (25, 0.10655828035125481),
 (26, 0.10185786188827148),
 (27, 0.11566174420557522),
 (28, 0.08111274960634392),
 (29, 0.08318625106719792),
 (30, 0.1078647813944015),
 (31, 0.052939184698755956),
 (32, 0.2512147198997714),
 (33, 0.06634824688343606),
 (34, 0.1514966249528634),
 (35, 0.10233543592587072),
 (36, 0.

### Running LDA using Bag of Words

In [64]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [65]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.013*"data" + 0.012*"approach" + 0.010*"detect" + 0.010*"event" + 0.009*"base" + 0.009*"time" + 0.008*"program" + 0.008*"analysi" + 0.008*"propos" + 0.008*"applic"
Topic: 1 
Words: 0.010*"propos" + 0.009*"agent" + 0.009*"method" + 0.008*"document" + 0.008*"similar" + 0.008*"game" + 0.008*"term" + 0.007*"effect" + 0.007*"model" + 0.007*"algorithm"
Topic: 2 
Words: 0.022*"inform" + 0.015*"user" + 0.011*"data" + 0.009*"provid" + 0.009*"research" + 0.008*"evalu" + 0.008*"base" + 0.007*"network" + 0.007*"collect" + 0.007*"search"
Topic: 3 
Words: 0.019*"queri" + 0.018*"data" + 0.016*"process" + 0.014*"cloud" + 0.009*"propos" + 0.008*"time" + 0.008*"algorithm" + 0.008*"cost" + 0.008*"effici" + 0.007*"problem"
Topic: 4 
Words: 0.011*"data" + 0.011*"test" + 0.008*"base" + 0.008*"method" + 0.007*"analysi" + 0.007*"provid" + 0.007*"research" + 0.006*"cluster" + 0.006*"perform" + 0.006*"structur"
Topic: 5 
Words: 0.009*"servic" + 0.009*"evalu" + 0.009*"base" + 0.009*"time" + 0.0

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [66]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=15, id2word=dictionary, passes=2, workers=4)

In [67]:
for idx, topic in lda_model_tfidf.print_topics(-1,20):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"process" + 0.005*"model" + 0.004*"data" + 0.004*"servic" + 0.004*"graph" + 0.004*"cloud" + 0.004*"network" + 0.004*"queri" + 0.003*"secur" + 0.003*"problem" + 0.003*"research" + 0.003*"resourc" + 0.003*"base" + 0.003*"inform" + 0.003*"approach" + 0.003*"algorithm" + 0.003*"applic" + 0.003*"structur" + 0.003*"thing" + 0.003*"detect"
Topic: 1 Word: 0.007*"cloud" + 0.006*"detect" + 0.005*"energi" + 0.005*"data" + 0.004*"comput" + 0.004*"servic" + 0.004*"anomali" + 0.004*"resourc" + 0.003*"center" + 0.003*"network" + 0.003*"algorithm" + 0.003*"applic" + 0.003*"user" + 0.003*"method" + 0.003*"approach" + 0.003*"technolog" + 0.003*"research" + 0.003*"time" + 0.003*"queri" + 0.003*"attack"
Topic: 2 Word: 0.006*"agent" + 0.004*"social" + 0.004*"model" + 0.004*"process" + 0.003*"user" + 0.003*"test" + 0.003*"method" + 0.003*"data" + 0.003*"servic" + 0.003*"algorithm" + 0.003*"interact" + 0.003*"predict" + 0.003*"event" + 0.003*"visual" + 0.003*"studi" + 0.003*"detect" + 0.

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [69]:
i = 500

def docText(documents,index):
    text = documents[documents['index'] == index].values[0][0]
    return text
print(docText(documents,i))
for index, score in sorted(lda_model_tfidf[bow_corpus[i]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index,30)))


Supporting grid-based clinical trials in Scotland:A computational infrastructure to underpin complex clinical trials and medical population studies is highly desirable. This should allow access to a range of distributed clinical data sets; support the efficient processing and analysis of the data obtained; have security at its heart; and ensure that authorized individuals are able to see privileged data and no more. Each clinical trial has its own requirements on data sets and how they are used; hence a reusable and flexible framework offers many advantages. The MRC funded Virtual Organisations for Trials and Epidemiological Studies (VOTES) is a collaborative project involving several UK universities specifically to explore this space. This article presents the experiences of developing the Scottish component of this nationwide infrastructure, by the National e-Science Centre (NeSC) based at the University of Glasgow, and the issues inherent in accessing and using the clinical data set

In [66]:
print(bow_corpus[100])

[(15, 1), (42, 2), (48, 2), (67, 1), (84, 1), (96, 1), (103, 1), (143, 1), (160, 1), (193, 1), (199, 1), (204, 2), (262, 3), (312, 1), (317, 1), (348, 2), (349, 1), (414, 1), (435, 4), (436, 1), (456, 1), (483, 1), (545, 1), (678, 1), (864, 2), (918, 1), (939, 1), (940, 1), (968, 1), (970, 2), (1081, 1), (1210, 1), (1211, 1), (1212, 1), (1213, 1)]


In [67]:
for index, score in sorted(lda_model[bow_corpus[100]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4900451898574829	 
Topic: 0.020*"queri" + 0.019*"model" + 0.013*"method" + 0.010*"propos" + 0.010*"base" + 0.009*"network" + 0.009*"result" + 0.008*"data" + 0.007*"user" + 0.007*"time"

Score: 0.3876887261867523	 
Topic: 0.016*"applic" + 0.015*"resourc" + 0.011*"comput" + 0.011*"model" + 0.010*"cloud" + 0.009*"perform" + 0.009*"base" + 0.008*"process" + 0.008*"propos" + 0.008*"data"

Score: 0.10735906660556793	 
Topic: 0.019*"cloud" + 0.017*"servic" + 0.010*"resourc" + 0.009*"model" + 0.009*"user" + 0.009*"research" + 0.009*"provid" + 0.008*"comput" + 0.008*"data" + 0.008*"inform"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [24]:
for index, score in sorted(lda_model_tfidf[bow_corpus[3981]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.42658671736717224	 
Topic: 0.006*"cloud" + 0.005*"process" + 0.005*"data" + 0.004*"model" + 0.004*"document"

Score: 0.3463570177555084	 
Topic: 0.004*"data" + 0.003*"model" + 0.003*"network" + 0.003*"query" + 0.003*"process"

Score: 0.1227799579501152	 
Topic: 0.006*"cod" + 0.004*"cloud" + 0.004*"function" + 0.004*"offload" + 0.004*"outlier"

Score: 0.05606284365057945	 
Topic: 0.010*"cloud" + 0.005*"social" + 0.004*"data" + 0.004*"service" + 0.004*"resource"

Score: 0.039035771042108536	 
Topic: 0.005*"cluster" + 0.004*"cloud" + 0.004*"algorithm" + 0.004*"service" + 0.004*"performance"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [29]:
# unseen_document = "machine learning"
unseen_document = "The use of randomness in the designing of the digital devices has been discussed. Qualities of randomness such as unpredictability, indeterminacy and unexpectedness have been used as a creative resource to generate innovative , output. Randomness is a creative tool to inspire and generate innovative outputs that is a means to an end. The growth of digital interactivity has been accompanied by a increasing amount of interactive that express certain qualities of randomness during use. An emergent approach toward randomness is to allow users to interact directly with the randomness. Shuffle listening, which is an alternative listening mode offered by digital music players, is a more sophisticated approach, whereby application of randomness has publicly captured by imagination of many people. Considerations, in determining where a random feature can be used, should include the types of content, the domain and contexts where these digital devices are used"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.3733212351799011	 Topic: 0.006*"mobile" + 0.005*"data" + 0.004*"social" + 0.004*"privacy" + 0.003*"research"
Score: 0.26307183504104614	 Topic: 0.006*"cloud" + 0.005*"process" + 0.005*"data" + 0.004*"model" + 0.004*"document"
Score: 0.16387544572353363	 Topic: 0.006*"game" + 0.005*"data" + 0.004*"design" + 0.003*"network" + 0.003*"mutations"
Score: 0.11993960291147232	 Topic: 0.005*"cluster" + 0.004*"model" + 0.004*"data" + 0.004*"program" + 0.003*"network"
Score: 0.06462649255990982	 Topic: 0.004*"data" + 0.003*"model" + 0.003*"network" + 0.003*"query" + 0.003*"process"
