In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so later steps that draw from it start from a fixed state.
np.random.seed(2018)
import nltk
# Fetch the WordNet corpus; WordNetLemmatizer is used by lemmatize_stemming() below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the paragraph-level text data, decoding the file as ISO-8859-1.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which looks like a saved index;
# consider index_col=0 if nothing downstream relies on that column — confirm before changing.
text_data_sentence = pd.read_csv('textdatanew.csv', encoding='ISO-8859-1')

In [4]:
# Preview the first rows to check which columns were loaded.
text_data_sentence.head()

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


In [5]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the resulting lemma.

    Relies on a module-level ``stemmer`` object that must be defined before
    this function is first called.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed form of each kept token.

    A token is kept only if it is not in gensim's STOPWORDS set and is longer
    than three characters.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]

In [6]:
# Use the first paragraph as a small sample for inspecting the preprocessing.
doc_sample = text_data_sentence['ParagraphText'].iloc[0]

In [7]:
# Stemmer looked up by lemmatize_stemming(); it has to exist before preprocess() is called.
stemmer = SnowballStemmer('english')

print('original document: ')
# str.split already returns a list, so no element-by-element copy is needed.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['A', 'FRENCH', 'philosopher,', 'moralizing', 'on', 'the', 'great', 'influence', 'of', 'little', 'matters,', 'remarked', 'that', 'a', 'fraction', 'of', 'an', 'inch', 'more', 'on', 'the', 'end', 'of', "Cleopatra's", 'nose', 'would', 'have', 'changed', 'the', 'history', 'of', 'Rome', 'and', 'Egypt.', 'As', 'it', 'was,', 'her', 'unblemished', 'beauty,', 'her', 'wit,', 'and', 'her', 'audacity', 'disarmed', 'two', 'of', 'the', 'greatest', 'generals', 'Rome', 'ever', 'sent', 'into', 'Egypt.', 'Not', 'until', 'a', 'third', 'remained', 'oblivious', 'to', 'the', 'charms', 'she', 'temptingly', 'displayed', 'to', 'him', 'did', 'she', 'abandon', 'her', 'effort', 'to', 'rule', 'the', 'world', 'by', 'beauty,', 'and', 'seek', 'refuge', 'in', 'self-inflicted', 'death.']


 tokenized and lemmatized document: 
['french', 'philosoph', 'moral', 'great', 'influenc', 'littl', 'matter', 'remark', 'fraction', 'inch', 'cleopatra', 'nose', 'chang', 'histori', 'rome', 'egypt', 'unblemish', 'b

In [8]:
# Quick check of the stemmer on a single inflected word.
stemmer.stem('running')

'run'

In [9]:
# Number of rows in the loaded frame (one row per paragraph).
len(text_data_sentence)

16637

In [10]:
# Run the preprocessing pipeline over every paragraph: one token list per row.
processed_text = (
    text_data_sentence['ParagraphText']
    .map(preprocess)
)
processed_text.head(10)

0    [french, philosoph, moral, great, influenc, li...
1    [cleopatra, joint, heir, throne, egypt, younge...
2    [cleopatra, respond, brilliant, retinu, send, ...
3    [caesar, fifti, year, life, soldier, fight, ma...
4    [year, cleopatra, reign, littl, troubl, egypt,...
5    [antoni, amor, dallianc, keep, caesar, alexand...
6    [anecdot, characterist, stori, reach, rome, pe...
7    [cleopatra, take, refug, massiv, mausoleum, bu...
8    [octavius, cold, ambiti, desir, save, cleopatr...
9    [stew, great, copper, bathtub, shape, like, wo...
Name: ParagraphText, dtype: object

In [11]:
# Build the token/id mapping from the tokenized paragraphs.
dictionary = gensim.corpora.Dictionary(processed_text)

# Print the first 11 (id, token) pairs as a quick look at the vocabulary.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 abandon
1 audac
2 beauti
3 chang
4 charm
5 cleopatra
6 death
7 disarm
8 display
9 effort
10 egypt


In [12]:
# Prune the vocabulary in place. Per gensim's documented semantics for these arguments:
#   no_below=15   -> drop tokens that occur in fewer than 15 documents
#   no_above=0.5  -> drop tokens that occur in more than 50% of the documents
#   keep_n=100000 -> afterwards keep at most the 100000 most frequent tokens
# NOTE(review): gensim reassigns token ids after pruning, so ids printed before this
# cell no longer match the dictionary — confirm against the gensim version in use.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
# Convert every tokenized paragraph into its bag-of-words form: a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_text))
bow_corpus[4310]

[(33, 1),
 (183, 1),
 (555, 1),
 (663, 1),
 (671, 1),
 (906, 1),
 (933, 1),
 (940, 1),
 (1181, 1),
 (1348, 1),
 (1527, 1),
 (1822, 1),
 (3397, 1),
 (4065, 1),
 (4852, 1)]

In [14]:
# Show each (token_id, count) pair of paragraph 4310 next to the token text.
bow_doc_4310 = bow_corpus[4310]
for token_id, token_count in bow_doc_4310:
    # Pluralize "time" so counts other than 1 read correctly ("2 times", not "2 time").
    print("Word {} (\"{}\") appears {} time{}.".format(
        token_id,
        dictionary[token_id],
        token_count,
        "" if token_count == 1 else "s",
    ))

Word 33 ("world") appears 1 time.
Word 183 ("soldier") appears 1 time.
Word 555 ("establish") appears 1 time.
Word 663 ("england") appears 1 time.
Word 671 ("town") appears 1 time.
Word 906 ("call") appears 1 time.
Word 933 ("go") appears 1 time.
Word 940 ("mission") appears 1 time.
Word 1181 ("branch") appears 1 time.
Word 1348 ("plant") appears 1 time.
Word 1527 ("part") appears 1 time.
Word 1822 ("station") appears 1 time.
Word 3397 ("foreign") appears 1 time.
Word 4065 ("convert") appears 1 time.
Word 4852 ("aldershot") appears 1 time.


In [15]:
from pprint import pprint
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first re-weighted document.
for first_doc in corpus_tfidf:
    pprint(first_doc)
    break

[(0, 0.17982145990951667),
 (1, 0.22678241597781135),
 (2, 0.18208101011145128),
 (3, 0.12593951493038993),
 (4, 0.12041560749117791),
 (5, 0.1656402864225709),
 (6, 0.09287340221089282),
 (7, 0.2408395494152027),
 (8, 0.17400117160089892),
 (9, 0.13328396766128953),
 (10, 0.37183888203342846),
 (11, 0.12950483715460168),
 (12, 0.12164659444314399),
 (13, 0.07066563655542578),
 (14, 0.14471623607418466),
 (15, 0.1350827711577539),
 (16, 0.21790942168216243),
 (17, 0.21177538320627756),
 (18, 0.12206683179289007),
 (19, 0.07136049726362763),
 (20, 0.1275523246499251),
 (21, 0.13991794423212425),
 (22, 0.19171106528329587),
 (23, 0.19997657445410555),
 (24, 0.18504169396015993),
 (25, 0.11826640835047106),
 (26, 0.11975015218826524),
 (27, 0.31385485417075476),
 (28, 0.14351833413957676),
 (29, 0.12985592022439799),
 (30, 0.12143837065284914),
 (31, 0.09940139023405369),
 (32, 0.1974073135423591),
 (33, 0.097720348862514)]


### Running LDA using Bag of Words

In [19]:
# Train a 6-topic LDA model on the bag-of-words corpus (2 passes, 2 worker processes).
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=6,
    passes=2,
    workers=2,
)

In [20]:
# List every topic of the bag-of-words model (-1 requests all topics).
for topic_id, top_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, top_terms))

Topic: 0 
Words: 0.010*"letter" + 0.009*"write" + 0.007*"say" + 0.006*"love" + 0.006*"go" + 0.006*"mari" + 0.006*"miss" + 0.005*"clark" + 0.005*"know" + 0.005*"leav"
Topic: 1 
Words: 0.007*"woman" + 0.006*"year" + 0.006*"say" + 0.006*"love" + 0.005*"littl" + 0.005*"like" + 0.005*"time" + 0.005*"give" + 0.004*"bell" + 0.004*"sister"
Topic: 2 
Words: 0.008*"lola" + 0.007*"ladi" + 0.007*"time" + 0.006*"say" + 0.005*"young" + 0.005*"come" + 0.005*"year" + 0.005*"love" + 0.005*"littl" + 0.005*"work"
Topic: 3 
Words: 0.008*"life" + 0.007*"year" + 0.007*"come" + 0.006*"time" + 0.006*"love" + 0.005*"go" + 0.005*"say" + 0.005*"take" + 0.005*"long" + 0.005*"littl"
Topic: 4 
Words: 0.006*"come" + 0.005*"year" + 0.005*"know" + 0.004*"time" + 0.004*"read" + 0.004*"work" + 0.004*"hous" + 0.004*"women" + 0.004*"life" + 0.004*"ladi"
Topic: 5 
Words: 0.007*"time" + 0.007*"year" + 0.006*"king" + 0.006*"lola" + 0.006*"carew" + 0.005*"go" + 0.005*"come" + 0.005*"write" + 0.004*"great" + 0.004*"leav"


### Running LDA using TFIDF

In [18]:
# Train a 5-topic LDA model on the TF-IDF-weighted corpus (2 passes, 4 worker processes).
# NOTE(review): the bag-of-words model uses 6 topics and 2 workers — confirm the
# difference in settings is intentional.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=5,
    passes=2,
    workers=4,
)

# List every topic of the TF-IDF model (-1 requests all topics).
for topic_id, top_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, top_terms))

Topic: 0 Word: 0.003*"king" + 0.003*"know" + 0.003*"come" + 0.003*"go" + 0.002*"love" + 0.002*"like" + 0.002*"thou" + 0.002*"say" + 0.002*"woman" + 0.002*"write"
Topic: 1 Word: 0.003*"love" + 0.003*"say" + 0.003*"time" + 0.003*"come" + 0.002*"know" + 0.002*"year" + 0.002*"go" + 0.002*"littl" + 0.002*"miss" + 0.002*"like"
Topic: 2 Word: 0.003*"sister" + 0.003*"come" + 0.003*"year" + 0.003*"life" + 0.003*"love" + 0.002*"time" + 0.002*"great" + 0.002*"say" + 0.002*"know" + 0.002*"go"
Topic: 3 Word: 0.003*"say" + 0.003*"lola" + 0.002*"love" + 0.002*"time" + 0.002*"come" + 0.002*"year" + 0.002*"work" + 0.002*"life" + 0.002*"write" + 0.002*"woman"
Topic: 4 Word: 0.003*"year" + 0.003*"life" + 0.003*"write" + 0.003*"love" + 0.003*"work" + 0.003*"time" + 0.002*"friend" + 0.002*"say" + 0.002*"great" + 0.002*"woman"
