In [4]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
path = '/Users/user/Documents/github/CBW/data/textdatanew.csv'

In [6]:
text_data_sentence = pd.read_csv(path , encoding='ISO-8859-1')

In [7]:
text_data_sentence.head()

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


In [8]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample= text_data_sentence.ParagraphText.iloc[0]

In [10]:
stemmer = SnowballStemmer('english')
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['A', 'FRENCH', 'philosopher,', 'moralizing', 'on', 'the', 'great', 'influence', 'of', 'little', 'matters,', 'remarked', 'that', 'a', 'fraction', 'of', 'an', 'inch', 'more', 'on', 'the', 'end', 'of', "Cleopatra's", 'nose', 'would', 'have', 'changed', 'the', 'history', 'of', 'Rome', 'and', 'Egypt.', 'As', 'it', 'was,', 'her', 'unblemished', 'beauty,', 'her', 'wit,', 'and', 'her', 'audacity', 'disarmed', 'two', 'of', 'the', 'greatest', 'generals', 'Rome', 'ever', 'sent', 'into', 'Egypt.', 'Not', 'until', 'a', 'third', 'remained', 'oblivious', 'to', 'the', 'charms', 'she', 'temptingly', 'displayed', 'to', 'him', 'did', 'she', 'abandon', 'her', 'effort', 'to', 'rule', 'the', 'world', 'by', 'beauty,', 'and', 'seek', 'refuge', 'in', 'self-inflicted', 'death.']


 tokenized and lemmatized document: 
['french', 'philosoph', 'moral', 'great', 'influenc', 'littl', 'matter', 'remark', 'fraction', 'inch', 'cleopatra', 'nose', 'chang', 'histori', 'rome', 'egypt', 'unblemish', 'b

In [11]:
stemmer.stem('running')

'run'

In [12]:
len(text_data_sentence)

16637

In [13]:
processed_text = text_data_sentence['ParagraphText'].map(preprocess)
processed_text[:10]

0    [french, philosoph, moral, great, influenc, li...
1    [cleopatra, joint, heir, throne, egypt, younge...
2    [cleopatra, respond, brilliant, retinu, send, ...
3    [caesar, fifti, year, life, soldier, fight, ma...
4    [year, cleopatra, reign, littl, troubl, egypt,...
5    [antoni, amor, dallianc, keep, caesar, alexand...
6    [anecdot, characterist, stori, reach, rome, pe...
7    [cleopatra, take, refug, massiv, mausoleum, bu...
8    [octavius, cold, ambiti, desir, save, cleopatr...
9    [stew, great, copper, bathtub, shape, like, wo...
Name: ParagraphText, dtype: object

In [14]:
dictionary = gensim.corpora.Dictionary(processed_text)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abandon
1 audac
2 beauti
3 chang
4 charm
5 cleopatra
6 death
7 disarm
8 display
9 effort
10 egypt


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [16]:
bow_corpus = [dictionary.doc2bow(para) for para in processed_text]
bow_corpus[4310]

[(33, 1),
 (183, 1),
 (555, 1),
 (663, 1),
 (671, 1),
 (906, 1),
 (933, 1),
 (940, 1),
 (1181, 1),
 (1348, 1),
 (1527, 1),
 (1822, 1),
 (3397, 1),
 (4065, 1),
 (4852, 1)]

In [17]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 33 ("world") appears 1 time.
Word 183 ("soldier") appears 1 time.
Word 555 ("establish") appears 1 time.
Word 663 ("england") appears 1 time.
Word 671 ("town") appears 1 time.
Word 906 ("call") appears 1 time.
Word 933 ("go") appears 1 time.
Word 940 ("mission") appears 1 time.
Word 1181 ("branch") appears 1 time.
Word 1348 ("plant") appears 1 time.
Word 1527 ("part") appears 1 time.
Word 1822 ("station") appears 1 time.
Word 3397 ("foreign") appears 1 time.
Word 4065 ("convert") appears 1 time.
Word 4852 ("aldershot") appears 1 time.


In [18]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.17982145990951667),
 (1, 0.22678241597781135),
 (2, 0.18208101011145128),
 (3, 0.12593951493038993),
 (4, 0.12041560749117791),
 (5, 0.1656402864225709),
 (6, 0.09287340221089282),
 (7, 0.2408395494152027),
 (8, 0.17400117160089892),
 (9, 0.13328396766128953),
 (10, 0.37183888203342846),
 (11, 0.12950483715460168),
 (12, 0.12164659444314399),
 (13, 0.07066563655542578),
 (14, 0.14471623607418466),
 (15, 0.1350827711577539),
 (16, 0.21790942168216243),
 (17, 0.21177538320627756),
 (18, 0.12206683179289007),
 (19, 0.07136049726362763),
 (20, 0.1275523246499251),
 (21, 0.13991794423212425),
 (22, 0.19171106528329587),
 (23, 0.19997657445410555),
 (24, 0.18504169396015993),
 (25, 0.11826640835047106),
 (26, 0.11975015218826524),
 (27, 0.31385485417075476),
 (28, 0.14351833413957676),
 (29, 0.12985592022439799),
 (30, 0.12143837065284914),
 (31, 0.09940139023405369),
 (32, 0.1974073135423591),
 (33, 0.097720348862514)]


### Running LDA using Bag of Words

In [19]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=2, workers=2)

In [20]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"carew" + 0.007*"love" + 0.006*"year" + 0.006*"say" + 0.006*"write" + 0.006*"time" + 0.006*"know" + 0.005*"princ" + 0.005*"queen" + 0.005*"life"
Topic: 1 
Words: 0.018*"lola" + 0.008*"come" + 0.008*"go" + 0.005*"king" + 0.005*"love" + 0.005*"montez" + 0.005*"leav" + 0.004*"like" + 0.004*"long" + 0.004*"hous"
Topic: 2 
Words: 0.008*"ladi" + 0.007*"year" + 0.007*"good" + 0.006*"woman" + 0.006*"time" + 0.005*"littl" + 0.004*"write" + 0.004*"love" + 0.004*"say" + 0.004*"women"
Topic: 3 
Words: 0.011*"work" + 0.009*"sister" + 0.007*"year" + 0.007*"time" + 0.006*"great" + 0.006*"miss" + 0.006*"hospit" + 0.006*"dora" + 0.005*"write" + 0.005*"letter"
Topic: 4 
Words: 0.008*"year" + 0.006*"friend" + 0.006*"take" + 0.006*"come" + 0.006*"meet" + 0.005*"say" + 0.005*"go" + 0.004*"time" + 0.004*"know" + 0.004*"littl"
Topic: 5 
Words: 0.009*"life" + 0.007*"say" + 0.007*"beauti" + 0.007*"time" + 0.006*"love" + 0.006*"know" + 0.006*"come" + 0.006*"write" + 0.005*"littl" + 0.005*

### Running LDA using TFIDF

In [21]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"year" + 0.002*"write" + 0.002*"good" + 0.002*"time" + 0.002*"know" + 0.002*"life" + 0.002*"miss" + 0.002*"love" + 0.002*"say" + 0.002*"work"
Topic: 1 Word: 0.003*"sister" + 0.003*"dora" + 0.003*"time" + 0.003*"year" + 0.003*"love" + 0.003*"life" + 0.002*"work" + 0.002*"come" + 0.002*"hospit" + 0.002*"long"
Topic: 2 Word: 0.003*"year" + 0.003*"woman" + 0.002*"love" + 0.002*"say" + 0.002*"life" + 0.002*"come" + 0.002*"time" + 0.002*"ladi" + 0.002*"like" + 0.002*"young"
Topic: 3 Word: 0.003*"come" + 0.003*"say" + 0.003*"love" + 0.003*"time" + 0.002*"year" + 0.002*"life" + 0.002*"know" + 0.002*"work" + 0.002*"write" + 0.002*"great"
Topic: 4 Word: 0.003*"say" + 0.003*"love" + 0.002*"littl" + 0.002*"come" + 0.002*"year" + 0.002*"life" + 0.002*"know" + 0.002*"go" + 0.002*"king" + 0.002*"time"


In [22]:
processed_text[9]

['stew',
 'great',
 'copper',
 'bathtub',
 'shape',
 'like',
 'wooden',
 'shoe',
 'hop',
 'allevi',
 'tortur',
 'diseas',
 'pestilenti',
 'exist',
 'month',
 'marat',
 'self',
 'style',
 'friend',
 'peopl',
 'hand',
 'note',
 'young',
 'girl',
 'write',
 'bring',
 'news',
 'plot',
 'conspiraci',
 'republ',
 'caen']

In [23]:
bow_corpus[9]

[(13, 1),
 (30, 1),
 (79, 1),
 (139, 1),
 (154, 1),
 (156, 1),
 (159, 1),
 (171, 1),
 (238, 1),
 (284, 1),
 (309, 1),
 (350, 1),
 (351, 1),
 (352, 1),
 (353, 1),
 (354, 1),
 (355, 1),
 (356, 1),
 (357, 1),
 (358, 1),
 (359, 1),
 (360, 1),
 (361, 1),
 (362, 1),
 (363, 1),
 (364, 1)]

In [24]:
para_topics = [lda_model[each] for each in bow_corpus]

In [25]:
len(para_topics)

16637

In [26]:
len(text_data_sentence)

16637

In [27]:
para_topics = np.array(para_topics)

In [28]:
text_data_sentence['para_topics'] = para_topics

In [29]:
text_data_sentence.head()

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText,para_topics
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ...","[(1, 0.24319285), (2, 0.4540167), (5, 0.289499..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...,"[(1, 0.98286134)]"
2,a001,bio04,3,Cleopatra might have responded with a brillian...,"[(0, 0.28094766), (1, 0.6131604), (5, 0.099730..."
3,a001,bio04,4,Caesar was then above fifty years of age. His ...,"[(1, 0.7518664), (2, 0.2411164)]"
4,a001,bio04,5,For three years Cleopatra reigned with little ...,"[(1, 0.9342898), (4, 0.05846056)]"


In [44]:
#para_topics

topic_df = pd.DataFrame()

### Loop through the results of every paragraph
for index,topics in enumerate(para_topics):
    topic_df_temp = pd.DataFrame()
    
    ### For each paragraph
    for (topic,prob) in topics:
        topic_df_temp.loc[index,'Topic '+str(topic)] = prob
        #print(topic_df_temp)
        
    #### Storing the results for all the topics
    topic_df = pd.concat([topic_df,topic_df_temp],axis = 0)    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  from ipykernel import kernelapp as app


In [45]:
topic_df = topic_df.fillna(0)
topic_df.head()

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,0.0,0.243193,0.454017,0.0,0.0,0.2895
1,0.0,0.982861,0.0,0.0,0.0,0.0
2,0.280948,0.61316,0.0,0.0,0.0,0.09973
3,0.0,0.751866,0.241116,0.0,0.0,0.0
4,0.0,0.93429,0.0,0.0,0.058461,0.0


In [47]:
text_data_sentence_final = pd.concat([text_data_sentence,topic_df],axis = 1)

In [48]:
text_data_sentence_final.head()

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText,para_topics,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ...","[(1, 0.24319285), (2, 0.4540167), (5, 0.289499...",0.0,0.243193,0.454017,0.0,0.0,0.2895
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...,"[(1, 0.98286134)]",0.0,0.982861,0.0,0.0,0.0,0.0
2,a001,bio04,3,Cleopatra might have responded with a brillian...,"[(0, 0.28094766), (1, 0.6131604), (5, 0.099730...",0.280948,0.61316,0.0,0.0,0.0,0.09973
3,a001,bio04,4,Caesar was then above fifty years of age. His ...,"[(1, 0.7518664), (2, 0.2411164)]",0.0,0.751866,0.241116,0.0,0.0,0.0
4,a001,bio04,5,For three years Cleopatra reigned with little ...,"[(1, 0.9342898), (4, 0.05846056)]",0.0,0.93429,0.0,0.0,0.058461,0.0
