In [16]:
from nltk.corpus import gutenberg
 
# One string per Gutenberg file: its word tokens glued back together with
# single spaces, in the order returned by gutenberg.fileids().
data = [' '.join(gutenberg.words(fileid)) for fileid in gutenberg.fileids()]

# Size of the corpus (number of documents collected above).
NO_DOCUMENTS = len(data)


In [17]:
# Sanity check: report how many Gutenberg documents were collected into `data`.
print(NO_DOCUMENTS)

18


In [18]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Number of topics requested from both the LDA and the LSI model.
NUM_TOPICS = 10

# NLTK's English stopword list; clean_text drops any token found in it.
STOPWORDS = stopwords.words('english')

# Single shared Porter stemmer instance, reused for every token.
porter = PorterStemmer()


# Compiled once and written as a raw string so that `\-` reaches the regex
# engine verbatim instead of being an invalid Python string escape.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text: str) -> list:
    """Turn raw text into a list of stemmed, filtered tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is not in ``STOPWORDS`` and it *starts* with at least three
    characters drawn from ASCII letters and ``-``. Every surviving token is
    then stemmed with the shared ``porter`` stemmer.

    Args:
        text: The document (or query sentence) to normalise.

    Returns:
        The stems of the kept tokens, in their original order. Empty when
        nothing survives the filters.
    """
    # Set lookup is O(1) per token; scanning the stopword list for every
    # token is needlessly slow on book-length documents.
    stopword_set = frozenset(STOPWORDS)

    # NOTE: match() anchors only at the start of the token, so a token with a
    # valid three-character prefix is kept even if other characters follow.
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stopword_set and _TOKEN_PATTERN.match(token)
    ]
    return [porter.stem(token) for token in kept_tokens]
 
# For gensim we need to tokenize the data and filter out stopwords
# (clean_text also stems the surviving tokens): one token list per document.
tokenized_data = [clean_text(text) for text in data]

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form (bag-of-words per document)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# LDA training starts from a random initialisation; pinning the seed makes the
# printed topics reproducible across kernel restarts.
RANDOM_SEED = 42

# Build the LDA model
# NOTE(review): apart from the seed this uses gensim's default training
# settings; the topics printed further down look nearly identical to each
# other, so tuning (e.g. more training passes) may be worth trying - confirm.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=RANDOM_SEED,
)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [12]:
# Same report layout for both models: a heading, one line per topic, then a
# separator rule. LDA is reported first, LSI second.
for heading, model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)

    for idx in range(NUM_TOPICS):
        # Print the 10 most representative words of this topic
        print("Topic #%s:" % idx, model.print_topic(idx, 10))

    print("=" * 20)

LDA Model:
Topic #0: 0.015*"lord" + 0.014*"shall" + 0.014*"unto" + 0.010*"said" + 0.007*"one" + 0.007*"man" + 0.007*"thou" + 0.006*"thi" + 0.006*"god" + 0.006*"thee"
Topic #1: 0.012*"shall" + 0.011*"said" + 0.009*"thou" + 0.009*"unto" + 0.008*"lord" + 0.007*"one" + 0.006*"thi" + 0.006*"man" + 0.006*"god" + 0.006*"day"
Topic #2: 0.015*"shall" + 0.013*"lord" + 0.013*"unto" + 0.009*"said" + 0.008*"thou" + 0.007*"god" + 0.007*"thi" + 0.007*"thee" + 0.006*"day" + 0.005*"man"
Topic #3: 0.011*"shall" + 0.008*"said" + 0.008*"lord" + 0.008*"unto" + 0.006*"thou" + 0.006*"god" + 0.005*"thee" + 0.005*"one" + 0.005*"man" + 0.005*"thing"
Topic #4: 0.008*"said" + 0.006*"shall" + 0.006*"one" + 0.006*"man" + 0.005*"say" + 0.005*"thing" + 0.004*"thou" + 0.004*"would" + 0.004*"lord" + 0.004*"upon"
Topic #5: 0.010*"shall" + 0.009*"said" + 0.008*"thou" + 0.007*"one" + 0.006*"man" + 0.006*"thi" + 0.006*"lord" + 0.005*"god" + 0.005*"upon" + 0.005*"thee"
Topic #6: 0.013*"shall" + 0.012*"unto" + 0.008*"said" +

In [13]:
# Run an unseen sentence through the same cleaning pipeline and convert it to
# a bag-of-words vector using the dictionary built from the corpus.
text = "Emma is rich and beautiful"
bow = dictionary.doc2bow(clean_text(text))

# Print the sentence's representation under each model: LSI first, then LDA.
for model in (lsi_model, lda_model):
    print(model[bow])
 

[(0, 0.013056957753757379), (1, 0.0994094120198734), (2, 0.14846075268578937), (3, 0.22478003402701674), (4, -0.00028436995679137836), (5, -0.3273406658518878), (6, -0.042856612249100076), (7, 0.10299981018667717), (8, -0.1277115268464437), (9, 0.04509400138211044)]
[(0, 0.025012754), (1, 0.02501381), (2, 0.025012048), (3, 0.025012705), (4, 0.025012277), (5, 0.025013205), (6, 0.025011422), (7, 0.025013603), (8, 0.7748856), (9, 0.025012609)]
