In [0]:
pip install wikipedia

In [0]:
pip install pyLDAvis

In [101]:
pip install wikipedia-api



Now we will download four Wikipedia articles on the topics "Global Warming", "Artifical Intelligence", "Eiffel Tower", and "Mona Lisa". Next, we will preprocess the articles, followed by the topic modeling step. Finally, we will see how we can visualize the LDA model.

In [102]:
import wikipedia
import nltk

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
nltk.download('wordnet')
  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
artificial_intelligence = wikipedia.page("Artificial Intelligence")
mona_lisa = wikipedia.page("Mona Lisa")
eiffel_tower = wikipedia.page("Eiffel Tower")

In [0]:
corpus = [artificial_intelligence.content, mona_lisa.content, eiffel_tower.content]

In [0]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens

In [0]:
processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

In [0]:
from gensim import corpora

gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]

In [108]:
import pickle

pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [109]:
import pickle

pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [110]:
import gensim

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [111]:
topics = lda_model.print_topics(num_words=50)
for topic in topics:
    print(topic)

(0, '0.017*"intelligence" + 0.014*"machine" + 0.013*"artificial" + 0.011*"problem" + 0.010*"learning" + 0.009*"system" + 0.008*"network" + 0.007*"research" + 0.007*"knowledge" + 0.007*"computer" + 0.006*"algorithm" + 0.006*"neural" + 0.005*"example" + 0.005*"intelligent" + 0.005*"approach" + 0.005*"researcher" + 0.005*"theory" + 0.004*"symbolic" + 0.004*"application" + 0.004*"search" + 0.003*"include" + 0.003*"general" + 0.003*"decision" + 0.003*"language" + 0.003*"reasoning" + 0.003*"technology" + 0.003*"possible" + 0.003*"people" + 0.003*"function" + 0.003*"process" + 0.002*"behavior" + 0.002*"information" + 0.002*"natural" + 0.002*"program" + 0.002*"different" + 0.002*"classifier" + 0.002*"driving" + 0.002*"number" + 0.002*"method" + 0.002*"solution" + 0.002*"processing" + 0.002*"action" + 0.002*"patient" + 0.002*"autonomous" + 0.002*"recognition" + 0.002*"others" + 0.002*"development" + 0.002*"science" + 0.002*"project" + 0.002*"question"')
(1, '0.027*"eiffel" + 0.008*"second" + 0.

In [112]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=3, id2word=gensim_dictionary, passes=15)
lda_model.save('gensim_model.gensim')
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.034*"painting" + 0.016*"leonardo" + 0.009*"portrait" + 0.009*"louvre" + 0.006*"century"')
(1, '0.016*"intelligence" + 0.014*"machine" + 0.012*"artificial" + 0.011*"problem" + 0.010*"learning"')
(2, '0.025*"eiffel" + 0.007*"second" + 0.006*"french" + 0.006*"structure" + 0.006*"exposition"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [113]:
test_doc = ' eiffel structures are build to remember an event happened in the history.'
test_doc = preprocess_text(test_doc)
bow_test_doc = gensim_dictionary.doc2bow(test_doc)

print(lda_model.get_document_topics(bow_test_doc))

[(0, 0.090093866), (1, 0.08464003), (2, 0.8252661)]


In [114]:
print('\nPerplexity:', lda_model.log_perplexity(gensim_corpus))

from gensim.models import CoherenceModel

coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nCoherence Score:', coherence_score)


Perplexity: -7.587623205998928

Coherence Score: 0.6545631673869586


In [115]:
gensim_dictionary = gensim.corpora.Dictionary.load('gensim_dictionary.gensim')
gensim_corpus = pickle.load(open('gensim_corpus_corpus.pkl', 'rb'))
lda_model = gensim.models.ldamodel.LdaModel.load('gensim_model.gensim')

import pyLDAvis.gensim

lda_visualization = pyLDAvis.gensim.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
