Ref source: Topic Modeling and Latent Dirichlet Allocation (LDA) in Python by Susan Li published in Towards Data Science 2018

In [2]:
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv',  on_bad_lines='skip');
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text
### Note: argument 'error_bad_lines=False' has been deprecated

In [3]:
### Take a look at data/abc news headlines
print(len(documents))
print(documents[:5])

1244184
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [4]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
data.tail()

Unnamed: 0,publish_date,headline_text
1244179,20211231,two aged care residents die as state records 2...
1244180,20211231,victoria records 5;919 new cases and seven deaths
1244181,20211231,wa delays adopting new close contact definition
1244182,20211231,western ringtail possums found badly dehydrate...
1244183,20211231,what makes you a close covid contact here are ...


### Text data pre-processing involving the following steps
Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
Words that have fewer than 3 characters are removed.
All stopwords are removed.
Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
Words are stemmed — words are reduced to their root form.

In [7]:
pip install PyStemmer




In [8]:
### Load gensim and nltk libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import *
from nltk.stem.porter import *
stemmer = PorterStemmer()
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vista\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
### Write a function to perform lemmatize and stem preprocessing steps on the data set.
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [10]:
### Select a document to preview after preprocessing.
doc_sample = documents[documents['index'] == 8972].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rail', 'tunnel', 'plan', 'may', 'be', 'back', 'on', 'state', 'agenda']


 tokenized and lemmatized document: 
['rail', 'tunnel', 'plan', 'state', 'agenda']


In [11]:
### This is how the above looks like in the original headline text
print(documents[8970:8975])

                                        headline_text  index
8970         raa predicts drop in country fuel prices   8970
8971  rail brochure scrapped over townsville omission   8971
8972     rail tunnel plan may be back on state agenda   8972
8973            rain leads to canning river fish kill   8973
8974  raising teenagers could be a poverty trap acoss   8974


In [12]:
### Preprocess the headline text, saving the results as ‘processed_docs’
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [13]:
len(processed_docs)

1244184

# Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 50:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit
11 aust
12 rise
13 staff
14 strike
15 affect
16 australian
17 travel
18 ambiti
19 jump
20 olsson
21 tripl
22 win
23 antic
24 barca
25 break
26 delight
27 record
28 aussi
29 match
30 memphi
31 qualifi
32 stosur
33 wast
34 address
35 council
36 iraq
37 secur
38 australia
39 lock
40 timet
41 contribut
42 million
43 birthday
44 celebr
45 robson
46 ahead
47 bathhous
48 plan
49 championship
50 cycl


In [16]:
print(dictionary[70292])
len(dictionary)

usaustralia


70293

Gensim filter_extremes

Filter out tokens that appear in less than 15 documents (absolute number) i.e. rare words
or more than 0.5 documents (fraction of total corpus size), i.e. overly common words.
after the above two steps, keep only the first 100000 most frequent tokens.
### dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [18]:
dictionary.filter_extremes(no_below=1, no_above=0.99, keep_n=100000)

In [19]:
len(dictionary)

70293

Gensim doc2bow

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [21]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[8972]

[(48, 1), (60, 1), (923, 1), (1468, 1), (2827, 1)]

# Preview Bag Of Words for our sample preprocessed document.

In [23]:
bow_doc_8972 = bow_corpus[8972]
for i in range(len(bow_doc_8972)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_8972[i][0], 
                                               dictionary[bow_doc_8972[i][0]], 
bow_doc_8972[i][1]))

Word 48 ("plan") appears 1 time.
Word 60 ("state") appears 1 time.
Word 923 ("tunnel") appears 1 time.
Word 1468 ("rail") appears 1 time.
Word 2827 ("agenda") appears 1 time.


In [24]:
len(bow_corpus)

1244184

In [25]:
print(bow_corpus[0])
print(bow_corpus[1])
print(bow_corpus[2])
print(bow_corpus[3])
print(bow_corpus[4])
print(bow_corpus[5])

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(4, 1), (5, 1), (6, 1)]
[(7, 1), (8, 1), (9, 1), (10, 1)]
[(11, 1), (12, 1), (13, 1), (14, 1)]
[(14, 1), (15, 1), (16, 1), (17, 1)]
[(18, 1), (19, 1), (20, 1), (21, 1), (22, 1)]


# TF-IDF
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [27]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint

count = 0
for doc in corpus_tfidf:
    pprint(doc)
    count += 1
    if count > 5:
        break

[(0, 0.5854395661274623),
 (1, 0.383252758688686),
 (2, 0.50230806644029),
 (3, 0.5080004367704987)]
[(4, 0.6001950088242418), (5, 0.6146521710780535), (6, 0.5118287408611435)]
[(7, 0.3823222189653971),
 (8, 0.5648602809024891),
 (9, 0.47289714459609433),
 (10, 0.5577910671362503)]
[(11, 0.5358712461682118),
 (12, 0.43512898451094045),
 (13, 0.53611420657169),
 (14, 0.485887159616935)]
[(14, 0.47235494692269364),
 (15, 0.5715156253616932),
 (16, 0.38223210042906264),
 (17, 0.5514973395100641)]
[(18, 0.4989690427684149),
 (19, 0.35526178458502394),
 (20, 0.6399442306983866),
 (21, 0.38583734587630736),
 (22, 0.2577205519457323)]


In [28]:
len(corpus_tfidf)

1244184

# Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [30]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [31]:
### For each topic, we will explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.036*"trump" + 0.032*"sydney" + 0.024*"melbourn" + 0.023*"charg" + 0.023*"court" + 0.023*"china" + 0.020*"donald" + 0.019*"murder" + 0.017*"face" + 0.016*"woman"
Topic: 1 
Words: 0.056*"covid" + 0.044*"australian" + 0.031*"coronaviru" + 0.022*"vaccin" + 0.018*"open" + 0.016*"case" + 0.015*"australia" + 0.015*"world" + 0.010*"win" + 0.009*"break"
Topic: 2 
Words: 0.039*"victoria" + 0.021*"warn" + 0.020*"adelaid" + 0.016*"final" + 0.013*"travel" + 0.013*"andrew" + 0.012*"street" + 0.012*"coronaviru" + 0.011*"time" + 0.011*"hotel"
Topic: 3 
Words: 0.028*"australia" + 0.028*"south" + 0.018*"north" + 0.017*"victorian" + 0.017*"test" + 0.016*"miss" + 0.015*"coronaviru" + 0.014*"west" + 0.013*"lose" + 0.011*"search"
Topic: 4 
Words: 0.030*"year" + 0.021*"women" + 0.021*"record" + 0.018*"border" + 0.015*"speak" + 0.014*"life" + 0.013*"australia" + 0.012*"sentenc" + 0.012*"farm" + 0.012*"abus"
Topic: 5 
Words: 0.029*"kill" + 0.016*"dead" + 0.014*"protest" + 0.014*"presid" + 0.

Can you distinguish different topics using the words in each topic and their corresponding weights?

In [33]:
# Print the topics
topics = lda_model.print_topics(num_words=30)  # You can specify the number of words per topic
for topic in topics:
    print(topic)

(0, '0.036*"trump" + 0.032*"sydney" + 0.024*"melbourn" + 0.023*"charg" + 0.023*"court" + 0.023*"china" + 0.020*"donald" + 0.019*"murder" + 0.017*"face" + 0.016*"woman" + 0.015*"peopl" + 0.015*"restrict" + 0.015*"brisban" + 0.013*"trial" + 0.012*"case" + 0.012*"accus" + 0.010*"care" + 0.010*"flood" + 0.009*"tell" + 0.009*"alleg" + 0.009*"age" + 0.009*"drug" + 0.009*"guilti" + 0.008*"hear" + 0.008*"storm" + 0.008*"home" + 0.007*"arrest" + 0.006*"intern" + 0.006*"bring" + 0.006*"hobart"')
(1, '0.056*"covid" + 0.044*"australian" + 0.031*"coronaviru" + 0.022*"vaccin" + 0.018*"open" + 0.016*"case" + 0.015*"australia" + 0.015*"world" + 0.010*"win" + 0.009*"break" + 0.008*"game" + 0.007*"updat" + 0.007*"health" + 0.007*"test" + 0.007*"race" + 0.007*"mental" + 0.006*"river" + 0.006*"stori" + 0.006*"aborigin" + 0.006*"free" + 0.006*"sport" + 0.006*"unit" + 0.006*"peter" + 0.006*"minist" + 0.005*"action" + 0.005*"play" + 0.005*"hill" + 0.005*"olymp" + 0.005*"team" + 0.005*"histori"')
(2, '0.039*"

# Running LDA using TF-IDF

In [35]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.011*"interview" + 0.010*"scott" + 0.008*"andrew" + 0.008*"weather" + 0.006*"queensland" + 0.006*"search" + 0.006*"david" + 0.006*"dollar" + 0.006*"australia" + 0.006*"juli"
Topic: 1 Word: 0.015*"drum" + 0.010*"stori" + 0.010*"friday" + 0.009*"tuesday" + 0.009*"sport" + 0.008*"michael" + 0.008*"korea" + 0.007*"flight" + 0.006*"daniel" + 0.005*"wild"
Topic: 2 Word: 0.017*"covid" + 0.015*"coronaviru" + 0.011*"restrict" + 0.008*"case" + 0.007*"updat" + 0.007*"record" + 0.006*"govern" + 0.006*"victoria" + 0.005*"australia" + 0.005*"august"
Topic: 3 Word: 0.011*"australia" + 0.009*"govern" + 0.007*"age" + 0.006*"world" + 0.006*"australian" + 0.006*"peter" + 0.006*"cricket" + 0.006*"win" + 0.005*"alan" + 0.005*"celebr"
Topic: 4 Word: 0.031*"trump" + 0.018*"donald" + 0.012*"lockdown" + 0.012*"coronaviru" + 0.007*"turnbul" + 0.007*"covid" + 0.006*"disabl" + 0.006*"footag" + 0.005*"economi" + 0.005*"onlin"
Topic: 5 Word: 0.016*"charg" + 0.016*"murder" + 0.015*"polic" + 0.012*"co

Again, can you distinguish different topics using the words in each topic and their corresponding weights?

# Performance evaluation by classifying sample document using LDA Bag of Words model
We will check where (to which topic) our test document would be classified.

In [38]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [39]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5960099101066589	 
Topic: 0.026*"govern" + 0.019*"chang" + 0.018*"news" + 0.017*"nation" + 0.015*"bushfir" + 0.015*"plan" + 0.014*"tasmania" + 0.013*"school" + 0.012*"commun" + 0.011*"island"

Score: 0.30394983291625977	 
Topic: 0.030*"elect" + 0.015*"lockdown" + 0.013*"coast" + 0.012*"gold" + 0.012*"labor" + 0.011*"power" + 0.011*"tasmanian" + 0.011*"million" + 0.011*"countri" + 0.009*"polit"

Score: 0.012505189515650272	 
Topic: 0.056*"covid" + 0.044*"australian" + 0.031*"coronaviru" + 0.022*"vaccin" + 0.018*"open" + 0.016*"case" + 0.015*"australia" + 0.015*"world" + 0.010*"win" + 0.009*"break"

Score: 0.01250517088919878	 
Topic: 0.028*"australia" + 0.028*"south" + 0.018*"north" + 0.017*"victorian" + 0.017*"test" + 0.016*"miss" + 0.015*"coronaviru" + 0.014*"west" + 0.013*"lose" + 0.011*"search"

Score: 0.012505164369940758	 
Topic: 0.030*"year" + 0.021*"women" + 0.021*"record" + 0.018*"border" + 0.015*"speak" + 0.014*"life" + 0.013*"australia" + 0.012*"sentenc" + 0.012*"fa

In [40]:
### Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.

# Performance evaluation by classifying sample document using LDA TF-IDF model.

In [42]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6002472043037415	 
Topic: 0.011*"countri" + 0.010*"health" + 0.008*"hour" + 0.008*"chang" + 0.007*"climat" + 0.007*"budget" + 0.006*"fund" + 0.006*"feder" + 0.006*"coronaviru" + 0.006*"elect"

Score: 0.1645704060792923	 
Topic: 0.031*"trump" + 0.018*"donald" + 0.012*"lockdown" + 0.012*"coronaviru" + 0.007*"turnbul" + 0.007*"covid" + 0.006*"disabl" + 0.006*"footag" + 0.005*"economi" + 0.005*"onlin"

Score: 0.14761237800121307	 
Topic: 0.016*"charg" + 0.016*"murder" + 0.015*"polic" + 0.012*"court" + 0.010*"alleg" + 0.009*"woman" + 0.009*"child" + 0.009*"assault" + 0.009*"sentenc" + 0.008*"shoot"

Score: 0.012511433102190495	 
Topic: 0.017*"covid" + 0.015*"coronaviru" + 0.011*"restrict" + 0.008*"case" + 0.007*"updat" + 0.007*"record" + 0.006*"govern" + 0.006*"victoria" + 0.005*"australia" + 0.005*"august"

Score: 0.0125108752399683	 
Topic: 0.008*"presid" + 0.007*"violenc" + 0.007*"wednesday" + 0.007*"video" + 0.006*"elect" + 0.006*"biden" + 0.006*"liber" + 0.006*"say" + 0.005*"

In [43]:
### Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.

# Testing model on unseen document

In [45]:
unseen_document = 'China has some of the best technologies on earth'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.4200029671192169	 Topic: 0.036*"trump" + 0.032*"sydney" + 0.024*"melbourn" + 0.023*"charg" + 0.023*"court"
Score: 0.2199912816286087	 Topic: 0.056*"covid" + 0.044*"australian" + 0.031*"coronaviru" + 0.022*"vaccin" + 0.018*"open"
Score: 0.2199835628271103	 Topic: 0.030*"year" + 0.021*"women" + 0.021*"record" + 0.018*"border" + 0.015*"speak"
Score: 0.020003177225589752	 Topic: 0.039*"victoria" + 0.021*"warn" + 0.020*"adelaid" + 0.016*"final" + 0.013*"travel"
Score: 0.020003177225589752	 Topic: 0.028*"australia" + 0.028*"south" + 0.018*"north" + 0.017*"victorian" + 0.017*"test"
Score: 0.020003177225589752	 Topic: 0.029*"kill" + 0.016*"dead" + 0.014*"protest" + 0.014*"presid" + 0.013*"biden"
Score: 0.020003177225589752	 Topic: 0.030*"elect" + 0.015*"lockdown" + 0.013*"coast" + 0.012*"gold" + 0.012*"labor"
Score: 0.020003177225589752	 Topic: 0.062*"polic" + 0.033*"death" + 0.026*"live" + 0.023*"famili" + 0.022*"crash"
Score: 0.020003177225589752	 Topic: 0.026*"govern" + 0.019*"chan