In [1]:
import pandas as pd

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
len(documents)


158224

Data Preprocessing¶


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)



In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saigiriraj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
print(WordNetLemmatizer().lemmatize('thought', pos='v'))


think


In [6]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [7]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [9]:
processed_docs = documents['headline_text'].map(preprocess)


In [10]:
processed_docs[:10]


0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

Bag of words on the dataset¶


In [11]:
dictionary = gensim.corpora.Dictionary(processed_docs)


In [12]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 25:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit
11 aust
12 rise
13 staff
14 strike
15 affect
16 australian
17 travel
18 ambiti
19 jump
20 olsson
21 tripl
22 win
23 antic
24 barca
25 break


In [13]:
dictionary.filter_extremes(no_below=15, no_above=1, keep_n=100000)


In [14]:

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(69, 1), (104, 1), (449, 1), (3153, 1)]

In [15]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 69 ("bushfir") appears 1 time.
Word 104 ("help") appears 1 time.
Word 449 ("rain") appears 1 time.
Word 3153 ("dampen") appears 1 time.


TF-IDF

In [16]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [17]:
corpus_tfidf = tfidf[bow_corpus]

In [18]:

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5718774623442047),
 (1, 0.40111507607293295),
 (2, 0.4924062908021152),
 (3, 0.5192291484385875)]


Running LDA using Bag of Words¶


In [19]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)


In [20]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.078*"polic" + 0.033*"charg" + 0.028*"court" + 0.027*"face" + 0.020*"probe" + 0.019*"death" + 0.018*"murder" + 0.018*"investig" + 0.016*"jail" + 0.013*"drug"
Topic: 1 
Words: 0.036*"council" + 0.029*"iraq" + 0.027*"plan" + 0.016*"warn" + 0.012*"govt" + 0.011*"leader" + 0.011*"develop" + 0.010*"resid" + 0.010*"chang" + 0.010*"say"
Topic: 2 
Words: 0.027*"crash" + 0.025*"continu" + 0.017*"final" + 0.017*"strike" + 0.014*"train" + 0.013*"market" + 0.013*"aussi" + 0.013*"search" + 0.013*"water" + 0.012*"forc"
Topic: 3 
Words: 0.030*"boost" + 0.021*"test" + 0.019*"servic" + 0.015*"tsunami" + 0.015*"health" + 0.012*"break" + 0.011*"north" + 0.010*"look" + 0.009*"govt" + 0.009*"fund"
Topic: 4 
Words: 0.015*"lead" + 0.012*"south" + 0.012*"award" + 0.011*"england" + 0.011*"india" + 0.011*"effort" + 0.010*"honour" + 0.010*"move" + 0.008*"indonesia" + 0.008*"fire"
Topic: 5 
Words: 0.048*"kill" + 0.029*"attack" + 0.023*"rise" + 0.015*"bomb" + 0.013*"iraqi" + 0.012*"cost" + 0.012*

In [21]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [22]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.013*"fund" + 0.013*"govt" + 0.009*"boost" + 0.008*"council" + 0.008*"plan" + 0.008*"health" + 0.008*"servic" + 0.008*"region" + 0.008*"urg" + 0.007*"group"
Topic: 1 Word: 0.008*"kill" + 0.007*"soldier" + 0.007*"polic" + 0.006*"doubl" + 0.006*"driver" + 0.006*"speed" + 0.005*"warrior" + 0.005*"chopper" + 0.004*"iraq" + 0.004*"beazley"
Topic: 2 Word: 0.016*"miss" + 0.014*"search" + 0.008*"polic" + 0.006*"doubt" + 0.006*"aceh" + 0.006*"indonesia" + 0.006*"toll" + 0.006*"death" + 0.005*"continu" + 0.005*"export"
Topic: 3 Word: 0.007*"england" + 0.007*"world" + 0.006*"test" + 0.006*"australia" + 0.005*"break" + 0.005*"pakistan" + 0.005*"hostag" + 0.005*"hill" + 0.005*"chelsea" + 0.004*"team"
Topic: 4 Word: 0.014*"crash" + 0.012*"polic" + 0.008*"charg" + 0.008*"die" + 0.008*"fatal" + 0.008*"stab" + 0.007*"murder" + 0.007*"investig" + 0.006*"woman" + 0.006*"bail"
Topic: 5 Word: 0.008*"bodi" + 0.008*"blaze" + 0.007*"child" + 0.007*"abus" + 0.007*"court" + 0.006*"jail" + 0.006*

Classification of the topics
Performance evaluation by classifying sample document using LDA Bag of Words model

In [23]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.30225178599357605	 
Topic: 0.027*"govt" + 0.021*"minist" + 0.020*"urg" + 0.013*"chief" + 0.013*"rule" + 0.012*"target" + 0.010*"tour" + 0.009*"busi" + 0.009*"mayor" + 0.009*"prompt"

Score: 0.28869834542274475	 
Topic: 0.030*"boost" + 0.021*"test" + 0.019*"servic" + 0.015*"tsunami" + 0.015*"health" + 0.012*"break" + 0.011*"north" + 0.010*"look" + 0.009*"govt" + 0.009*"fund"

Score: 0.26903730630874634	 
Topic: 0.078*"polic" + 0.033*"charg" + 0.028*"court" + 0.027*"face" + 0.020*"probe" + 0.019*"death" + 0.018*"murder" + 0.018*"investig" + 0.016*"jail" + 0.013*"drug"

Score: 0.020004281774163246	 
Topic: 0.027*"crash" + 0.025*"continu" + 0.017*"final" + 0.017*"strike" + 0.014*"train" + 0.013*"market" + 0.013*"aussi" + 0.013*"search" + 0.013*"water" + 0.012*"forc"

Score: 0.0200033001601696	 
Topic: 0.021*"fund" + 0.014*"indigen" + 0.012*"gold" + 0.012*"abus" + 0.011*"child" + 0.011*"get" + 0.011*"communiti" + 0.011*"govt" + 0.011*"council" + 0.011*"clear"

Score: 0.02000203169

Performance evaluation by classifying sample document using LDA TF-IDF model¶


In [24]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5611538887023926	 
Topic: 0.017*"tsunami" + 0.008*"shoot" + 0.005*"dead" + 0.005*"warn" + 0.005*"death" + 0.005*"wind" + 0.005*"bird" + 0.005*"troop" + 0.005*"bushfir" + 0.005*"iraq"

Score: 0.2788211405277252	 
Topic: 0.008*"plan" + 0.007*"centr" + 0.006*"council" + 0.006*"govt" + 0.006*"market" + 0.006*"care" + 0.006*"green" + 0.006*"latham" + 0.005*"issu" + 0.005*"teacher"

Score: 0.02000528760254383	 
Topic: 0.014*"crash" + 0.012*"polic" + 0.008*"charg" + 0.008*"die" + 0.008*"fatal" + 0.008*"stab" + 0.007*"murder" + 0.007*"investig" + 0.006*"woman" + 0.006*"bail"

Score: 0.02000509202480316	 
Topic: 0.013*"fund" + 0.013*"govt" + 0.009*"boost" + 0.008*"council" + 0.008*"plan" + 0.008*"health" + 0.008*"servic" + 0.008*"region" + 0.008*"urg" + 0.007*"group"

Score: 0.020004358142614365	 
Topic: 0.008*"bodi" + 0.008*"blaze" + 0.007*"child" + 0.007*"abus" + 0.007*"court" + 0.006*"jail" + 0.006*"firefight" + 0.006*"polic" + 0.006*"appeal" + 0.005*"sentenc"

Score: 0.02000324428

In [25]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.40938445925712585	 Topic: 0.036*"council" + 0.029*"iraq" + 0.027*"plan" + 0.016*"warn" + 0.012*"govt"
Score: 0.2773055136203766	 Topic: 0.027*"crash" + 0.025*"continu" + 0.017*"final" + 0.017*"strike" + 0.014*"train"
Score: 0.19663643836975098	 Topic: 0.078*"polic" + 0.033*"charg" + 0.028*"court" + 0.027*"face" + 0.020*"probe"
Score: 0.016668904572725296	 Topic: 0.020*"australia" + 0.020*"world" + 0.017*"record" + 0.017*"miss" + 0.014*"play"
Score: 0.016667885705828667	 Topic: 0.015*"lead" + 0.012*"south" + 0.012*"award" + 0.011*"england" + 0.011*"india"
Score: 0.016667800024151802	 Topic: 0.033*"claim" + 0.026*"hospit" + 0.018*"return" + 0.014*"fear" + 0.013*"elect"
Score: 0.016667598858475685	 Topic: 0.030*"boost" + 0.021*"test" + 0.019*"servic" + 0.015*"tsunami" + 0.015*"health"
Score: 0.016667373478412628	 Topic: 0.021*"fund" + 0.014*"indigen" + 0.012*"gold" + 0.012*"abus" + 0.011*"child"
Score: 0.016667338088154793	 Topic: 0.048*"kill" + 0.029*"attack" + 0.023*"rise" + 0.