In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(100)
import nltk
nltk.download('wordnet')
pip install pyLDAvis
import pyLDAvis.gensim

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Load the data

In [3]:
df= pd.read_csv('/content/drive/My Drive/Projects/abcnews/abcnews-date-text.csv',error_bad_lines=False)

In [4]:
###First few rows 
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
##Total length of the data
print(len(df))

1186018


# **Data Preprocessing**

In [6]:
##check how well the stemming works
original_words= ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
stemmer= SnowballStemmer('english')
singles= [stemmer.stem(plural) for plural in original_words]
#print(singles)
pd.DataFrame(data={'original word':original_words, 'stemmed':singles })

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


# Implementing Lemmatizer and Trying Example

In [7]:
lem=[stemmer.stem(WordNetLemmatizer().lemmatize(plural)) for plural in original_words]

In [8]:
def lemmi(text):
  return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))


def preprocess(text):
  result=[]
  for token in gensim.utils.simple_preprocess(text):
    if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
      result.append(lemmi(token))
  return result

In [9]:
doc= df[df.index == 1110].values[0][1]

words= []
for word in doc.split(' '):
  words.append(word)
print('Original Document:- {}'.format(words))


print('\n\n Tokenized and Lemmi sample:- {}'.format(preprocess(doc)))

Original Document:- ['police', 'concerned', 'over', 'acid', 'theft']


 Tokenized and Lemmi sample:- ['polic', 'concern', 'acid', 'theft']


In [10]:
processed_docs = df['headline_text'].map(preprocess)

In [11]:
##checking the preprocessed data
processed_docs[:20]

0              [decid, communiti, broadcast, licenc]
1                                 [wit, awar, defam]
2             [call, infrastructur, protect, summit]
3                        [staff, aust, strike, rise]
4               [strike, affect, australian, travel]
5                 [ambiti, olsson, win, tripl, jump]
6             [antic, delight, record, break, barca]
7      [aussi, qualifi, stosur, wast, memphi, match]
8              [aust, address, secur, council, iraq]
9                           [australia, lock, timet]
10             [australia, contribut, million, iraq]
11         [barca, record, robson, celebr, birthday]
12                           [bathhous, plan, ahead]
13             [hop, launceston, cycl, championship]
14               [plan, boost, paroo, water, suppli]
15               [blizzard, buri, unit, state, bill]
16         [brigadi, dismiss, report, troop, harass]
17    [british, combat, troop, arriv, daili, kuwait]
18             [bryant, lead, laker, doubl, ov

In [12]:
dic=gensim.corpora.Dictionary(processed_docs)

In [13]:
count= 0
for k,v in dic.iteritems():
  print(k,v)
  count=+1
  if count > 10:
    break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
62119 strugglin
62120 dallym
62121 jaun
62122 reasess
62123 nascondino
62124 ailsa
62125 neccessari
62126 pun
62127 grannd
62128 anastasio
62129 dimtsi
62130 powi
62131 ozpod
62132 craigburn
62133 elzein
62134 fatma
62135 halev
62136 keshira
62137 lprakash
62138 vermouth
62139 sateki
62140 antoin
62141 tamestit
62142 marki
62143 transloc
62144 elctric
62145 bajo
62146 wildner
62147 erat
62148 melbournecbd
62149 priestess
62150 kurtsystem
62151 simspon
62152 kusher
62153 brute
62154 vincec
62155 aldean
62156 gtassal
62157 keeter
62158 walkom
62159 gioventu
62160 bibek
62161 guragain
62162 camphor
62163 poochibald
62164 viequ
62165 rufino
62166 danley
62167 marilou
62168 middleback
62169 polosak
62170 dpti
62171 extractor
62172 outhous
62173 pooseum
62174 bodysurf
62175 cillier
62176 ishiguro
62177 kazuo
62178 smaka
62179 macumba
62180 sall
62181 yilmaz
62182 andado
62183 womsat
62184 saffioti
62185 crossthwait
62186 hyperm

In [14]:
# TODO: apply dictionary.filter_extremes() with the parameters mentioned above
dic.filter_extremes(no_below=10, no_above=0.1)

In [15]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dic.doc2bow(doc) for doc in processed_docs]

In [16]:
bow_corpus[:20]

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1)],
 [(18, 1), (19, 1), (20, 1), (21, 1)],
 [(22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)],
 [(11, 1), (33, 1), (34, 1), (35, 1), (36, 1)],
 [(37, 1), (38, 1), (39, 1)],
 [(35, 1), (37, 1), (40, 1), (41, 1)],
 [(23, 1), (26, 1), (42, 1), (43, 1), (44, 1)],
 [(45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1)],
 [(47, 1), (52, 1), (53, 1), (54, 1), (55, 1)],
 [(56, 1), (57, 1), (58, 1), (59, 1), (60, 1)],
 [(61, 1), (62, 1), (63, 1), (64, 1), (65, 1)],
 [(65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)],
 [(71, 1), (72, 1), (73, 1), (74, 1), (75, 1)],
 [(76, 1), (77, 1), (78, 1), (79, 1)]]

## **Tf-IDF on our corpus** 

In [17]:
'''
Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
'''
from gensim import corpora, models
#tfidf = # TODO
tfidf = models.TfidfModel(bow_corpus)

In [18]:
'''
Apply transformation to the entire corpus and call it 'corpus_tfidf'
'''
corpus_tfidf= tfidf[bow_corpus]

In [19]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]


## **LDA using Bag of Words**

In [20]:
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=10, 
                                       id2word = dic, 
                                       passes = 2, 
                                       workers=2)

In [21]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.023*"live" + 0.021*"protest" + 0.017*"north" + 0.015*"break" + 0.015*"guilti" + 0.014*"student" + 0.014*"talk" + 0.013*"polit" + 0.012*"turnbul" + 0.011*"say"


Topic: 1 
Words: 0.027*"world" + 0.020*"donald" + 0.015*"final" + 0.015*"open" + 0.014*"royal" + 0.012*"women" + 0.010*"beat" + 0.009*"adelaid" + 0.009*"scott" + 0.009*"forc"


Topic: 2 
Words: 0.016*"report" + 0.013*"hospit" + 0.012*"fund" + 0.012*"child" + 0.012*"say" + 0.010*"health" + 0.010*"servic" + 0.010*"drum" + 0.010*"releas" + 0.010*"minist"


Topic: 3 
Words: 0.058*"australia" + 0.022*"market" + 0.020*"home" + 0.017*"china" + 0.016*"tasmania" + 0.013*"countri" + 0.012*"year" + 0.012*"street" + 0.011*"deal" + 0.010*"die"


Topic: 4 
Words: 0.045*"trump" + 0.023*"crash" + 0.022*"death" + 0.017*"chang" + 0.017*"nation" + 0.016*"brisban" + 0.014*"investig" + 0.013*"interview" + 0.011*"concern" + 0.011*"climat"


Topic: 5 
Words: 0.016*"warn" + 0.015*"south" + 0.014*"water" + 0.014*"canberra" + 0.013*"r

# Running LDA using TD-IDF

In [22]:
'''
Define lda model using corpus_tfidf
'''
# TODO
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=5, 
                                             id2word = dic, 
                                             passes = 2, 
                                             workers=4)

In [23]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.009*"news" + 0.009*"market" + 0.007*"australia" + 0.007*"interview" + 0.006*"rural" + 0.006*"world" + 0.005*"australian" + 0.005*"final" + 0.004*"nation" + 0.004*"weather"


Topic: 1 Word: 0.011*"polic" + 0.011*"charg" + 0.009*"murder" + 0.008*"crash" + 0.008*"death" + 0.008*"woman" + 0.008*"court" + 0.006*"alleg" + 0.006*"jail" + 0.005*"investig"


Topic: 2 Word: 0.007*"govern" + 0.005*"donald" + 0.005*"chang" + 0.005*"health" + 0.005*"plan" + 0.004*"council" + 0.004*"fund" + 0.004*"say" + 0.004*"water" + 0.004*"feder"


Topic: 3 Word: 0.009*"countri" + 0.007*"hour" + 0.004*"say" + 0.004*"australia" + 0.004*"north" + 0.004*"wednesday" + 0.004*"kill" + 0.003*"care" + 0.003*"korea" + 0.003*"china"


Topic: 4 Word: 0.014*"trump" + 0.007*"drum" + 0.005*"coast" + 0.005*"gold" + 0.004*"wall" + 0.004*"street" + 0.004*"drive" + 0.003*"australia" + 0.003*"elect" + 0.003*"novemb"




In [24]:
processed_docs[101]

['high', 'educ', 'live']

In [25]:
document_num=101
# Our test document is document number 4310
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.2750000059604645	 
Topic: 0.023*"live" + 0.021*"protest" + 0.017*"north" + 0.015*"break" + 0.015*"guilti" + 0.014*"student" + 0.014*"talk" + 0.013*"polit" + 0.012*"turnbul" + 0.011*"say"

Score: 0.2750000059604645	 
Topic: 0.045*"trump" + 0.023*"crash" + 0.022*"death" + 0.017*"chang" + 0.017*"nation" + 0.016*"brisban" + 0.014*"investig" + 0.013*"interview" + 0.011*"concern" + 0.011*"climat"

Score: 0.2750000059604645	 
Topic: 0.016*"warn" + 0.015*"south" + 0.014*"water" + 0.014*"canberra" + 0.013*"rural" + 0.012*"bushfir" + 0.012*"rise" + 0.012*"high" + 0.010*"west" + 0.009*"stori"

Score: 0.02500000037252903	 
Topic: 0.027*"world" + 0.020*"donald" + 0.015*"final" + 0.015*"open" + 0.014*"royal" + 0.012*"women" + 0.010*"beat" + 0.009*"adelaid" + 0.009*"scott" + 0.009*"forc"

Score: 0.02500000037252903	 
Topic: 0.016*"report" + 0.013*"hospit" + 0.012*"fund" + 0.012*"child" + 0.012*"say" + 0.010*"health" + 0.010*"servic" + 0.010*"drum" + 0.010*"releas" + 0.010*"minist"

Score: 0

# Testing model on unseen document

In [36]:
unseen_document = "My favorite sports activities are running and swimming."
bow_vector = dic.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.2199999988079071	 Topic: 0.016*"report" + 0.013*"hospit" + 0.012*"fund" + 0.012*"child" + 0.012*"say"
Score: 0.2199999988079071	 Topic: 0.016*"warn" + 0.015*"south" + 0.014*"water" + 0.014*"canberra" + 0.013*"rural"
Score: 0.2199999988079071	 Topic: 0.053*"australian" + 0.021*"famili" + 0.018*"accus" + 0.014*"claim" + 0.014*"sentenc"
Score: 0.2199999839067459	 Topic: 0.019*"shoot" + 0.015*"island" + 0.015*"tasmanian" + 0.014*"coast" + 0.012*"gold"
Score: 0.019999999552965164	 Topic: 0.023*"live" + 0.021*"protest" + 0.017*"north" + 0.015*"break" + 0.015*"guilti"
Score: 0.019999999552965164	 Topic: 0.027*"world" + 0.020*"donald" + 0.015*"final" + 0.015*"open" + 0.014*"royal"
Score: 0.019999999552965164	 Topic: 0.058*"australia" + 0.022*"market" + 0.020*"home" + 0.017*"china" + 0.016*"tasmania"
Score: 0.019999999552965164	 Topic: 0.045*"trump" + 0.023*"crash" + 0.022*"death" + 0.017*"chang" + 0.017*"nation"
Score: 0.019999999552965164	 Topic: 0.043*"polic" + 0.027*"sydney" + 0.02

# Visualizing First 5 results of EDA

In [31]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.gensim.prepare(lda_model_tfidf,corpus_tfidf,dic, mds='tsne')
panel