In [73]:
import pandas as pd
import gensim
import re
import pyLDAvis.sklearn
import pyLDAvis.gensim
from gensim.models import LdaModel , LdaMulticore,LsiModel
from gensim import corpora
from gensim.utils import simple_preprocess, lemmatize
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [74]:
# NLTK's English stop words, extended with extra tokens that are also
# filtered out of the headlines during preprocessing.
stop_words = [
    *stopwords.words('english'),
    'new', 'man', 'nsw', 'act', 'big', 'back', 'hope',
    'call', 'vic', 'low', 'high', 'out', 'orkopolouse',
]

In [75]:
# Load the headlines CSV, skipping malformed rows (with a warning) instead
# of aborting the whole read.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`; try the current keyword first and fall back to the
# legacy one so this cell runs on both old and new pandas versions.
try:
    data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='warn')
except TypeError:
    # Older pandas does not know `on_bad_lines`.
    data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [76]:
# Preview the raw headline column.
data.loc[:, 'headline_text']

0          aba decides against community broadcasting lic...
1             act fire witnesses must be aware of defamation
2             a g calls for infrastructure protection summit
3                   air nz staff in aust strike for pay rise
4              air nz strike to affect australian travellers
5                          ambitious olsson wins triple jump
6                 antic delighted with record breaking barca
7          aussie qualifier stosur wastes four memphis match
8               aust addresses un security council over iraq
9                 australia is locked into war timetable opp
10         australia to contribute 10 million in aid to iraq
11         barca take record as robson celebrates birthda...
12                                bathhouse plans move ahead
13             big hopes for launceston cycling championship
14                    big plan to boost paroo water supplies
15                    blizzard buries united states in bills
16            brigadier 

In [77]:
# Keep only the headline column. Take an explicit copy: assigning a new
# column to a slice of `data` triggers SettingWithCopyWarning and leaves it
# ambiguous whether `data` itself is modified.
data_text = data[['headline_text']].copy()
# Expose the row index as a regular column.
data_text['index'] = data_text.index
# `documents` is an alias for the same frame (not a separate copy).
documents = data_text

In [78]:
# Show the first 100 headlines.
data_text['headline_text'].head(100)

0     aba decides against community broadcasting lic...
1        act fire witnesses must be aware of defamation
2        a g calls for infrastructure protection summit
3              air nz staff in aust strike for pay rise
4         air nz strike to affect australian travellers
5                     ambitious olsson wins triple jump
6            antic delighted with record breaking barca
7     aussie qualifier stosur wastes four memphis match
8          aust addresses un security council over iraq
9            australia is locked into war timetable opp
10    australia to contribute 10 million in aid to iraq
11    barca take record as robson celebrates birthda...
12                           bathhouse plans move ahead
13        big hopes for launceston cycling championship
14               big plan to boost paroo water supplies
15               blizzard buries united states in bills
16       brigadier dismisses reports troops harassed in
17       british combat troops arriving daily in

In [79]:
# Turn each headline into a list of lemmas: split on whitespace, drop stop
# words, and keep only tokens for which `lemmatize` returns a result under
# the 'NN' tag filter (presumably noun tags -- confirm against the tagger).
MAX_DOCS = 800000  # upper bound on the number of headlines to process

noun_tags = re.compile('(NN)')    # compiled once rather than once per token
stop_word_set = set(stop_words)   # constant-time membership tests

data_processed = []
# head() is positional and simply yields fewer rows when the frame holds
# fewer than MAX_DOCS headlines, where a fixed range() with label lookups
# would raise KeyError.
for headline in data_text['headline_text'].head(MAX_DOCS):
    doc_out = []
    for token in headline.split():
        if token in stop_word_set:
            continue
        lemma = lemmatize(token, allowed_tags=noun_tags)
        if lemma:
            # Results are handled as bytes of the form b'lemma/TAG';
            # keep the lemma part of the first result as text.
            doc_out.append(lemma[0].split(b'/')[0].decode('utf-8'))
    # Append even when empty so data_processed stays aligned with the rows.
    data_processed.append(doc_out)


In [80]:
# Build the token -> id vocabulary from the processed headlines.
dct = corpora.Dictionary(data_processed)
# Encode every processed headline with the vocabulary via doc2bow.
corpus = list(map(dct.doc2bow, data_processed))

In [81]:
# Train a 7-topic LDA model on the encoded corpus. random_state is fixed so
# repeated runs start from the same seed.
lda_model = LdaModel(
    # data
    corpus=corpus,
    id2word=dct,
    # model size and priors
    num_topics=7,
    alpha='asymmetric',
    eta=None,
    # training schedule
    random_state=100,
    passes=10,
    chunksize=1000,
    iterations=100,
    decay=0.5,
    offset=64,
    gamma_threshold=0.001,
    eval_every=0,
    # extra output
    per_word_topics=True,
)


In [82]:
# List every LDA topic (-1 requests all of them).
lda_model.print_topics(num_topics=-1)

[(0,
  '0.025*"interview" + 0.010*"report" + 0.010*"business" + 0.009*"cut" + 0.009*"budget" + 0.009*"polouse" + 0.009*"crash" + 0.008*"weather" + 0.008*"day" + 0.008*"funding"'),
 (1,
  '0.026*"market" + 0.020*"farmer" + 0.018*"water" + 0.015*"plan" + 0.015*"change" + 0.014*"council" + 0.013*"fear" + 0.011*"industry" + 0.011*"gillard" + 0.011*"share"'),
 (2,
  '0.031*"polouse" + 0.020*"fire" + 0.020*"hit" + 0.019*"flood" + 0.018*"price" + 0.017*"attack" + 0.014*"sport" + 0.013*"house" + 0.011*"probe" + 0.011*"west"'),
 (3,
  '0.037*"court" + 0.029*"woman" + 0.028*"death" + 0.022*"murder" + 0.020*"child" + 0.017*"rate" + 0.017*"trial" + 0.016*"case" + 0.016*"hill" + 0.013*"sex"'),
 (4,
  '0.031*"news" + 0.026*"health" + 0.025*"minister" + 0.024*"sydney" + 0.023*"drug" + 0.018*"labor" + 0.015*"fight" + 0.014*"top" + 0.012*"end" + 0.012*"housing"'),
 (5,
 (6,
  '0.032*"government" + 0.024*"set" + 0.016*"hunter" + 0.015*"grower" + 0.015*"world" + 0.015*"food" + 0.014*"abuse" + 0.013*"reco

In [None]:
# Render one word cloud per LDA topic, built from up to 200 of the topic's
# top words and their weights.
for topic_id in range(lda_model.num_topics):
    word_weights = dict(lda_model.show_topic(topic_id, 200))
    topic_cloud = WordCloud().fit_words(word_weights)

    plt.figure()
    plt.imshow(topic_cloud)
    plt.axis("off")
    plt.title("Topic #" + str(topic_id))
    plt.show()

In [83]:
# Fit a 7-topic LSI model on the same corpus and vocabulary as the LDA model.
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dct,
    num_topics=7,
    decay=0.5,
)

In [84]:
# List every LSI topic (-1 requests all of them).
lsi_model.print_topics(num_topics=-1)

[(0,
  '-0.959*"polouse" + -0.094*"interview" + -0.075*"death" + -0.068*"car" + -0.066*"woman" + -0.059*"crash" + -0.057*"officer" + -0.055*"search" + -0.052*"fire" + -0.050*"court"'),
 (1,
  '-0.989*"interview" + 0.092*"polouse" + -0.043*"michael" + -0.028*"belinda" + -0.028*"varischettus" + -0.024*"john" + -0.022*"scott" + -0.020*"ben" + -0.019*"matthew" + -0.017*"anthony"'),
 (2,
  '0.603*"market" + 0.445*"business" + 0.314*"news" + 0.272*"council" + 0.246*"analysis" + 0.212*"plan" + 0.150*"budget" + 0.131*"qld" + 0.118*"share" + 0.086*"cut"'),
 (3,
  '0.535*"council" + 0.437*"plan" + -0.357*"market" + -0.232*"business" + 0.231*"qld" + 0.209*"budget" + -0.190*"news" + -0.145*"analysis" + 0.126*"cut" + 0.110*"government"'),
 (4,
  '-0.575*"budget" + -0.557*"qld" + 0.461*"council" + 0.237*"plan" + -0.149*"government" + -0.128*"cut" + -0.070*"funding" + -0.057*"state" + -0.055*"health" + 0.044*"market"'),
 (5,
  '-0.663*"budget" + 0.514*"qld" + -0.312*"council" + 0.300*"plan" + 0.128*"

In [None]:
# Draw one word cloud per LSI topic and save each as topic_<n>.png.
for t in range(lsi_model.num_topics):
    # LSI topic weights are signed, but word-cloud frequencies are expected
    # to be non-negative: feeding raw weights mis-sizes or drops words with
    # negative loadings. Size words by the magnitude of their loading.
    weights = {word: abs(weight) for word, weight in lsi_model.show_topic(t, 200)}

    cloud = WordCloud(width=400,height=300,background_color='White')
    cloud.fit_words(weights)

    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(cloud,interpolation="bilinear")
    plt.axis("off")
    # The figure background is black, so draw the title in white to keep
    # it legible.
    plt.title("Topic --> " + str(t), color='w')
    # to_file writes the cloud image itself (not the matplotlib figure).
    cloud.to_file("topic_%s.png" %str(t))
    plt.show()