In [None]:
#Dependencies
import pandas as pd
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [4]:
df = pd.read_pickle('news_topic_modelling_0.pkl')
print(df.shape)
df

(10, 2)


Unnamed: 0,topic,text
0,corona,\n E-commerce companies can deliver essential ...
1,corona,\nGoa Chief Minister Pramod Sawant on Sunday s...
2,corona,\n Tamil Nadu has extended the coronavirus loc...
3,corona,\nSports complexes and stadia were on Sunday p...
4,corona,"\nMaharashtra on Sunday reported 2,347 coronav..."
5,iphone,\nThe news comes from Front Page Tech’s Jon Pr...
6,iphone,"\nYou can count on death, taxes, and a steady ..."
7,iphone,\nApple had been hoping to move past its recen...
8,iphone,\nThe iPhone 12 is supposed to deliver a small...
9,iphone,"\nThanks to a flurry of exciting leaks, Apple’..."


In [5]:
#clean the data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [6]:
df['text_clean']=df['text'].apply(clean)

In [7]:

#create dictionary
dictionary = corpora.Dictionary(df['text_clean'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

2130


In [8]:

#create document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['text_clean'] ]
print(len(doc_term_matrix))

10


In [9]:
lda = gensim.models.ldamodel.LdaModel

In [19]:

num_topics=6
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

Wall time: 680 ms


In [20]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.023*"zone" + 0.014*"allowed" + 0.014*"lockdown" + 0.010*"containment" + 0.010*"essential" + 0.007*"activity" + 0.007*"allow" + 0.007*"centre" + 0.007*"deliver" + 0.007*"phase"'),
 (1,
  '0.023*"said" + 0.015*"state" + 0.013*"covid19" + 0.013*"train" + 0.010*"passenger" + 0.010*"positive" + 0.010*"express" + 0.008*"patient" + 0.008*"people" + 0.008*"take"'),
 (2,
  '0.057*"iphone" + 0.028*"apple" + 0.017*"12" + 0.012*"new" + 0.010*"would" + 0.009*"prosser" + 0.009*"charging" + 0.009*"portless" + 0.008*"11" + 0.008*"model"'),
 (3,
  '0.019*"ipad" + 0.014*"tap" + 0.012*"app" + 0.011*"apple" + 0.011*"click" + 0.010*"use" + 0.010*"device" + 0.009*"screen" + 0.009*"free" + 0.008*"like"'),
 (4,
  '0.014*"io" + 0.013*"security" + 0.011*"user" + 0.010*"case" + 0.010*"exploit" + 0.010*"state" + 0.008*"new" + 0.007*"apple" + 0.007*"lockdown" + 0.006*"reported"'),
 (5,
  '0.014*"sport" + 0.009*"resumption" + 0.007*"sunday" + 0.007*"india" + 0.007*"athlete" + 0.007*"camp" + 0.007*"least" 

In [21]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [14]:
lda_corpus = ldamodel[doc_term_matrix]

In [15]:

[doc for doc in lda_corpus]

[[(0, 0.0018115373), (1, 0.9963859), (2, 0.001802596)],
 [(0, 0.0021795002), (1, 0.9956781), (2, 0.0021424019)],
 [(0, 0.0011576689), (1, 0.9976922), (2, 0.0011501585)],
 [(0, 0.7642158), (1, 0.23400915), (2, 0.0017750324)],
 [(0, 0.002390022), (1, 0.9952377), (2, 0.0023722842)],
 [(0, 0.9979932), (1, 0.0009908855), (2, 0.0010159459)],
 [(0, 0.000509617), (1, 0.00049094344), (2, 0.9989994)],
 [(0, 0.0008239481), (1, 0.0008222271), (2, 0.99835384)],
 [(0, 0.9977142), (1, 0.0011179815), (2, 0.0011677437)],
 [(0, 0.99860674), (1, 0.00069160707), (2, 0.00070163823)]]

In [16]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)


0.33333333491464145


In [17]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
# cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
# cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

print(len(cluster1))
print(len(cluster2))
print(len(cluster3))
# print(len(cluster4))
# print(len(cluster5))

4
4
2
