### Topic Modelling on News Articles

In [1]:
# Import the required libraries
import gensim

from nltk.stem import WordNetLemmatizer, SnowballStemmer
# Shared English Snowball stemmer instance; lemmatize_stemming() uses it to stem tokens
stemmer = SnowballStemmer("english")

In [2]:
# Text pre-processing helper
def preprocess(text):
    """Turn one raw document into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are in gensim's ``STOPWORDS`` set, or that are three characters or shorter,
    are dropped. Every remaining token is passed through
    ``lemmatize_stemming`` and lower-cased.

    Note: ``lemmatize_stemming`` is defined in a later cell, so that cell must
    have been run before this function is called.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word).lower()
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [3]:
# Load the training documents: every file in the current directory whose
# name starts with 'news1' is read as UTF-8 text into docs_text.
import glob
docs_text=[]  #store text of all documents
# glob returns names in arbitrary order; sort so the document order (and
# therefore everything built from it) is the same on every run.
for files in sorted(glob.glob('*')): #all files in the present directory
    if files.startswith('news1'):
        # 'with' closes the handle even if read() raises
        with open(files,'r',encoding="utf-8") as handle:
            docs_text.append(handle.read())

In [4]:
# Display the loaded training documents (one string per file that was read)
docs_text

['  NEW DELHI: The industry on Monday said the steep 23.9 per cent contraction in the GDP in April-June was on expected lines reflecting the "stalling of economic activities" due to the lockdown imposed in response to coronavirus pandemic. The industry, however, said it anticipates the economy to stage a gradual recovery in the coming quarters on account of reforms, the Rs 20 lakh crore stimulus package and measures taken by the Reserve Bank. India\'s GDP shrank steeply by 23.9 per cent in the April-June period as the coronavirus lockdowns battered an already slowing economy. Agriculture was the only outlier as all other sectors, including manufacturing, construction and services, suffered steep declines. CII Director General Chandrajit Banerjee said the large contraction in the first quarter GDP print at 23.9 per cent was widely expected, and it reflects the wide-spread stalling of economic activities due to the stringent lockdown in response to the pandemic. "Even as the first half o

In [5]:
# Tokenise, lemmatise and stem every loaded training document

def lemmatize_stemming(text):
    """Lemmatise ``text`` as a verb with WordNet, then stem the lemma with the shared ``stemmer``."""
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

# One token list per document, in the same order as docs_text
processed_docs = [preprocess(raw_doc) for raw_doc in docs_text]
processed_docs

[['delhi',
  'industri',
  'monday',
  'say',
  'steep',
  'cent',
  'contract',
  'april',
  'june',
  'expect',
  'line',
  'reflect',
  'stall',
  'econom',
  'activ',
  'lockdown',
  'impos',
  'respons',
  'coronavirus',
  'pandem',
  'industri',
  'say',
  'anticip',
  'economi',
  'stage',
  'gradual',
  'recoveri',
  'come',
  'quarter',
  'account',
  'reform',
  'lakh',
  'crore',
  'stimulus',
  'packag',
  'measur',
  'take',
  'reserv',
  'bank',
  'india',
  'shrink',
  'steepli',
  'cent',
  'april',
  'june',
  'period',
  'coronavirus',
  'lockdown',
  'batter',
  'slow',
  'economi',
  'agricultur',
  'outlier',
  'sector',
  'includ',
  'manufactur',
  'construct',
  'servic',
  'suffer',
  'steep',
  'declin',
  'director',
  'general',
  'chandrajit',
  'banerje',
  'say',
  'larg',
  'contract',
  'quarter',
  'print',
  'cent',
  'wide',
  'expect',
  'reflect',
  'wide',
  'spread',
  'stall',
  'econom',
  'activ',
  'stringent',
  'lockdown',
  'respons',
  'p

In [6]:
# Build the gensim Dictionary (the vocabulary of unique tokens) from the processed documents
dictionary = gensim.corpora.Dictionary(processed_docs)

In [7]:
# Print a short summary of the dictionary
print (dictionary)

Dictionary(314 unique tokens: ['accord', 'account', 'activ', 'add', 'addit']...)


In [8]:
# Create the bag-of-words representation of each processed document with
# dictionary.doc2bow, i.e. for each document record which dictionary tokens
# appear and how many times. Save the result to 'bow_corpus'.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [9]:
# Display the saved bag-of-words corpus
bow_corpus

[[(0, 1),
  (1, 1),
  (2, 2),
  (3, 3),
  (4, 1),
  (5, 2),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 3),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 2),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 5),
  (37, 1),
  (38, 1),
  (39, 4),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 3),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 1),
  (50, 1),
  (51, 3),
  (52, 3),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 2),
  (58, 15),
  (59, 5),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 2),
  (64, 2),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 3),
  (74, 1),
  (75, 3),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 3),
  (83, 5),
  (84, 2),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 2),
  (89, 1),
  (90, 1),
  (91, 1

In [10]:
# Train the LDA topic model on the bag-of-words corpus.
# random_state is fixed so that repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=20,
    passes=4,
    update_every=1,
    random_state=100,
)

In [11]:
# Print each (topic id, weighted-words string) pair returned by lda_model.print_topics()
topic_template = "Topic: {} \nWords: {}"
for topic_id, topic_terms in lda_model.print_topics():
    print(topic_template.format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.008*"say" + 0.007*"cours" + 0.007*"student" + 0.006*"job" + 0.005*"success" + 0.005*"work" + 0.005*"industri" + 0.005*"remot" + 0.004*"neet" + 0.004*"quarter"


Topic: 1 
Words: 0.003*"practic" + 0.003*"posit" + 0.003*"profession" + 0.003*"print" + 0.003*"prevail" + 0.003*"presid" + 0.003*"presenc" + 0.003*"prepar" + 0.003*"programm" + 0.003*"post"


Topic: 2 
Words: 0.011*"say" + 0.008*"cours" + 0.007*"student" + 0.006*"job" + 0.006*"quarter" + 0.005*"success" + 0.005*"remot" + 0.005*"industri" + 0.005*"neet" + 0.005*"work"


Topic: 3 
Words: 0.011*"say" + 0.009*"cours" + 0.008*"student" + 0.007*"job" + 0.006*"quarter" + 0.006*"work" + 0.005*"growth" + 0.005*"success" + 0.005*"time" + 0.005*"remot"


Topic: 4 
Words: 0.009*"say" + 0.007*"student" + 0.006*"cours" + 0.005*"quarter" + 0.005*"success" + 0.005*"neet" + 0.005*"remot" + 0.005*"rapid" + 0.004*"industri" + 0.004*"job"


Topic: 5 
Words: 0.008*"cours" + 0.007*"say" + 0.007*"student" + 0.005*"job" + 0.005*"qua

In [12]:
# Load the test documents: every file in the current directory whose name
# starts with 'news2' is read as UTF-8 text into test_set.
test_set=[]  #store text of all documents
# glob returns names in arbitrary order; sort so the document order is the
# same on every run.
for files in sorted(glob.glob('*')): #all files in the present directory
    if files.startswith('news2'):
        # Decode explicitly as UTF-8 (as the 'news1' loader does) instead of
        # relying on the platform's default encoding; 'with' closes the handle
        # even if read() raises.
        # NOTE(review): assumes the 'news2' files were saved as UTF-8 like the
        # 'news1' files -- confirm.
        with open(files,'r',encoding="utf-8") as handle:
            test_set.append(handle.read())

In [13]:
# Display the loaded test documents (one string per file that was read)
test_set

[' On Monday, the government released economic data for the first quarter of the current fiscal year revealing a historic contraction of 23.9 per cent over the three months of April, May and June.\xa0 The COVID-19 outbreak, the subsequent lockdowns enforced by the central government and state governments, and the paralysis in business activity that they triggered, have led to the largest quarterly slump in India\'s GDP figures since such figures were first published (1996). India\'s GDP during the fourth quarter of the last fiscal stood at 3.1 per cent, with overall growth during the fiscal pegged at 5.2 per cent.\xa0 A contraction in India\'s GDP was largely expected by economists, however the majority had predicted a shrinkage no greater than 20 per cent. Moreover, the latest figures have also sparked suggestions that further downward revisions could be on the cards particularly because, given the lack of business activity that took place during the first quarter, there are concerns 

In [14]:
# Run the test documents through preprocess(), giving one token list per document
processed_test = [preprocess(raw_doc) for raw_doc in test_set]
processed_test

[['monday',
  'govern',
  'releas',
  'econom',
  'data',
  'quarter',
  'current',
  'fiscal',
  'year',
  'reveal',
  'histor',
  'contract',
  'cent',
  'month',
  'april',
  'june',
  'covid',
  'outbreak',
  'subsequ',
  'lockdown',
  'enforc',
  'central',
  'govern',
  'state',
  'govern',
  'paralysi',
  'busi',
  'activ',
  'trigger',
  'largest',
  'quarter',
  'slump',
  'india',
  'figur',
  'figur',
  'publish',
  'india',
  'fourth',
  'quarter',
  'fiscal',
  'stand',
  'cent',
  'overal',
  'growth',
  'fiscal',
  'peg',
  'cent',
  'contract',
  'india',
  'larg',
  'expect',
  'economist',
  'major',
  'predict',
  'shrinkag',
  'greater',
  'cent',
  'latest',
  'figur',
  'spark',
  'suggest',
  'downward',
  'revis',
  'card',
  'particular',
  'give',
  'lack',
  'busi',
  'activ',
  'take',
  'place',
  'quarter',
  'concern',
  'data',
  'collect',
  'robust',
  'typic',
  'contract',
  'cent',
  'simpli',
  'mean',
  'total',
  'valu',
  'good',
  'servic',
  '

In [19]:
# Print each (topic id, weighted-words string) pair returned by lda_model.print_topics().
# NOTE(review): this repeats the topic printout made right after training and does
# not use processed_test -- confirm whether scoring the test documents was intended here.
topic_template = "Topic: {} \nWords: {}"
for topic_id, topic_terms in lda_model.print_topics():
    print(topic_template.format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.008*"say" + 0.007*"cours" + 0.007*"student" + 0.006*"job" + 0.005*"success" + 0.005*"work" + 0.005*"industri" + 0.005*"remot" + 0.004*"neet" + 0.004*"quarter"


Topic: 1 
Words: 0.003*"practic" + 0.003*"posit" + 0.003*"profession" + 0.003*"print" + 0.003*"prevail" + 0.003*"presid" + 0.003*"presenc" + 0.003*"prepar" + 0.003*"programm" + 0.003*"post"


Topic: 2 
Words: 0.011*"say" + 0.008*"cours" + 0.007*"student" + 0.006*"job" + 0.006*"quarter" + 0.005*"success" + 0.005*"remot" + 0.005*"industri" + 0.005*"neet" + 0.005*"work"


Topic: 3 
Words: 0.011*"say" + 0.009*"cours" + 0.008*"student" + 0.007*"job" + 0.006*"quarter" + 0.006*"work" + 0.005*"growth" + 0.005*"success" + 0.005*"time" + 0.005*"remot"


Topic: 4 
Words: 0.009*"say" + 0.007*"student" + 0.006*"cours" + 0.005*"quarter" + 0.005*"success" + 0.005*"neet" + 0.005*"remot" + 0.005*"rapid" + 0.004*"industri" + 0.004*"job"


Topic: 5 
Words: 0.008*"cours" + 0.007*"say" + 0.007*"student" + 0.005*"job" + 0.005*"qua