In [32]:
#required libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from nltk.corpus import stopwords

In [33]:
headlines=pd.read_csv('india-news-headlines.csv')
display(headlines)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010101,sports.wwe,win over cena satisfying but defeating underta...
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
2,20010102,unknown,Fissures in Hurriyat over Pak visit
3,20010102,unknown,America's unwanted heading for India?
4,20010102,unknown,For bigwigs; it is destination Goa
...,...,...,...
3297167,20200630,gadgets-news,why tiktok removed 1 65 crore videos in india
3297168,20200630,entertainment.hindi.bollywood,apurva asrani calls alia bhatts mother soni ra...
3297169,20200630,entertainment.hindi.bollywood,kangana ranaut gets a doll version of herself ...
3297170,20200630,entertainment.hindi.bollywood,meezaan jaffrey reminisces his childhood days ...


In [34]:
headlines=headlines.drop(['publish_date'],axis=1)
headlines=headlines[:100000]

In [35]:
headlines

Unnamed: 0,headline_category,headline_text
0,sports.wwe,win over cena satisfying but defeating underta...
1,unknown,Status quo will not be disturbed at Ayodhya; s...
2,unknown,Fissures in Hurriyat over Pak visit
3,unknown,America's unwanted heading for India?
4,unknown,For bigwigs; it is destination Goa
...,...,...
99995,unknown,UN mission begins Africa visit
99996,india,BJP Gujarat unit meeting to discuss strategy
99997,business.india-business,Forex reserves cross $55 billion
99998,india,Two more killed in Vadodara


In [36]:
#we need to remove stop words in headlines
stop_w=stopwords.words('english')
total_words=[]
def remove_stop_words(x):
    words=x.split()
    new=[w for w in words if w not in stop_w]
    total_words.append([w for w in words if w not in stop_w and total_words])
    sentence=' '.join(new)
    return sentence
headlines["new_headlines"]=headlines["headline_text"].apply(lambda x : remove_stop_words(x))

In [37]:
headlines["new_headlines"]

0        win cena satisfying defeating undertaker bigge...
1              Status quo disturbed Ayodhya; says Vajpayee
2                              Fissures Hurriyat Pak visit
3                        America's unwanted heading India?
4                             For bigwigs; destination Goa
                               ...                        
99995                       UN mission begins Africa visit
99996            BJP Gujarat unit meeting discuss strategy
99997                     Forex reserves cross $55 billion
99998                                  Two killed Vadodara
99999                  NCP vote NDA govt Lok Sabha: Sangma
Name: new_headlines, Length: 100000, dtype: object

In [38]:
total_words

[[],
 ['Status', 'quo', 'disturbed', 'Ayodhya;', 'says', 'Vajpayee'],
 ['Fissures', 'Hurriyat', 'Pak', 'visit'],
 ["America's", 'unwanted', 'heading', 'India?'],
 ['For', 'bigwigs;', 'destination', 'Goa'],
 ['Extra', 'buses', 'clear', 'tourist', 'traffic'],
 ['Dilute', 'power', 'transfers;', 'says', 'Riberio'],
 ['Focus', 'shifts', 'teaching', 'Hindi'],
 ['IT', 'become', 'compulsory', 'schools'],
 ['Move', 'stop', 'freedom', "fighters'", 'pension', 'flayed'],
 ['Gilani', 'claims', 'applied', 'passport', '2', 'years', 'ago'],
 ['India;', 'Pak', 'exchange', 'lists', 'N-plants'],
 ['Will', "Qureshi's", 'return', 'really', 'help', 'govt?'],
 ["PM's", 'tacit', 'message:', 'Put', 'Ram', 'tample', 'hold'],
 ['Text', 'Prime', "Minister's", 'article'],
 ['NCW', 'focus', 'violence', 'women'],
 ["BBC's", 'reality', 'TV', 'focus', 'AIIMS'],
 ['Jaitley', 'firm', 'legal', 'reforms'],
 ['Hoshangabad', 'farmers', 'enough', 'water'],
 ['BJP', 'jumps', 'rail', 'track', 'conversion', 'issue'],
 ["America

In [39]:
cv=CountVectorizer()
word_count_vector=cv.fit_transform(headlines['new_headlines'])

In [40]:
#this is word document co-occurence matrix
word_count_vector,word_count_vector.shape

(<100000x36612 sparse matrix of type '<class 'numpy.int64'>'
 	with 475117 stored elements in Compressed Sparse Row format>,
 (100000, 36612))

In [41]:
tfidf=TfidfTransformer(smooth_idf=False)
tf_idf_vector=tfidf.fit_transform(word_count_vector)

In [42]:
num_topics=20
model=NMF(n_components=num_topics,init='nndsvd')
model.fit(normalize(tf_idf_vector,norm='l1',axis=1))

NMF(init='nndsvd', n_components=20)

In [43]:
len(cv.get_feature_names())

36612

In [44]:
def top_words_topic(model,n_top_words):
    features=cv.get_feature_names()
    word_dict={}
    for i in range(num_topics):
        word_idx=model.components_[i].argsort()[:-n_top_words-1:-1]
        words=[features[key] for key in word_idx]
        word_dict['topic '+str(i)]=words
    return pd.DataFrame(word_dict)

In [45]:
words=top_words_topic(model,20)
words

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,topic 11,topic 12,topic 13,topic 14,topic 15,topic 16,topic 17,topic 18,topic 19
0,city,the,govt,killed,india,bjp,police,it,new,power,no,held,statescan,day,meet,life,us,cm,case,cong
1,round,counsellor,staff,militants,pak,up,delhi,time,year,supply,says,man,busted,security,today,get,pak,state,hc,polls
2,sports,week,state,two,china,polls,station,get,chief,tariff,time,two,seminar,one,pm,ends,indian,centre,murder,poll
3,scan,mall,may,road,terrorism,congress,custody,says,soon,hike,minister,murder,gang,celebrated,all,woman,may,says,plea,leader
4,briefs,art,set,kashmir,talks,poll,firing,back,look,cuts,please,rs,tv,celebrations,begins,term,pm,water,bail,demands
5,up,good,rs,encounter,musharraf,sp,chief,bangalore,policy,cut,yet,fake,listing,today,party,love,says,rs,court,leaders
6,old,man,schools,one,pakistan,pm,officers,family,york,people,change,three,voters,strike,women,sentenced,war,gujarat,accused,may
7,lights,way,act,mishap,air,gujarat,traffic,khan,set,water,water,one,navaratri,valentine,water,man,help,seeks,sc,chief
8,jan,sound,oppn,four,world,modi,alert,industry,old,situation,up,seized,brahmotsavam,teachers,tomorrow,bandh,indo,delhi,probe,mla
9,march,word,says,injured,ties,leader,arrest,love,get,reforms,takers,women,health,world,national,normal,terrorism,minister,arrested,back


In [46]:
uk=np.array(headlines["headline_category"])
unq_head=np.unique(uk)
unq_head

array(['afghan-children', 'ahmedabad-times', 'analysis',
       'anti-terror-law', 'astrology.horoscope', 'ayodhya-imbroglio',
       'bangalore-times', 'blogs', 'bombay-times', 'business',
       'business.india-business', 'business.international-business',
       'calcutta-times', 'citizens-grievances', 'city', 'city.ahmedabad',
       'city.bengaluru', 'city.chandigarh', 'city.chennai', 'city.delhi',
       'city.hyderabad', 'city.kolkata', 'city.lucknow', 'city.mumbai',
       'city.patna', 'city.pune', 'city.thiruvananthapuram', 'cricket',
       'delhi-times', 'edit-page', 'entertainment.english.hollywood',
       'entertainment.hindi.bollywood', 'food-facts', 'gadgets-news',
       'home.education', 'home.science', 'home.sunday-times',
       'home.sunday-times.all-that-matters',
       'home.sunday-times.deep-focus', 'hyderabad-times',
       'hyderabad.local-sports', 'india', 'interviews', 'jugular-vein',
       'life-style.events', 'lucknow-times', 'news', 'only-in-america',


In [50]:
unq_head=np.delete(unq_head,np.where(unq_head=="unknown"))
unq_head.shape

(64,)

In [49]:
cv=CountVectorizer()
head_count_vector=cv.fit_transform(unq_head)
model1=NMF(n_components=num_topics,init='nndsvd')
model1.fit(normalize(tfidf.fit_transform(head_count_vector),norm="l1",axis=1))

NMF(init='nndsvd', n_components=20)

In [51]:
topics=top_words_topic(model1,1)

In [52]:
topics

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,topic 11,topic 12,topic 13,topic 14,topic 15,topic 16,topic 17,topic 18,topic 19
0,city,business,times,news,india,world,cricket,recipes,removed,interviews,blogs,analysis,swaminomics,sports,home,hyderabad,delhi,pune,ahmedabad,afghan


In [54]:
words.columns=topics.iloc[0,:]

In [55]:
words

Unnamed: 0,city,business,times,news,india,world,cricket,recipes,removed,interviews,blogs,analysis,swaminomics,sports,home,hyderabad,delhi,pune,ahmedabad,afghan
0,city,the,govt,killed,india,bjp,police,it,new,power,no,held,statescan,day,meet,life,us,cm,case,cong
1,round,counsellor,staff,militants,pak,up,delhi,time,year,supply,says,man,busted,security,today,get,pak,state,hc,polls
2,sports,week,state,two,china,polls,station,get,chief,tariff,time,two,seminar,one,pm,ends,indian,centre,murder,poll
3,scan,mall,may,road,terrorism,congress,custody,says,soon,hike,minister,murder,gang,celebrated,all,woman,may,says,plea,leader
4,briefs,art,set,kashmir,talks,poll,firing,back,look,cuts,please,rs,tv,celebrations,begins,term,pm,water,bail,demands
5,up,good,rs,encounter,musharraf,sp,chief,bangalore,policy,cut,yet,fake,listing,today,party,love,says,rs,court,leaders
6,old,man,schools,one,pakistan,pm,officers,family,york,people,change,three,voters,strike,women,sentenced,war,gujarat,accused,may
7,lights,way,act,mishap,air,gujarat,traffic,khan,set,water,water,one,navaratri,valentine,water,man,help,seeks,sc,chief
8,jan,sound,oppn,four,world,modi,alert,industry,old,situation,up,seized,brahmotsavam,teachers,tomorrow,bandh,indo,delhi,probe,mla
9,march,word,says,injured,ties,leader,arrest,love,get,reforms,takers,women,health,world,national,normal,terrorism,minister,arrested,back
