In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
#non-negative matrix factorisation technique higher to lower dimensional vector

In [2]:
doc=["This news dataset is a persistent historical archive of noteable events in the Indian subcontinent from start-2001 to mid-2020, recorded in realtime by the journalists of India. It contains approximately 3.3 million events published by Times of India.",
     "A set of words that is complete in itself, typically containing a subject and predicate, conveying a statement, question, exclamation, or command, and consisting of a main clause and sometimes one or more subordinate clauses.",
    "What are the Augmentations that have worked for you?"]

In [3]:
# Creating sparse matrix
cv= CountVectorizer()
wordcountvector=cv.fit_transform(doc)

In [4]:
wordcountvector.shape

(3, 62)

In [5]:
wordcountvector.toarray()

array([[1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0,
        1, 2, 2, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 3, 0, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1]],
      dtype=int64)

In [6]:
cv.get_feature_names()

['2001',
 '2020',
 'and',
 'approximately',
 'archive',
 'are',
 'augmentations',
 'by',
 'clause',
 'clauses',
 'command',
 'complete',
 'consisting',
 'containing',
 'contains',
 'conveying',
 'dataset',
 'events',
 'exclamation',
 'for',
 'from',
 'have',
 'historical',
 'in',
 'india',
 'indian',
 'is',
 'it',
 'itself',
 'journalists',
 'main',
 'mid',
 'million',
 'more',
 'news',
 'noteable',
 'of',
 'one',
 'or',
 'persistent',
 'predicate',
 'published',
 'question',
 'realtime',
 'recorded',
 'set',
 'sometimes',
 'start',
 'statement',
 'subcontinent',
 'subject',
 'subordinate',
 'that',
 'the',
 'this',
 'times',
 'to',
 'typically',
 'what',
 'words',
 'worked',
 'you']

In [8]:
featurenames=cv.get_feature_names()

In [10]:
#turn frequency times inverse document frequency

tt= TfidfTransformer(smooth_idf=True,use_idf=True)
tf_idf_vector = tt.fit_transform(wordcountvector)

In [11]:
query= tf_idf_vector[1]

In [12]:
df=pd.DataFrame(query.T.todense(),index=featurenames,columns=['tfidf'])

In [13]:
df.sort_values(by=['tfidf'],ascending=False)

Unnamed: 0,tfidf
and,0.486352
or,0.324235
of,0.246589
more,0.162117
sometimes,0.162117
...,...
journalists,0.000000
2020,0.000000
million,0.000000
news,0.000000


In [14]:
df= pd.read_csv('india-news-headlines.csv', error_bad_lines= False)

In [15]:

df.head(30)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010101,sports.wwe,win over cena satisfying but defeating underta...
1,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
2,20010102,unknown,Fissures in Hurriyat over Pak visit
3,20010102,unknown,America's unwanted heading for India?
4,20010102,unknown,For bigwigs; it is destination Goa
5,20010102,unknown,Extra buses to clear tourist traffic
6,20010102,unknown,Dilute the power of transfers; says Riberio
7,20010102,unknown,Focus shifts to teaching of Hindi
8,20010102,unknown,IT will become compulsory in schools
9,20010102,unknown,Move to stop freedom fighters' pension flayed


In [27]:
datatext=df[['headline_text']].astype('str')
datatext.shape

(3297172, 1)

In [28]:
dt=datatext.loc[1:100000,:]

In [18]:
dt.shape

(100000, 1)

In [19]:
import nltk

In [20]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
sw= stopwords.words('english')
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
def sw_remove(x):
    t=x.split()
    t= [z for z in t if z not in sw ]
    sentence= ' '.join(t)
    return sentence

In [23]:
dt['Refinedheadlines']=dt['headline_text'].apply(lambda x:sw_remove(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
dt.head()

Unnamed: 0,headline_text,Refinedheadlines
1,Status quo will not be disturbed at Ayodhya; s...,Status quo disturbed Ayodhya; says Vajpayee
2,Fissures in Hurriyat over Pak visit,Fissures Hurriyat Pak visit
3,America's unwanted heading for India?,America's unwanted heading India?
4,For bigwigs; it is destination Goa,For bigwigs; destination Goa
5,Extra buses to clear tourist traffic,Extra buses clear tourist traffic


In [31]:
def wordcount(y):
    t=y.split()
    return len(t)

In [32]:
dt['word_count']=dt['Refinedheadlines'].apply(lambda y:wordcount(y))

KeyError: 'Refinedheadlines'

In [33]:
dt.head()

Unnamed: 0,headline_text
1,Status quo will not be disturbed at Ayodhya; s...
2,Fissures in Hurriyat over Pak visit
3,America's unwanted heading for India?
4,For bigwigs; it is destination Goa
5,Extra buses to clear tourist traffic


In [34]:
dt['word_count'].describe()

KeyError: 'word_count'

In [None]:
fig=plt.figure(figsize=(12,6))
plt.hist(dt['word_count'],bins=16,color='#60505C')
plt.title("Distribution of Article word count",fontsize=20)
plt.xlabel('word count',fontsize=15)
plt.ylabel('frequency',fontsize=15)
plt.show()

In [None]:

import seaborn as sns
sns.set_style('darkgrid')
fig = plt.figure(figsize=(5,10))
sns.boxplot(
        dt['word_count'],
        orient='v',
        width=0.6,
        color='#ff8080'
)

plt.ylabel("Word Count",fontsize=15)
plt.title("Distribution of Article word count",fontsize=16)

plt.show()

In [29]:
headline_sentences = [''.join(text) for text in dt['Refinedheadlines']]

KeyError: 'Refinedheadlines'

In [None]:

vectorizer = CountVectorizer(analyzer='word',max_features=5000)
x_counts = vectorizer.fit_transform(headline_sentences)

In [None]:
x_counts.toarray().shape

In [None]:
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)

In [None]:
x_tfidf

In [None]:
num_topics = 20
model = NMF(n_components = num_topics, init='nndsvd')
model.fit(x_tfidf)

In [37]:
def get_nmf_topics(model, n_top_words):
    
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        words_ids = model.components_[i].argsort()[:-n_top_words-1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic #'+'{:02d}'.format(i+1)] = words
        
    return pd.DataFrame(word_dict)

In [36]:
get_nmf_topics(model,50)

NameError: name 'get_nmf_topics' is not defined

In [35]:
model.components_

NameError: name 'model' is not defined