In [13]:
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Load the NPR articles CSV into a DataFrame. The path is relative, so it
# resolves against the kernel's current working directory.
df = pd.read_csv('../DATA/npr.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(11992, 1)

In [7]:
# Build the TF-IDF document-term matrix for the article text.
#   max_df=0.95  -> drop terms that occur in more than 95% of the documents
#   min_df=2     -> drop terms that occur in fewer than 2 documents
#   stop_words   -> remove scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = tfidf.fit_transform(df['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
# Shape of the document-term matrix as (rows, columns).
dtm.shape

(11992, 54777)

In [8]:
# Factorise the document-term matrix into 10 components. The fixed
# random_state keeps the factorisation repeatable between runs.
nmf = NMF(
    random_state=101,
    n_components=10,
)
nmf.fit(dtm)



NMF(n_components=10, random_state=101)

In [11]:
# Spot check: the vocabulary term stored at index 1100.
tfidf.get_feature_names_out()[1100]

'859'

In [12]:
# Spot check: the vocabulary term stored at index 2837.
tfidf.get_feature_names_out()[2837]

'anaphylactic'

In [15]:
# Show one randomly chosen vocabulary term as a sanity check.
# random.randint includes BOTH endpoints, so the previous upper bound of
# 54777 (the vocabulary size) could produce an index one past the end and
# raise IndexError. randrange(len(...)) excludes the upper bound and also
# removes the hard-coded vocabulary size.
feature_names = tfidf.get_feature_names_out()
feature_names[random.randrange(len(feature_names))]

'racing'

In [16]:
# Inspect the fitted component matrix: one row per NMF component, with one
# weight per vocabulary term in each row.
nmf.components_

array([[0.0007599 , 0.26691419, 0.        , ..., 0.00266853, 0.0003554 ,
        0.        ],
       [0.        , 0.00066855, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.09804495, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03092522, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0043057 , 0.04828948, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00444242, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [17]:
# Shape of the component matrix as (rows, columns).
nmf.components_.shape

(10, 54777)

In [18]:
# Take the first row of the component matrix, i.e. the term weights of
# topic #0, and display it.
first_topic = nmf.components_[0]
first_topic

array([0.0007599 , 0.26691419, 0.        , ..., 0.00266853, 0.0003554 ,
       0.        ])

In [21]:
# Indices that would sort the weights in ascending order, so the indices of
# the highest-weighted terms are at the end of the array.
first_topic.argsort()

array([27388, 27031, 27030, ..., 19307, 36283, 42993], dtype=int64)

In [23]:
# Print the ten highest-weighted terms of the first topic, weakest first
# (argsort is ascending, so the last ten indices carry the largest weights).
# The vocabulary array is fetched once up front: get_feature_names_out()
# rebuilds the whole array on each call, so calling it per iteration was
# needless repeated work.
feature_names = tfidf.get_feature_names_out()
for idx in first_topic.argsort()[-10:]:
    print(feature_names[idx])

just
company
study
new
percent
like
water
food
people
says


In [26]:
# For every NMF component, print its 15 highest-weighted terms (ascending by
# weight, so the strongest term is last in each list).
# The vocabulary array is looked up once instead of once per printed word;
# the previous version called get_feature_names_out() inside the
# comprehension, rebuilding the full vocabulary 15 times per topic.
feature_names = tfidf.get_feature_names_out()
for idx, topic in enumerate(nmf.components_):
    print(f"Top 15 words for topic #{idx}: ")
    print([feature_names[i] for i in topic.argsort()[-15:]])
    # print('\n') emits two line breaks, leaving a blank gap between topics.
    print('\n')

Top 15 words for topic #0: 
['year', 'university', 'workers', '000', 'years', 'just', 'company', 'study', 'new', 'percent', 'like', 'water', 'food', 'people', 'says']


Top 15 words for topic #1: 
['administration', 'cruz', 'election', 'pence', 'gop', 'presidential', 'obama', 'house', 'white', 'republican', 'donald', 'campaign', 'said', 'president', 'trump']


Top 15 words for topic #2: 
['patients', 'repeal', 'law', 'act', 'republicans', 'tax', 'people', 'plan', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


Top 15 words for topic #3: 
['assad', 'iran', 'iraq', 'north', 'china', 'aleppo', 'war', 'korea', 'said', 'forces', 'russia', 'military', 'syrian', 'syria', 'isis']


Top 15 words for topic #4: 
['cruz', 'election', 'primary', 'democrats', 'percent', 'party', 'vote', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


Top 15 words for topic #5: 
['book', 'love', 'women', 'way', 'time', 'life', 'album', '

In [27]:
# Project every document onto the fitted components: each row holds one
# document's weight for each topic.
topic_results = nmf.transform(dtm)
topic_results

array([[0.        , 0.0995477 , 0.        , ..., 0.        , 0.        ,
        0.07561648],
       [0.00095702, 0.12542556, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.12830495, 0.        , ..., 0.        , 0.        ,
        0.04176065],
       ...,
       [0.00787994, 0.00090828, 0.00794857, ..., 0.06561726, 0.01213194,
        0.00221326],
       [0.        , 0.02816833, 0.00146759, ..., 0.        , 0.        ,
        0.04274406],
       [0.02511506, 0.00721866, 0.00166085, ..., 0.        , 0.01079484,
        0.00296728]])

In [29]:
# Sanity check: number of per-row argmax results, for comparison with the
# number of rows in the data.
len(topic_results.argmax(axis=1))

11992

In [30]:
# Index of the highest-weighted topic for each row (argmax along axis 1).
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 7, 4, 0], dtype=int64)

In [31]:
# Store each article's dominant topic, i.e. the column index of the largest
# weight in its row of topic_results.
df['Topic'] = np.argmax(topic_results, axis=1)

In [32]:
# Hand-assigned, human-readable label for each topic index. Several indices
# intentionally share a label.
topic_dict = {
    0: 'Education',
    1: 'Election',
    2: 'Legislation',
    3: 'Terrorism',
    4: 'Election',
    5: 'Music',
    6: 'Education',
    7: 'Disease',
    8: 'Terrorism',
    9: 'Politics',
}
# Translate the numeric topic index into its label; an index missing from
# the mapping would come back as NaN.
df['Topic Label'] = df['Topic'].map(topic_dict)

In [33]:
# Preview the frame again, now including the Topic and Topic Label columns.
df.head()

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,Election
1,Donald Trump has used Twitter — his prefe...,1,Election
2,Donald Trump is unabashedly praising Russian...,1,Election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Terrorism
4,"From photography, illustration and video, to d...",6,Education
