In [1]:
import pandas as pd 
# Load npr.csv into a DataFrame; later cells read its 'Article' text column.
npr = pd.read_csv("npr.csv")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Ignore terms found in more than 95% of articles (max_df) or in fewer than
# two articles (min_df).
# NOTE(review): no stop_words is set, and the top words printed for topic #0
# further down are almost all function words ('me', 'about', 'so', ...).
# Consider stop_words='english' -- but that changes every topic, so the
# hand-assigned topic labels would have to be re-derived afterwards.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2)

In [4]:
# Learn the vocabulary and IDF weights from the articles and build the
# document-term matrix (one row per article, one column per term).
dtm = tfidf.fit_transform(npr['Article'])

In [5]:
# Inspect the matrix summary (shape, dtype, number of stored values).
dtm

<11992x55071 sparse matrix of type '<class 'numpy.float64'>'
	with 3949008 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.decomposition import NMF

In [7]:
# Factorize into 6 topics; random_state is fixed so the factorization is
# reproducible across runs.
nmf_model = NMF(n_components=6, random_state=42)

In [8]:
# Fit the topic model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [9]:
# Resolve the vocabulary once: get_feature_names_out() builds a fresh array of
# every feature name on each call, so calling it per printed word is wasteful.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{index}")
    # argsort() is ascending, so the last 15 indices are the highest-weighted
    # terms, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])

THE TOP 15 WORDS FOR TOPIC #0
['me', 'about', 'so', 'but', 'what', 'like', 'they', 'with', 'this', 'my', 'his', 'was', 'we', 'he', 'you']
THE TOP 15 WORDS FOR TOPIC #1
['russia', 'as', 'with', 'republican', 'has', 'obama', 'white', 'donald', 'campaign', 'house', 'said', 'president', 'his', 'he', 'trump']
THE TOP 15 WORDS FOR TOPIC #2
['we', 'with', 'or', 'be', 'their', 'more', 'people', 'percent', 'have', 'insurance', 'they', 'care', 'are', 'says', 'health']
THE TOP 15 WORDS FOR TOPIC #3
['at', 'family', 'when', 'who', 'mother', 'had', 'me', 'woman', 'with', 'my', 'says', 'women', 'was', 'her', 'she']
THE TOP 15 WORDS FOR TOPIC #4
['have', 'with', 'at', 'court', 'reports', 'from', 'his', 'were', 'has', 'by', 'as', 'he', 'was', 'said', 'police']
THE TOP 15 WORDS FOR TOPIC #5
['cruz', 'republican', 'election', 'percent', 'democrats', 'party', 'delegates', 'vote', 'state', 'hillary', 'democratic', 'campaign', 'voters', 'sanders', 'clinton']


In [10]:
# Per-article topic weights: one row per article, one column per topic.
topic_results = nmf_model.transform(dtm)

In [11]:
# Index of the highest-weighted topic for each article.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 3, 5, 4])

In [12]:
# Store each article's dominant topic index in a new "Topic" column.
npr["Topic"] = topic_results.argmax(axis=1)

In [13]:
# Preview the first rows to check the new "Topic" column.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",4


In [14]:
# Let's assign a Topic Label to each of the topics

In [15]:
# Human-readable label for each NMF topic index.
# NOTE(review): judging by the top words printed earlier, topic 1 (trump,
# president, white, house) and topic 5 (election, vote, voters, delegates)
# look like their labels are swapped, and topic 0's words are mostly pronouns
# rather than nouns -- confirm before relying on these labels.
my_topic_dict = {
    0: 'nouns',
    1: 'election',
    2: 'health',
    3: 'women',
    4: 'court',
    5: 'government',
}

# Translate each article's topic index into its label.
npr['Topic_Label'] = npr['Topic'].map(my_topic_dict)

In [16]:
# Preview the first rows to check the new 'Topic_Label' column.
npr.head()

Unnamed: 0,Article,Topic,Topic_Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1,election
4,"From photography, illustration and video, to d...",4,court
