# Non-negative Matrix Factorization (NMF)

In [1]:
import pandas as pd
# Load the NPR articles into a DataFrame; the 'Article' column is the text vectorized below.
# NOTE: the path is relative, so it resolves against the notebook's working directory.
npr = pd.read_csv('./UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv')

## Preprocessing

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


**`max_df`**` : float in range [0.0, 1.0] or int, default=1.0`<br>
When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If a float, the parameter represents a proportion of documents; if an integer, an absolute count. This parameter is ignored if vocabulary is not None.

**`min_df`**` : float in range [0.0, 1.0] or int, default=1`<br>
When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If a float, the parameter represents a proportion of documents; if an integer, an absolute count. This parameter is ignored if vocabulary is not None.

In [3]:
# max_df: drop terms found in more than this proportion (float) or number (int) of documents
# min_df: drop terms found in fewer than this proportion (float) or number (int) of documents
# stop_words: stop-word list to remove (e.g. a, the, it, in, etc.)
tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)

In [4]:
# Learn the vocabulary from the article text and build the TF-IDF document-term matrix.
dtm = tfidf.fit_transform(npr['Article'])

In [5]:
# Show the matrix summary (shape, dtype, stored-element count) rather than its contents.
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## NMF

In [6]:
from sklearn.decomposition import NMF

In [7]:
# Each of the 7 components is treated as one topic further down;
# random_state is pinned to 42 so the same seed is used on every run.
nmf_model = NMF(
    n_components=7,
    random_state=42,
)

In [8]:
# Fit the factorization on the TF-IDF matrix; the fitted estimator's repr is the cell output.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [9]:
# Look up the vocabulary term stored at column index 2300 of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer the replacement
# get_feature_names_out() when it exists and fall back for older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
feature_names[2300]

'albala'

In [10]:
# Resolve the vocabulary once, outside the loop: the original rebuilt the full
# feature list on every iteration. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it is available.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# For each topic (one row of components_), print the 15 highest-weighted terms.
for index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 positions are the largest weights,
    # listed from weakest to strongest.
    top_word_ids = topic.argsort()[-15:]
    print(f"The top 15 words for topic # {index}")
    print([feature_names[i] for i in top_word_ids])
    print('\n')

The top 15 words for topic # 0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


The top 15 words for topic # 1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


The top 15 words for topic # 2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The top 15 words for topic # 3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


The top 15 words for topic # 4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


The top 15 words for topic # 5
['love', 've', 'don

In [11]:
# Project every article onto the fitted topics: one row per article, one weight per component.
topic_results = nmf_model.transform(dtm)

In [12]:
# Topic weights for the first article (one value per NMF component).
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [13]:
# argmax() with no axis searches the flattened array, so the result is a flat
# position in the whole matrix, not a topic number; the per-article form uses axis=1.
topic_results.argmax()

22021

In [14]:
# Index of the highest-weighted topic for each article (one entry per row).
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [15]:
# Store each article's highest-weighted topic index in a new 'Topic' column.
npr['Topic'] = topic_results.argmax(axis=1)

In [16]:
# Hand-assigned labels for the seven NMF topics (topics 1 and 4 share 'election').
mytopic_dict = {0:'health',1:'election',2:'legis',3:'poli',4:'election',5:'music',6:'education'}

# Derive the labels from the numeric topic indices instead of remapping
# npr['Topic'] onto itself. The original cell was not idempotent: once the
# column held strings, a second run mapped every value to NaN.
npr['Topic'] = pd.Series(topic_results.argmax(axis=1), index=npr.index).map(mytopic_dict)

In [17]:
# Preview the first rows together with their 'Topic' column.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",election
1,Donald Trump has used Twitter — his prefe...,election
2,Donald Trump is unabashedly praising Russian...,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",poli
4,"From photography, illustration and video, to d...",education
