# Non-Negative Matric Factorization


In [49]:
import pandas as pd

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/quora_questions.csv')

In [52]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


## Preprocessing

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [55]:
dtm = tfidf.fit_transform(df['Question'])

In [56]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2002912 stored elements and shape (404289, 38669)>

## NMF

In [57]:
from sklearn.decomposition import NMF

In [58]:
nmf_model = NMF(n_components=7,random_state=42)

In [59]:
# This can take awhile, we're dealing with a large amount of documents!
nmf_model.fit(dtm)

In [60]:
len(tfidf.get_feature_names_out())

38669

In [61]:
import random

In [62]:
for i in range(10):
    random_word_id = random.randint(0, len(tfidf.get_feature_names_out()) - 1)
    print(tfidf.get_feature_names_out()[random_word_id])

lsat
pot
thenkalai
paralympic
trusted
vilain
ingredient
commenting
banking
deteriorating


In [63]:
for i in range(10):
    random_word_id = random.randint(0,len(tfidf.get_feature_names_out()))
    print(tfidf.get_feature_names_out()[random_word_id])

cardiac
indolence
kingdom
180ml
firm
cultural
oxymoronic
characteristic
robertson
backyard


In [64]:
len(nmf_model.components_)

7

In [65]:
nmf_model.components_

array([[1.03113789e-04, 5.09548275e-02, 4.65202987e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.61794178e-03, 2.57529191e-03, 3.37224742e-05, ...,
        2.99951904e-06, 2.77689266e-03, 2.99951904e-06],
       [3.89554723e-05, 0.00000000e+00, 2.14858894e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.74773997e-04, 3.35059991e-03, 0.00000000e+00, ...,
        2.11502448e-06, 0.00000000e+00, 2.11502448e-06],
       [9.21440031e-04, 1.17671113e-02, 0.00000000e+00, ...,
        8.48590528e-06, 0.00000000e+00, 8.48590528e-06],
       [4.65815285e-04, 0.00000000e+00, 0.00000000e+00, ...,
        2.61140276e-05, 0.00000000e+00, 2.61140276e-05]])

In [66]:
len(nmf_model.components_[0])

38669

In [67]:
single_topic = nmf_model.components_[0]

In [68]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([23568, 35026, 35031, ..., 22925, 37515,  4632])

In [69]:
# Word least representative of this topic
single_topic[single_topic.argmin()]

np.float64(0.0)

In [70]:
# Word most representative of this topic
single_topic[single_topic.argmax()]

np.float64(7.189285187928539)

In [71]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([34869, 22924, 37520,   482, 37630,  5283,  5268, 22925, 37515,
        4632])

In [72]:
top_word_indices = single_topic.argsort()[-10:]

In [73]:
for index in top_word_indices:
    print(tfidf.get_feature_names_out()[index])

time
movie
ways
2016
weight
books
book
movies
way
best


let's view all the 7 topics found.

In [74]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['phone', 'india', 'lose', 'buy', 'laptop', 'time', 'movie', 'ways', '2016', 'weight', 'books', 'book', 'movies', 'way', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['new', 'compare', 'look', 'cost', 'really', 'girl', 'love', 'long', 'sex', 'time', 'work', 'feel', 'like', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['post', 'answered', 'use', 'improvement', 'delete', 'easily', 'asked', 'google', 'answer', 'answers', 'ask', 'question', 'questions', 'people', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['easiest', 'rupee', 'home', 'easy', 'notes', '1000', '500', 'black', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['moment', 'live', 'employees', 'like', 'want', 'real', 'love', 'things', 'day', 'important', 'thing', 'know', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['election', 'war', '1000', 'people', 'notes', '500', 'win', 'think', 'did', 'hillary', 'clinton', 'president', 'donald', 'trump', 

### Attaching Discovered Question Labels to Original Question

In [75]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2002912 stored elements and shape (404289, 38669)>

In [76]:
dtm.shape

(404289, 38669)

In [77]:
len(df)

404289

In [78]:
topic_results = nmf_model.transform(dtm)

In [79]:
topic_results.shape

(404289, 7)

In [80]:
topic_results[0]

array([0.00104604, 0.        , 0.        , 0.00039873, 0.        ,
       0.02294911, 0.00017901])

In [81]:
topic_results[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.  ])

In [82]:
topic_results[0].argmax()

np.int64(5)

This means that our model thinks that the first question belongs to topic #5.

### Combining with Original Data

In [83]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [84]:
topic_results.argmax(axis=1)

array([5, 4, 3, ..., 5, 5, 1])

In [85]:
df['Question'] = topic_results.argmax(axis=1)

In [86]:
df.head(10)

Unnamed: 0,Question
0,5
1,4
2,3
3,1
4,1
5,1
6,0
7,6
8,2
9,0
