## Looking at the Quora Questions Dataset

In [2]:
import pandas as pd
# Load the Quora questions into a DataFrame; the path is relative to the
# notebook's working directory.
quora_question = pd.read_csv('quora_questions.csv')

In [3]:
quora_question.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


### Length of the Dataset

In [4]:
len(quora_question)

404289

### Looking at a random question from the dataset to understand the structure of the dataset

In [9]:
# Scalar lookup by index label and column name. The label 2030 is hard-coded,
# so the same question is shown on every run.
quora_question.at[2030, 'Question']

'Is our PM Modi doing the correct thing with 500 and 1000 Rs notes?'

### Since our objective is to categorize questions based on topics, we will now use NMF to group related questions together into topics

### Preprocessing the Data

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# max_df=0.95 drops terms that appear in more than 95% of the questions,
# min_df=2 drops terms that appear in fewer than 2 questions, and
# stop_words='english' removes scikit-learn's built-in English stop words.
cv = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [12]:
# Learn the vocabulary and IDF weights from the questions and build the
# sparse document-term matrix (one row per question, one column per term).
dtm = cv.fit_transform(quora_question['Question'])

In [13]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2002912 stored elements and shape (404289, 38669)>

## Non-Negative Matrix Factorization

In [14]:
from sklearn.decomposition import NMF
# n_components is the number of topics to extract; random_state fixes the
# initialisation so the fitted topics are reproducible across runs.
quora_question_model = NMF(n_components=7,random_state=45)
# Fit on the TF-IDF matrix; components_ then holds one row of term weights
# per topic.
quora_question_model.fit(dtm)


### Displaying the topics 

In [15]:
len(cv.get_feature_names_out())

38669

In [17]:
import random

# Print 10 randomly chosen vocabulary terms to get a feel for the features.
# The vocabulary is fetched once and its length is used as the bound, so the
# cell keeps working if the vectorizer settings (and vocabulary size) change.
feature_names = cv.get_feature_names_out()
for _ in range(10):
    # randrange excludes the upper bound, so the index is always valid.
    # random.randint(0, n) is inclusive on both ends and could return n,
    # which is one past the last valid index and would raise IndexError.
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

bilawal
technical
softbank
velachery
monte
seats
magnitude
nerdwallet
restore
al


In [18]:
len(quora_question_model.components_)

7

In [20]:
quora_question_model.components_

array([[1.02890805e-04, 5.08352442e-02, 4.64121851e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.61603405e-03, 2.56848916e-03, 3.36883891e-05, ...,
        2.98537355e-06, 2.77441783e-03, 2.98537355e-06],
       [3.88415653e-05, 0.00000000e+00, 2.14783237e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.71198977e-04, 3.33360672e-03, 0.00000000e+00, ...,
        2.12959603e-06, 0.00000000e+00, 2.12959603e-06],
       [9.25723812e-04, 1.18223930e-02, 0.00000000e+00, ...,
        8.53092200e-06, 0.00000000e+00, 8.53092200e-06],
       [4.60215704e-04, 0.00000000e+00, 0.00000000e+00, ...,
        2.58055049e-05, 0.00000000e+00, 2.58055049e-05]])

In [21]:
len(quora_question_model.components_[0])

38669

In [23]:
single_topic = quora_question_model.components_[0]

In [24]:
# argsort returns the feature indices ordered from lowest to highest weight,
# so the final entries are the words that matter most for this topic.
single_topic.argsort()

array([38656,    47,    46, ..., 22925, 37515,  4632])

In [29]:
# One of the least representative words for this topic: NMF weights are
# non-negative, so 0.0 is the floor. Index 47 is among the first entries of
# the ascending argsort; other words may tie at 0.0.
single_topic[47]

np.float64(0.0)

In [30]:
# Word most representative of this topic: index 4632 is the last entry of
# the ascending argsort, i.e. the largest weight in this topic's row.
single_topic[4632]

np.float64(7.172441193057282)

In [31]:
# Top 10 words in this topic: the last 10 indices of the ascending argsort,
# so they run from the 10th-strongest word up to the strongest.
top_10_words = single_topic.argsort()[-10:]
top_10_words

array([34869, 22924, 37520,   482, 37630,  5283,  5268, 22925, 37515,
        4632])

In [32]:
# Translate the top-10 feature indices into their vocabulary terms, printed
# from weakest to strongest because argsort is ascending.
# The vocabulary is looked up once instead of once per index.
feature_names = cv.get_feature_names_out()
for index in top_10_words:
    print(feature_names[index])

time
movie
ways
2016
weight
books
book
movies
way
best


### Before we assign a topic label to each question using .transform(), let's first take a look at the 7 topics that were identified

In [33]:
# Show the highest-weighted words of every topic so each one can be given a
# human-readable interpretation.
N_TOP_WORDS = 15  # single source of truth for both the header and the slice

# The vocabulary is looked up once rather than once per printed word.
feature_names = cv.get_feature_names_out()
for index, topic in enumerate(quora_question_model.components_):
    print(f'THE TOP {N_TOP_WORDS} WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last N_TOP_WORDS indices are the strongest
    # words, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-N_TOP_WORDS:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['phone', 'india', 'lose', 'buy', 'laptop', 'time', 'movie', 'ways', '2016', 'weight', 'books', 'book', 'movies', 'way', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['new', 'compare', 'look', 'cost', 'really', 'girl', 'love', 'long', 'sex', 'time', 'work', 'feel', 'like', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['post', 'answered', 'use', 'improvement', 'delete', 'easily', 'asked', 'google', 'answer', 'answers', 'ask', 'question', 'questions', 'people', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['easiest', 'rupee', 'home', 'easy', 'notes', '1000', '500', 'black', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['moment', 'live', 'employees', 'like', 'want', 'real', 'love', 'things', 'day', 'important', 'thing', 'meaning', 'know', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['election', 'war', '1000', 'people', 'notes', '500', 'win', 'think', 'did', 'hillary', 'clinton', 'president', 'donald', 'trump', 

### Mapping the Discovered Topics Back to the Original Questions

In [34]:
dtm.shape

(404289, 38669)

In [37]:
len(quora_question)

404289

In [38]:
# Project every question onto the learned topics: each row holds that
# question's weight for each topic (one column per NMF component).
topic_results = quora_question_model.transform(dtm)
topic_results.shape

(404289, 7)

In [39]:
topic_results[0]

array([0.00104795, 0.        , 0.        , 0.00039835, 0.        ,
       0.02284856, 0.00018147])

In [40]:
topic_results[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.  ])

In [42]:
topic_results[0].argmax()

np.int64(5)

### This implies that our model thinks that question at index 0 belongs to topic 5

### Combining with our original data

In [43]:
quora_question

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
404284,How many keywords are there in the Racket prog...
404285,Do you believe there is life after death?
404286,What is one coin?
404287,What is the approx annual cost of living while...


In [44]:
topic_results.argmax(axis=1)

array([5, 4, 3, ..., 5, 5, 1])

In [45]:
# Label each question with the index of its highest-weight topic.
# This adds a column to quora_question in place, so cells that display the
# frame will show the extra 'Topic' column if they are re-run afterwards.
quora_question['Topic'] = topic_results.argmax(axis=1)

In [47]:
quora_question.head(15)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,1
4,"Which one dissolve in water quikly sugar, salt...",1
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,6
8,When do you use シ instead of し?,2
9,Motorola (company): Can I hack my Charter Moto...,0
