import pandas as pd

In [2]:
# Load the question dataset from the working directory into `quora`.
quora = pd.read_csv(filepath_or_buffer='quora_questions.csv')

In [3]:
# Preview the first five rows of the raw questions.
quora.head(n=5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectoriser for the question text:
#   stop_words='english' -> use the built-in English stop-word list
#   min_df=2             -> a term must appear in at least 2 documents
#   max_df=0.95          -> ignore terms present in more than 95% of documents
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [6]:
# Document-term matrix: learn the vocabulary from the 'Question' column and
# encode every question as a sparse row of TF-IDF weights.
dtm = tfidf.fit_transform(raw_documents=quora['Question'])

In [7]:
# Show the sparse matrix summary (documents x vocabulary terms, stored elements).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

# Non-negative Matrix Factorization

In [8]:
from sklearn.decomposition import NMF

In [9]:
# NMF factoriser with 20 components (topics); the fixed seed keeps the
# factorisation reproducible between runs.
nmf_model = NMF(
    n_components=20,
    random_state=42,
)

In [10]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(X=dtm)



NMF(n_components=20, random_state=42)

#### The 15 highest-weighted words for each of the 20 topics (strongest word listed last).

In [11]:
# Print the 15 highest-weighted vocabulary terms of every NMF component.
#
# The vocabulary is fetched once, before the loop. The previous version called
# tfidf.get_feature_names() for every single printed word, rebuilding the full
# feature list each time; that method was also deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of get_feature_names_out().
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:  # scikit-learn < 1.0
    feature_names = tfidf.get_feature_names()

# The loop variable is named `component` rather than `topic` because `topic`
# is the name this notebook uses for the topic-label dictionary; reusing it
# here would overwrite that dictionary whenever this cell is re-run.
for index, component in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 positions hold the largest
    # weights and the strongest term is printed last.
    print([str(feature_names[i]) for i in component.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 

In [12]:
# Look at the first five questions again before attaching topic assignments.
quora.head(n=5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [13]:
# Project every question onto the fitted NMF components:
# one row per question, one column per component.
topic_results = nmf_model.transform(X=dtm)

In [24]:
# Hard-assign each question to the NMF component with the largest weight in
# its row of `topic_results` (argmax over axis=1 yields that column index).
# The argmax is computed once; the earlier stand-alone
# `topic_results.argmax(axis=1)` expression was not the last expression of
# the cell, so its result was never displayed and was simply thrown away.
quora['Topic'] = topic_results.argmax(axis=1)

# Preview the first ten questions with their assigned topic index.
quora.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,19
9,Motorola (company): Can I hack my Charter Moto...,17


In [33]:
# Hand-picked label for each NMF component; the position in the list is the
# component index (0-19) used as the dictionary key.
topic = dict(enumerate([
    'technology',
    'comparison',
    'quora',
    'earning online',
    'change',
    'company',
    'programming',
    'presidency',
    'war',
    'work culture',
    'business',
    'government',
    'resolutions',
    'communication',
    'exercise',
    'free time',
    'relationship',
    'internet',
    'engineering',
    'google questions',
]))

In [34]:
# Translate each numeric topic index into its human-readable label.
quora['Topic Name'] = quora.loc[:, 'Topic'].map(topic)

In [35]:
# Preview the first ten questions with both topic index and topic label.
quora.head(n=10)

Unnamed: 0,Question,Topic,Topic Name
0,What is the step by step guide to invest in sh...,5,company
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16,relationship
2,How can I increase the speed of my internet co...,17,internet
3,Why am I mentally very lonely? How can I solve...,11,government
4,"Which one dissolve in water quikly sugar, salt...",14,exercise
5,Astrology: I am a Capricorn Sun Cap moon and c...,1,comparison
6,Should I buy tiago?,0,technology
7,How can I be a good geologist?,10,business
8,When do you use シ instead of し?,19,google questions
9,Motorola (company): Can I hack my Charter Moto...,17,internet


## ----------------------------------------------------------------------------------------------------

# Latent Dirichlet Allocation (LDA)

In [36]:
# Fresh copy of the question dataset for the LDA experiment, so its topic
# columns stay separate from the NMF results stored on `quora`.
df = pd.read_csv(filepath_or_buffer='quora_questions.csv')

In [37]:
# Re-display the TF-IDF document-term matrix; the LDA cells below reuse this
# same `dtm` rather than building a new one.
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [38]:
from sklearn.decomposition import LatentDirichletAllocation

In [39]:
# LDA model with 12 components (topics); the fixed seed keeps the fit
# reproducible between runs.
LDA = LatentDirichletAllocation(
    n_components=12,
    random_state=42,
)

In [40]:
# Fit LDA on the document-term matrix.
# NOTE(review): `dtm` holds TF-IDF weights, not raw term counts. LDA is
# usually fitted on counts (e.g. from CountVectorizer) - confirm that using
# the TF-IDF matrix here is intentional.
LDA.fit(X=dtm)

LatentDirichletAllocation(n_components=12, random_state=42)

In [44]:
# Number of fitted LDA components (rows of the components_ matrix).
LDA.components_.shape[0]

12

In [45]:
# Print the 15 highest-weighted vocabulary terms of every LDA component.
#
# The vocabulary is fetched once, before the loop. The previous version called
# tfidf.get_feature_names() for every single printed word, rebuilding the full
# feature list each time; that method was also deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of get_feature_names_out().
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:  # scikit-learn < 1.0
    feature_names = tfidf.get_feature_names()

# The loop variable is named `component` rather than `topic`: `topic` is the
# NMF topic-label dictionary defined earlier in this notebook, and looping
# with that name replaced the dictionary with a weight array, so re-running
# the NMF label-mapping cell afterwards no longer worked.
for index, component in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 positions hold the largest
    # weights and the strongest term is printed last.
    print([str(feature_names[i]) for i in component.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['difference', 'development', 'songs', 'india', 'heard', 'com', 'used', 'card', 'service', 'use', 'work', 'sentence', 'good', 'does', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['stop', 'does', 'indian', 'think', 'learn', 'best', 'modi', 'rupee', 'black', 'rs', 'india', 'english', 'notes', '1000', '500']


THE TOP 15 WORDS FOR TOPIC #2
['education', 'bollywood', 'movie', 'does', 'book', 'good', 'purpose', 'read', 'travel', 'favorite', 'books', 'movies', 'time', 'life', 'best']


THE TOP 15 WORDS FOR TOPIC #3
['movie', 'tv', 'asked', 'new', 'mind', 'iphone', 'best', 'answers', 'does', 'answer', 'people', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #4
['acne', 'hacker', 'password', 'fear', 'worst', 'visa', 'people', 'canada', 'depression', 'best', 'overcome', 'car', 'india', 'rid', 'does']


THE TOP 15 WORDS FOR TOPIC #5
['did', 'mean', 'relationship', 'women', 'pakistan', 'india', 'know', 'girl', 'world', 'war', 'feel', 'love', 'sex',

In [46]:
# Per-question topic distribution from the fitted LDA model:
# one row per question, one column per component.
topics = LDA.transform(X=dtm)

In [47]:
# Hard-assign each question to the LDA component with the largest value in
# its row of `topics` (argmax over axis=1 yields that column index).
# The argmax is computed once; the earlier stand-alone
# `topics.argmax(axis=1)` expression was not the last expression of the
# cell, so its result was never displayed and was simply thrown away.
df['Topic'] = topics.argmax(axis=1)

# Preview the first ten questions with their assigned topic index.
df.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,7
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4
2,How can I increase the speed of my internet co...,8
3,Why am I mentally very lonely? How can I solve...,5
4,"Which one dissolve in water quikly sugar, salt...",11
5,Astrology: I am a Capricorn Sun Cap moon and c...,2
6,Should I buy tiago?,4
7,How can I be a good geologist?,3
8,When do you use シ instead of し?,3
9,Motorola (company): Can I hack my Charter Moto...,8


In [51]:
# Hand-picked label for each of the 12 LDA components, keyed by component index.
mapping = {
    0: 'lifestyle',
    1: 'Indian economy',
    2: 'entertainment',
    3: 'Tech',
    4: 'google question',
    5: 'relationship',
    6: 'health',
    7: 'economics',
    8: 'internet',
    9: 'life',
    10: 'programming',
    11: 'engineering',
}

In [52]:
# Translate each numeric LDA topic index into its human-readable label.
df['Topic Name'] = df.loc[:, 'Topic'].map(mapping)

In [53]:
# Preview the first ten questions with both LDA topic index and topic label.
df.head(n=10)

Unnamed: 0,Question,Topic,Topic Name
0,What is the step by step guide to invest in sh...,7,economics
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4,google question
2,How can I increase the speed of my internet co...,8,internet
3,Why am I mentally very lonely? How can I solve...,5,relationship
4,"Which one dissolve in water quikly sugar, salt...",11,engineering
5,Astrology: I am a Capricorn Sun Cap moon and c...,2,entertainment
6,Should I buy tiago?,4,google question
7,How can I be a good geologist?,3,Tech
8,When do you use シ instead of し?,3,Tech
9,Motorola (company): Can I hack my Charter Moto...,8,internet
