# Topic Modeling Assessment Project

A dataset of over 400,000 quora questions that have no labeled cateogry, and attempting to find 20 cateogries to assign these questions to. The .csv file of these text questions can be found underneath the Topic-Modeling folder.


####  Import pandas and read in the quora_questions.csv file.

In [23]:
import pandas as pd
import numpy as np

In [24]:
data = pd.read_csv('quora_questions.csv',sep = ',')

In [27]:
data.columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
Question    404289 non-null object
dtypes: object(1)
memory usage: 3.1+ MB


In [25]:
data.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing

####  Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters.

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
tv = TfidfVectorizer(max_df=0.95,min_df=2, stop_words='english')

In [44]:
data_tran = tv.fit_transform(data['Question'])

In [45]:
data_tran

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

# Non-negative Matrix Factorization

#### Using Scikit-Learn create an instance of NMF with 20 expected components. (Use random_state=42)..

In [155]:
from sklearn.decomposition import NMF

In [156]:
NMF = NMF(n_components=200, random_state=42)

In [157]:
NMF.fit(data_tran)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=200, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

#### Print our the top 15 most common words for each of the 20 topics.

In [55]:
for index, topic in enumerate(NMF.components_):
    print(f'\nTHE TOP 15 WORDS FOR TOPIC #{index}')
    print([tv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')


THE TOP 15 WORDS FOR TOPIC #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']



THE TOP 15 WORDS FOR TOPIC #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']



THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']



THE TOP 15 WORDS FOR TOPIC #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']



THE TOP 15 WORDS FOR TOPIC #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']



THE TOP 15 WORDS FOR TOPIC #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'coun

####  Add a new column to the original quora dataframe that labels each question into one of the 20 topic categories.

In [158]:
data.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14


In [159]:
nmf_tran = NMF.transform(data_tran)


In [59]:
nmf_tran[0].argmax()

5

In [60]:
topic = nmf_tran.argmax(axis = 1)

In [62]:
topic[:10]

array([ 5, 16, 17, 11, 14,  1,  0, 10, 19, 17], dtype=int64)

In [64]:
data['Topic'] = topic
data.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14


# Trying vector on Topic 11

In [132]:
word_list=pd.Series( ['money', 'modi', 'currency', 'economy', 'think', 'government', 'ban', 'banning', 'black', 'indian', 'rupee', 'rs', '1000', 'notes', '500'])

In [66]:
import spacy, nltk
import en_core_web_md

#nltk.download('vader_lexicon')
nlp = en_core_web_md.load()

In [67]:
from scipy import spatial

In [151]:
def exp(c):  
    return c.apply(lambda x : nlp.vocab[x].vector)


def vector_math(word):
    
    computed_similarities = []
    cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
    
    word_vec=exp(word)
    
    new_vec = sum(word_vec)
    for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
        if word.has_vector:
            if word.is_lower:
                if word.is_alpha:
                    similarity = cosine_similarity(new_vec, word.vector)
                    computed_similarities.append((word, similarity))

    computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
    
    result = [w[0].text for w in computed_similarities[:10]]
    #print(result)
    return result
    

In [137]:
vector_math(word_list)

['moolah',
 'oney',
 'moeny',
 'money',
 'mone',
 'government',
 'dollor',
 'naira',
 'dollar',
 'bolivars']

In [152]:
topicword = []
for topic in NMF.components_[:3]:
    data_word =pd.Series([tv.get_feature_names()[i] for i in topic.argsort()[-50:]])
    print(data_word)
    topicword.append(vector_math(data_word))
   

0           games
1             seo
2            apps
3         service
4       beginners
5              ve
6       bangalore
7        websites
8           15000
9           study
10         course
11          songs
12       software
13      bollywood
14        digital
15          sites
16       download
17      institute
18     smartphone
19      marketing
20        company
21          delhi
22          watch
23         mobile
24       learning
25        android
26       coaching
27         online
28           free
29      hollywood
30            app
31    engineering
32         friend
33        website
34           site
35          thing
36           read
37          place
38          visit
39         places
40          phone
41            buy
42         laptop
43          movie
44           ways
45           2016
46          books
47           book
48         movies
49           best
dtype: object
0         actually
1            woman
2              com
3           iphone
4         

In [153]:
topicword

[['webmaster',
  'intranet',
  'web',
  'onine',
  'online',
  'onlines',
  'onlin',
  'intenet',
  'internet',
  'interent'],
 ['even',
  'because',
  'way',
  'what',
  'that',
  'but',
  'how',
  'could',
  'would',
  'there'],
 ['ask',
  'answer',
  'you',
  'sure',
  'if',
  'someone',
  'anyone',
  'want',
  'what',
  'tell']]