## Example: NMF and LDA

In [1]:
import pandas as pd 
def trans_text(df):
    '''
    Load a CSV and return its `text` column as a plain Python list.

    Parameters
    ----------
    df : path or file-like object
        Passed straight to `pd.read_csv`. Despite the name, this is the
        CSV source, not a DataFrame.

    Returns
    -------
    list
        The values of the column labelled `text`, in file order.
    '''
    # names=['text'] tells pandas the file has no header row and labels
    # the data column `text`.
    frame = pd.read_csv(df, names=['text'])
    return frame['text'].tolist()

In [2]:
# Read the raw corpus. The file has no header row, so pandas numbers the
# columns starting from 0.
documents = pd.read_csv(
    '../data/text_071031_180729.csv',
    header=None,
)

In [4]:
# Drop duplicated rows, then keep the values of column 0 as a plain list.
# This rebinds `documents` from a DataFrame to a list, so the isinstance
# guard makes re-running the cell a no-op instead of failing with
# AttributeError: 'list' object has no attribute 'drop_duplicates'.
if isinstance(documents, pd.DataFrame):
    documents = documents.drop_duplicates()[0].tolist()

In [5]:
# Importing necessary packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [6]:
# Create bag of words
no_features = 1000


def _feature_names(vectorizer):
    '''
    Return the fitted vectorizer's feature names as a list.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of `get_feature_names_out()`. Prefer the new method and
    fall back to the old one so the cell runs on both old and new releases.
    '''
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    # get_feature_names_out() returns an ndarray; normalise to a list so the
    # result has the same type regardless of the installed version.
    return list(getter())


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.4, min_df=20, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.4, min_df=20, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names(tf_vectorizer)

In [7]:
# Set parameters
no_topics = 10

# Run NMF on the tf-idf matrix
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`. The replacements appear
# to be scaled differently, so swapping them in would change the fit; left
# as-is here. Confirm the intended regularisation before upgrading.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA on the raw term-count matrix
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)



In [8]:
# Display topics
def display_topics(model, feature_names, no_top_words):
    '''
    Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object with a `components_` attribute
        Each row of `components_` holds one topic's weight per feature.
    feature_names : sequence
        Indexed by feature position to get the term to print.
    no_top_words : int
        Number of terms to show for each topic.
    '''
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so the reversed tail slice yields the
        # positions of the largest weights, biggest first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_positions]
        print("Topic %d:" % (topic_idx), " ".join(top_terms))

no_top_words = 20

# Print each fitted model's topics under its own heading, with a blank line
# between consecutive reports.
reports = (
    ("NMF Topics:", nmf, tfidf_feature_names),
    ("LDA Topics:", lda, tf_feature_names),
)
for position, (heading, fitted_model, vocabulary) in enumerate(reports):
    if position:
        print("")
    print(heading)
    display_topics(fitted_model, vocabulary, no_top_words)

NMF Topics:
Topic 0: siri apple iphone homepod hey ai ios like googleassistant know love shortcuts asked thanks applesupport use siriouslysusan better watch wwdc18
Topic 1: amazon echo ai amazonecho tech skills home smart devices amazonalexa commercial new laughing smarthome echodot skill technology business news best
Topic 2: weed buddy ask effects does make benefit flavor used tell best medicinal taste value feel going research search identify like
Topic 3: amazonecho homeautomation stand supercool upgrade dot echo smarthome echodot amazonalexa prime day googlehome smart home iot best deals 2nd win
Topic 4: cortana microsoft ai windows windows10 android integration halo assistant 10 artificialintelligence machinelearning googleassistant assistants tech invoke use pc work iot
Topic 5: google heygoogle assistant home smart website rankbio googlehome googleassistant rank ces2018 ai apple ces moz hey page optimize authority score
Topic 6: tell cleverclogs birthday need pay january search

In [10]:
# Interpreting topics (to be continued)

In [9]:
# Perplexity of the fitted LDA model, evaluated on `tf`, the same
# term-count matrix the model was fitted on, so this is an in-sample figure.
lda.perplexity(tf)

618.1807524013933