In [2]:
import pandas as pd

In [3]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
dataset = pd.read_csv("Full Set.csv")

In [6]:
# Pull out the "Full Text" column as a Series; this is the corpus handed to
# the vectorizers further down.
# NOTE(review): assumes every row holds a string (no missing values) — confirm.
mylist = dataset["Full Text"]

In [7]:
# Preview the first rows of the corpus as a quick sanity check.
mylist.head()

0      Though Brooklyn has its share of hybrid oper...
1      Big Brother isn't watching you. You're watch...
2      Despite being declared dead quite a few time...
3      CORRECTION: This story contains corrected ma...
4      PALO ALTO, Calif.--Software worker Erin Bell...
Name: Full Text, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Upper bound on the vocabulary size kept by each vectorizer.
no_features = 100


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's terms as a list, indexed by column.

    scikit-learn 1.0 introduced ``get_feature_names_out()`` and 1.2 removed
    ``get_feature_names()``. Prefer the new accessor and fall back to the old
    one so this cell runs on either side of that change.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        # The new accessor returns an ndarray; keep the list type used before.
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf.
# max_df=0.95 drops terms found in more than 95% of documents, min_df=2 drops
# terms found in fewer than two documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(mylist)
tfidf_feature_names = _vocabulary(tfidf_vectorizer)

# LDA needs raw term counts because it is a probabilistic graphical model.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(mylist)
tf_feature_names = _vocabulary(tf_vectorizer)

In [9]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of topics each model is asked to extract.
no_topics = 10

# Run NMF on the tf-idf matrix.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `alpha_W` / `alpha_H`. Those scale the penalty differently, so
# it is not a drop-in rename; left unchanged here — re-tune when upgrading.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA on the raw term counts.
# `n_components` is the current name of the topic-count argument; the old
# `n_topics` spelling was renamed in scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)



In [10]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a weight's position to its term.
        no_top_words: How many of the highest-weighted terms to print per topic.

    Returns:
        None. Two lines are written to stdout per topic: a ``Topic <n>:``
        header, then the terms joined by single spaces, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {:d}:".format(topic_idx))
        # argsort() ranks ascending (NumPy semantics), so flip the ranking and
        # keep only the leading positions.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[pos] for pos in strongest]
        print(" ".join(top_terms))

# How many of the strongest terms to list under each topic.
no_top_words = 3

# NMF topics use the tf-idf vocabulary; LDA topics use the count vocabulary.
# The print("\n") between them emits two newlines as a visual gap.
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=no_top_words)
print("\n")
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
said company people
Topic 1:
company billion market
Topic 2:
facebook ads ad
Topic 3:
mr ms chief
Topic 4:
york new times
Topic 5:
uber driving self
Topic 6:
data privacy facebook
Topic 7:
trump president government
Topic 8:
app apple phone
Topic 9:
like people just


Topic 0:
uber driving car
Topic 1:
apple app phone
Topic 2:
search like use
Topic 3:
facebook media social
Topic 4:
company year amazon
Topic 5:
china says internet
Topic 6:
mr said company
Topic 7:
new trump like
Topic 8:
ads youtube ad
Topic 9:
data companies privacy
