In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted decomposition model exposing ``components_``; each row
            is treated as the term weights of one topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        no_top_words: Number of terms printed per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the tail backwards to get the
        # no_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in top_indices]))

# Fit NMF and LDA on the 20 Newsgroups corpus (headers, footers and quotes stripped).
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA is fit on raw term counts (not tf-idf) because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# n_components is the current name of the topic-count parameter; the older
# n_topics spelling is no longer accepted by recent scikit-learn releases.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

In [2]:
# Load the subset CSV into a DataFrame; the path is relative to the notebook's working directory.
hm_data = pd.read_csv('../output/subset_hm.csv')

In [19]:
# Select the 'processed_hm' column (the text later fed to the vectorizers), then preview the frame.
hm = hm_data['processed_hm']
hm_data.head()

Unnamed: 0.1,Unnamed: 0,wid,original_hm,cleaned_hm,predicted_category,processed_hm,age,country,gender,marital,parenthood
0,0,2053,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,affection,"successful,date,someone,felt,sympathy,connection",35.0,USA,m,single,n
1,1,2053,I played a new game that was fun and got to en...,I played a new game that was fun and got to en...,leisure,"play,new,game,fun,enjoy,mechanic",35.0,USA,m,single,n
2,2,2053,I listened to some music and heard an entire a...,I listened to some music and heard an entire a...,leisure,"listen,music,hear,entire,album,n't,hear",35.0,USA,m,single,n
3,3,2053,Went to see a movie with my friend,Went to see a movie with my friend,bonding,"went,see,movie,friend",35.0,USA,m,single,n
4,4,2053,"Played guitar, learning a song on it","Played guitar, learning a song on it",leisure,"played,guitar,learn,song",35.0,USA,m,single,n


In [20]:
# Materialise the selected column as a plain Python list of documents.
hm = list(hm)

### Non-negative Matrix Factorization (NMF)

In [26]:
def display_topics(model, feature_names, no_top_words):
    """Print a "Topic N:" header and the top terms for each topic of a model.

    Args:
        model: Fitted model whose ``components_`` rows hold per-topic term weights.
        feature_names: Indexable mapping from term index to term string.
        no_top_words: How many of the heaviest terms to list per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # Indices of the largest weights, heaviest first.
        ranked = topic.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[term_id] for term_id in ranked]
        print(" ".join(top_terms))

# Tunable sizes for the NMF run.
no_features = 1000
no_topics = 20
no_top_words = 10

# tf-idf representation of the documents, capped at no_features terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(hm)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Factorise the tf-idf matrix into no_topics topics and list their top terms.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf.fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
friend best meet talk old party hang visit close lunch
Topic 1:
new car purchase start phone try apartment puppy computer brand
Topic 2:
work finish project early receive raise promotion complete bonus hard
Topic 3:
dinner wife cook husband nice delicious restaurant tonight pizza chicken
Topic 4:
play game video win team basketball baseball fun online favorite
Topic 5:
time long spend great talk quality felt saw spending brother
Topic 6:
watch movie favorite episode season funny netflix enjoy television youtube
Topic 7:
good feel felt grade workout news food deal life taste
Topic 8:
son year old school meet love tell say start hug
Topic 9:
family member enjoy vacation trip spend easter moment celebrate beach
Topic 10:
buy car ticket want store house money sale phone shoe
Topic 11:
dog walk park morning play nice outside long run weather
Topic 12:
able finally pay help money car fix event hour need
Topic 13:
eat favorite lunch food restaurant delicious pizza meal cream ice
Topi

In [121]:
# Fit NMF on the tf-idf matrix again and keep the transformed output W
# (document-by-topic weights; its shape is displayed further down).
W = nmf.fit_transform(tfidf)

In [115]:
# Topic-by-term weight matrix of the fitted NMF model (the rows display_topics iterates over).
H = nmf.components_

In [132]:
import numpy

# Keep only the rows of W whose largest topic weight exceeds the cut-off;
# _idx is the boolean row mask used for the selection.
threshold = 0.05
_idx = W.max(axis=1) > threshold
W = W[_idx]

In [133]:
# Shape of W after the threshold filter (rows kept, number of topics).
W.shape

(5719, 20)

In [134]:
from sklearn.manifold import TSNE

# t-SNE model used to embed the topic weights in 2-D.
# - an angle close to 1 sacrifices accuracy for speed
# - PCA initialization usually leads to better results
tsne_model = TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)

# 20-D -> 2-D (despite the variable name, the input here is the NMF matrix W)
tsne_lda = tsne_model.fit_transform(W)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5719 samples in 0.033s...
[t-SNE] Computed neighbors for 5719 samples in 0.564s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5719
[t-SNE] Computed conditional probabilities for sample 2000 / 5719
[t-SNE] Computed conditional probabilities for sample 3000 / 5719
[t-SNE] Computed conditional probabilities for sample 4000 / 5719
[t-SNE] Computed conditional probabilities for sample 5000 / 5719
[t-SNE] Computed conditional probabilities for sample 5719 / 5719
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.007389
[t-SNE] KL divergence after 1000 iterations: 0.258291


In [135]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

# Number of keywords shown in each topic label.
n_top_words = 5

# 20 hex colours, indexed by topic id when colouring the scatter points.
colormap = np.array([
    "#1f77b4", "#aec7e8",
    "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a",
    "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2",
    "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d",
    "#17becf", "#9edae5",
])

In [136]:
# Dominant topic (index of the largest weight) for every row of W.
lda_keys = [W[doc_idx].argmax() for doc_idx in range(W.shape[0])]

In [137]:
# One label per topic: its n_top_words highest-weighted terms, heaviest first, space-joined.
topic_summaries = [
    ' '.join(np.array(tfidf_feature_names)[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
    for topic_dist in nmf.components_
]

In [138]:
title = 'NMF'
num_example = len(W)

# Document texts for the rows that survived the threshold filter: hm is
# masked with the same boolean row mask (_idx) that was applied to W, so each
# plotted point is paired with its own text.
doc_texts = np.asarray(hm, dtype=object)[_idx].tolist()
assert len(doc_texts) == num_example, "hm/_idx are out of sync with W; re-run the cells from the NMF fit"

# Column data behind the scatter glyphs. The `content` and `topic_key`
# columns are the fields the hover tooltip refers to as @content / @topic_key.
scatter_source = bp.ColumnDataSource(data=dict(
    x=tsne_lda[:num_example, 0],
    y=tsne_lda[:num_example, 1],
    color=colormap[lda_keys][:num_example],
    content=doc_texts,
    topic_key=[int(key) for key in lda_keys[:num_example]],
))

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One point per document, coloured by its dominant topic.
plot_lda.scatter(x='x', y='y', color='color', source=scatter_source)

In [139]:
# Anchor each topic's label at the t-SNE coordinate of the first document (in
# row order) whose dominant topic it is. Topics with no such document keep NaN.
topic_coord = np.full((W.shape[1], 2), np.nan)
topics_present, first_doc = np.unique(np.asarray(lda_keys, dtype=int), return_index=True)
topic_coord[topics_present] = tsne_lda[first_doc]

# plot crucial words
for i in range(W.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
# NOTE: @content and @topic_key only resolve if the scatter's data source has
# columns with those names.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot; passing the title explicitly avoids save() falling back to
# its default page title (and warning about it)
save(plot_lda, '{}.html'.format(title), title=title)

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/james/Desktop/Lectures/GR5243_Applied_Data_Science/Spring2019-Proj1-zhengfei0908/doc/NMF.html'

In [144]:
from IPython.display import HTML

# Render the saved plot inline. The path is passed via `filename=` so the
# intent is unambiguous and a missing file raises instead of being shown as text.
HTML(filename='NMF.html')

In [30]:
# LDA is fit on raw term counts (not tf-idf) because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(hm)
tf_feature_names = tf_vectorizer.get_feature_names()
# Run LDA
# n_components is the current name of the topic-count parameter; the older
# n_topics spelling is no longer accepted by recent scikit-learn releases.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
feel life family like moment win trip complete team close
Topic 1:
really job want place thing bring beautiful smile weather offer
Topic 2:
eat favorite spend food video free cook hear meal minute
Topic 3:
able dog event look saw little people try know way
Topic 4:
come start tell leave vacation plan away summer actually rain
Topic 5:
home felt wife child lose date hug speak nephew weight
Topic 6:
time long love say recently father bike bed excite room
Topic 7:
friend enjoy visit talk best house meet sister year phone
Topic 8:
nice car pay think purchase drive hit catch happen extra
Topic 9:
watch receive baby gift need mail send special favorite study
Topic 10:
new buy husband girl pass online shop wedding sale cute
Topic 11:
play game brother fun couple promotion niece law hike anniversary
Topic 12:
happiness weekend cat class end high turn state person drink
Topic 13:
finally old night hour work sleep project restaurant company dad
Topic 14:
son school morning help walk har

In [36]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [37]:
# Obtain the Nietzsche corpus through keras' get_file and read it lower-cased.
path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Sorted character vocabulary plus lookups in both directions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

corpus length: 600893
total chars: 57


In [None]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode every window (x) and the character that follows it (y).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [None]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature rescaling.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [None]:
def on_epoch_end(epoch, _):
    """Print text generated by the current model; invoked at the end of each epoch.

    A random seed window of ``maxlen`` characters is taken from ``text``; for
    each diversity value, 400 characters are generated one at a time and
    streamed to stdout.

    Args:
        epoch: Epoch index, printed in the banner line.
        _: Second callback argument; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        sentence = seed_text
        generated = '' + sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for _step in range(400):
            # One-hot encode the current window for the model.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, char in enumerate(sentence):
                x_pred[0, pos, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Slide the window forward by the newly generated character.
            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Keras callback that runs on_epoch_end after every training epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# Train for 60 epochs in batches of 128; print_callback runs after every epoch.
model.fit(x, y, batch_size=128, epochs=60, callbacks=[print_callback])

### Visualization

In [64]:
import pyLDAvis
# Turn on pyLDAvis' notebook mode before any visualisation is displayed.
pyLDAvis.enable_notebook()

In [65]:
# Toy inputs for pyLDAvis.prepare: 2 topics, 2 documents, 3-term vocabulary.
vocab = np.array(['a', 'b', 'c'])
term_frequency = np.array([0.3, 0.3, 0.4])
doc_length = np.array([10, 8])

# One row per topic, one column per vocabulary term.
topic_term_dist = np.array([
    [0.5, 0, 0.5],
    [0.2, 0.3, 0.5],
])

# One row per document, one column per topic.
doc_topic_dists = np.array([
    [0.1, 0.9],
    [0.8, 0.2],
])

In [66]:
# Build the pyLDAvis data structure from the toy inputs, laying topics out with t-SNE.
prepare_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dist,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_length,
    vocab=vocab,
    term_frequency=term_frequency,
    mds='tsne',
)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [67]:
# Render the prepared visualisation as this cell's output.
pyLDAvis.display(prepare_data)