In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import lda

from data_analysis.topic_tokenizer import TopicTokenizer
from data_analysis._util import make_document_term_matrix


# Load the `tweets` table from the local SQLite database.
file_ = 'twitter.sqlite3'
database_url = 'sqlite:///{}'.format(file_)
tweets = pd.read_sql_table('tweets', database_url)
# Keep only rows whose is_retweet flag compares equal to False.
tweets = tweets.loc[tweets.is_retweet == False]

In [None]:
# Tokenize the `[0:2000]` slice of the tweet column, one token list per tweet.
tokenizer = TopicTokenizer()
token_list = list(map(tokenizer.tokenize, tweets.tweet[0:2000]))

In [None]:
# Spot-check the tokenizer on a string that contains only a mention and a URL.
sample_text = '@benhoff https://www.googel.com'
tokenizer.tokenize(sample_text)

In [None]:
# Discard falsy entries (e.g. tweets for which the tokenizer returned nothing).
token_list = list(filter(None, token_list))
# Reminder that an empty list is falsy, which is what the filter above relies on.
bool([])

In [None]:
# Build the model input from the remaining token lists. The helper's internals
# are not visible here; presumably document_matrix has one row per document and
# one column per term, and vocabulary maps terms to those columns -- TODO confirm.
document_matrix, vocabulary = make_document_term_matrix(token_list)

In [None]:
# LDA settings: 120 topics, 1500 sampling iterations, fixed random_state.
lda_options = dict(n_topics=120, n_iter=1500, random_state=1)
model = lda.LDA(**lda_options)
model.fit(document_matrix)

In [None]:
# Print the highest-weighted words of every topic, strongest word first.
n_top_words = 8
# NOTE(review): assumes the key order of `vocabulary` matches the column order
# of the document-term matrix -- confirm against make_document_term_matrix.
vocab_array = np.array(list(vocabulary.keys()))
for i, topic_dist in enumerate(model.topic_word_):
    # argsort is ascending, so walk it backwards to rank words by weight.
    # Unlike argpartition this keeps the words ordered and does not raise
    # when the vocabulary has n_top_words terms or fewer.
    top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(vocab_array[top_indices])))

In [None]:
# The model has already been fitted on document_matrix, so read its
# document-topic distribution directly. fit_transform would run the whole
# sampling procedure a second time just to return this same attribute.
classified_data = model.doc_topic_

In [None]:
# Inspect the document-topic output of the model (presumably one row of topic
# weights per document -- confirm against the lda documentation).
print(classified_data)

In [None]:

from collections import Counter

# classified_data is the document-topic matrix produced by the LDA model; its
# rows are arrays, which are unhashable and cannot be counted directly. Reduce
# every document to the index of its highest-weighted topic first. The result
# is stored under a new name so re-running this cell leaves classified_data
# untouched.
dominant_topics = np.asarray(classified_data).argmax(axis=1)
count = Counter(dominant_topics.tolist())
# Rows of (topic index, number of documents), most frequent topic first.
# The reshape keeps the column slicing valid even when nothing was counted.
keys_counts = np.array(count.most_common(), dtype=int).reshape(-1, 2)
keys = keys_counts[:, 0]
counts = keys_counts[:, 1]
print(keys, counts)

In [None]:
from matplotlib import pyplot as plt
# One horizontal bar per entry of `counts`; positions run from len(counts)
# down to 1 so the first entry is drawn at the top.
bar_positions = np.arange(len(counts), 0, -1)
plt.barh(bar_positions, counts)

In [None]:
def get_vocabulary_helper(topic_numbers, number=5):
    """Return the highest-weighted words for each requested topic.

    Reads the notebook-level ``vocabulary`` and ``model`` objects.

    NOTE(review): assumes the key order of ``vocabulary`` matches the column
    order of ``model.topic_word_`` -- confirm against the matrix builder.

    Parameters
    ----------
    topic_numbers : iterable of int
        Row indices into ``model.topic_word_``.
    number : int, optional
        How many words to return per topic (default 5).

    Returns
    -------
    list of numpy.ndarray
        One array of words per requested topic, strongest word first.
    """
    terms = np.array(list(vocabulary.keys()))
    weights = model.topic_word_

    def strongest_terms(topic):
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked = np.argsort(weights[topic])[::-1]
        return terms[ranked[:number]]

    return [strongest_terms(topic) for topic in topic_numbers]

In [None]:
# Top words for every topic in `keys`, in the same order as `keys`.
word_list = get_vocabulary_helper(topic_numbers=keys)

In [None]:
# Write the words of each topic as one line of text on an otherwise blank axis.
fig = plt.figure()
axis = fig.add_subplot(111)
# Figure size in pixels (dpi * inches); only the height is used below.
X, Y = fig.get_dpi() * fig.get_size_inches()
h = Y / (20)

label_style = {'horizontalalignment': 'left', 'verticalalignment': 'center'}
for line_number, topic_words in enumerate(word_list):
    # Lines are spaced h apart, starting one line below the top edge.
    baseline = Y - (line_number * h) - h
    axis.text(0.3, baseline, ' '.join(topic_words), fontsize=(h * 0.8),
              **label_style)

axis.set_ylim(0, Y)
axis.set_axis_off()

In [None]:
# Draw the first `num_print` entries of `counts` as horizontal bars and write
# the matching entry of `word_list` on top of each bar.
fig = plt.figure()
axis = fig.add_subplot(111)
# Figure height in pixels (dpi * inches); used as the extent of the y axis.
Y = (fig.get_dpi() * fig.get_size_inches())[1]
num_print = 10
h = Y / num_print

top_words = word_list[:num_print]
# Rows are spaced h apart from the top edge downwards. Each bar shares the y
# position of its label, so the two stay aligned for any number of rows
# (including none).
y_s = [Y - (row * h) - h for row in range(len(top_words))]
for y, words in zip(y_s, top_words):
    axis.text(1, y, ' '.join(words), fontsize=(h*.5),
              horizontalalignment='left',
              verticalalignment='center',
              color='gold')
# Leave room below zero so the lowest bar is not cut off.
axis.set_ylim(-20, Y)
axis.barh(y_s, counts[:num_print], height=30, color='midnightblue')
axis.get_yaxis().set_visible(False)