In [1]:
%load_ext autoreload
%autoreload 2

In [55]:
import nlp
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Settings shared by the topic-model cells below.
# n_samples is only read by the progress messages of the pasted example cell
# at the end of the notebook; the cells that fit on `text` never slice by it.
n_samples = 2000
# Passed as max_features to both vectorizers (vocabulary size cap).
n_features = 1000
# Number of topics requested from NMF and LDA.
n_components = 10
# Number of words printed per topic by print_top_words.
n_top_words = 20

# Load the article corpus through the project's own `nlp` helper module.
# NOTE(review): presumably each corpus item is a sequence whose element 1 is
# the article body text — confirm against nlp.load_all.
corpus, labels = nlp.load_all('articles_db.db')
text = [article[1] for article in corpus]

In [56]:
def print_top_words(model, feature_names, n_top_words):
    """Print the model's repr followed by the top words of each topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: Number of highest-weighted words listed per topic.

    Returns:
        None. All output goes to standard output.
    """
    print('Topics in:')
    print(str(model))
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[idx] for idx in ranked]
        print(f"----------  Topic #{number}:  ----------\n" + " ".join(words))
    print()

In [59]:
# Turn the article texts into tf-idf features: English stop words are dropped,
# as are terms found in more than 95% of the documents or in fewer than two,
# and the vocabulary is capped at n_features terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Factorise the tf-idf matrix into n_components topics. The seed is fixed so
# the printed topics are repeatable between runs.
nmf = NMF(n_components=n_components, l1_ratio=.5, alpha=.1, random_state=1)
nmf.fit(tfidf)

print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics in:
NMF(alpha=0.1, beta_loss='frobenius', init=None, l1_ratio=0.5, max_iter=200,
    n_components=10, random_state=1, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)
----------  Topic #0:  ----------
deal brexit lawmakers parliament britain vote said european party extension minister delay government house prime conservative leaders ireland commons agreement
----------  Topic #1:  ----------
image click nbcnews view com http newscms jpg www https text brexit storyline theresa media2 media4 referendum britain minister prime
----------  Topic #2:  ----------
obama influence britons challenges published president visit today newspaper britain world united global allies war london europe states trip offering
----------  Topic #3:  ----------
leadsom cameron party minister johnson prime secretary conservative gove leader country job leadership said britain members announced home race new
----------  Topic #4:  ----------
trump said visit president interview sun conference trip

In [61]:
# Keep the first article's text as a sample document.
# NOTE(review): no later cell visible here reads test_article — presumably it
# was meant for a transform/predict experiment; confirm or remove.
test_article = text[0]

In [72]:
# NOTE(review): this cell held only the incomplete expression `lda.` (a
# SyntaxError), and it sits before the cell that creates `lda`, so it cannot
# run on a fresh kernel. The single float recorded as its output suggests the
# call was presumably `lda.score(tf)` — confirm, then restore it in a cell
# placed after the LDA model has been fitted.

-319809.6938591505

In [73]:
# Raw term counts for LDA, filtered the same way as the tf-idf features:
# English stop words removed, terms in more than 95% of the documents or in
# fewer than two dropped, vocabulary capped at n_features terms.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(text)
tf_feature_names = tf_vectorizer.get_feature_names()

# Online LDA with a fixed seed; only five passes are requested (max_iter=5).
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=1,
)
lda.fit(tf)

print_top_words(lda, tf_feature_names, n_top_words)

Topics in:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)
----------  Topic #0:  ----------
brexit deal britain eu european said parliament british union minister prime lawmakers leaders vote party agreement leave theresa uk delay
----------  Topic #1:  ----------
deal vote said mps brexit eu european party uk labour referendum union result scotland street tweeted remain parliament united london
----------  Topic #2:  ----------
britain party brexit minister said election leave corbyn labour prime people british leader eu vote voters june union new europe

In [75]:
from hdbscan import HDBSCAN



In [76]:
# HDBSCAN clusterer with every other setting left at its default;
# min_cluster_size=2 asks for groups of as few as two documents to count as a
# cluster.
dbs = HDBSCAN(min_cluster_size=2)

In [98]:
from collections import Counter

In [99]:
# Cluster the raw term-count matrix `tf` and tally the documents per label.
# In the recorded run most documents received label -1.
# NOTE(review): -1 is presumably HDBSCAN's "noise / unclustered" label —
# confirm in the hdbscan docs before interpreting the counts.
dbs_labels = dbs.fit_predict(tf)

Counter(dbs_labels)

Counter({-1: 162, 3: 3, 5: 3, 4: 2, 6: 20, 7: 2, 2: 2, 0: 2, 1: 2})

In [100]:
# NOTE(review): this cell held only the incomplete expression `dbs.` (a
# SyntaxError as written). The recorded ValueError came from whichever
# attribute or method call was here before it was truncated; that call cannot
# be recovered from the notebook. Restore the intended expression or delete
# this cell.

ValueError: setting an array element with a sequence.

In [None]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Settings for the example script below. They rebind the names already set in
# the notebook's first setup cell, with the same values.
# n_samples is only interpolated into the progress messages in this cell.
n_samples = 2000
# Vocabulary size cap passed to both vectorizers as max_features.
n_features = 1000
# Number of topics for every NMF / LDA model fitted below.
n_components = 10
# Number of words printed per topic.
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic with its highest-weighted words.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: Number of words listed for each topic.

    Returns:
        None. The lines, followed by one blank line, go to standard output.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Reverse the ascending argsort to rank indices by decreasing weight.
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_words = " ".join(feature_names[i] for i in top_indices)
        print(f"Topic #{topic_idx}: {top_words}")
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

# Imported here because this cell is a self-contained copy of the example
# script and nothing else in the notebook needs the loader.
from sklearn.datasets import fetch_20newsgroups

# `data_samples` must exist before either vectorizer is fitted. The loader
# fetches the corpus on first use; the first n_samples shuffled posts are kept.
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the first NMF model on the tf-idf matrix. No beta_loss is passed, so the
# estimator's default loss is used (the message labels it the Frobenius norm).
# The n_samples value in the message is the configured constant, not a count
# taken from the fitted matrix.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# The timer starts before construction, so it covers both building and fitting.
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit a second NMF model on the same tf-idf matrix, this time with the
# generalized Kullback-Leibler loss, the multiplicative-update ('mu') solver
# and max_iter=1000. Rebinding `nmf` discards the first model, whose topics
# have already been printed above.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# Recomputed from the same, unchanged vectorizer, so it yields the same
# vocabulary as the call above.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit LDA on the raw term-count matrix `tf`. As in the NMF messages, the
# n_samples value printed here is the configured constant.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# This uses random_state=0, whereas the notebook's own LDA cell above uses
# random_state=1, so the two runs are seeded differently.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Unlike the NMF sections, the timer starts after construction, so only the
# fit() call is timed.
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# Vocabulary of the count vectorizer, index-aligned with the columns of `tf`.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)