In [27]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups, 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import reuters
import nltk
from gensim import models
import gensim
import lda
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

### sklearn

In [3]:
dataset_sklearn = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [4]:
len(dataset_sklearn.data)

11314

In [5]:
tfidf_vectorizer_sklearn = TfidfVectorizer(max_df=0.95, min_df=0.01, stop_words='english')
tfidf_sklearn = tfidf_vectorizer_sklearn.fit_transform(dataset_sklearn.data)

In [6]:
tf_vectorizer_sklearn = CountVectorizer(max_df=0.95, min_df=0.01, stop_words='english')
tf_sklearn = tf_vectorizer_sklearn.fit_transform(dataset_sklearn.data)

In [7]:
word2id_sklearn = tf_vectorizer_sklearn.vocabulary_

In [8]:
tf_sklearn

<11314x1142 sparse matrix of type '<class 'numpy.int64'>'
	with 313237 stored elements in Compressed Sparse Row format>

In [9]:
id2word_sklearn = {}
for key in word2id_sklearn:
    id2word_sklearn[word2id_sklearn[key]] = key

### nltk

In [40]:
documents = reuters.fileids()

In [41]:
documents

['test/14826',
 'test/14828',
 'test/14829',
 'test/14832',
 'test/14833',
 'test/14839',
 'test/14840',
 'test/14841',
 'test/14842',
 'test/14843',
 'test/14844',
 'test/14849',
 'test/14852',
 'test/14854',
 'test/14858',
 'test/14859',
 'test/14860',
 'test/14861',
 'test/14862',
 'test/14863',
 'test/14865',
 'test/14867',
 'test/14872',
 'test/14873',
 'test/14875',
 'test/14876',
 'test/14877',
 'test/14881',
 'test/14882',
 'test/14885',
 'test/14886',
 'test/14888',
 'test/14890',
 'test/14891',
 'test/14892',
 'test/14899',
 'test/14900',
 'test/14903',
 'test/14904',
 'test/14907',
 'test/14909',
 'test/14911',
 'test/14912',
 'test/14913',
 'test/14918',
 'test/14919',
 'test/14921',
 'test/14922',
 'test/14923',
 'test/14926',
 'test/14928',
 'test/14930',
 'test/14931',
 'test/14932',
 'test/14933',
 'test/14934',
 'test/14941',
 'test/14943',
 'test/14949',
 'test/14951',
 'test/14954',
 'test/14957',
 'test/14958',
 'test/14959',
 'test/14960',
 'test/14962',
 'test/149

In [44]:
reuters.open('test/14862')

AssertionError: 

In [46]:
reuters.words(categories=['barley', 'corn'])

AssertionError: 

In [12]:
dataset_nltk = []
for document in documents:
    dataset_nltk.append(reuters.open(document).readlines())

AssertionError: 

In [11]:
dataset_nltk_processed = []
for dataset in dataset_nltk:
    dataset_nltk_processed.append("".join(dataset))

In [12]:
tfidf_vectorizer_nltk = TfidfVectorizer(max_df=0.95, min_df=0.01, stop_words='english')
tf_nltk = tfidf_vectorizer_nltk.fit_transform(dataset_nltk_processed)

In [13]:
tf_vectorizer_nltk = CountVectorizer(max_df=0.95, min_df=0.01, stop_words='english')
tf_nltk = tf_vectorizer_nltk.fit_transform(dataset_nltk_processed)

In [14]:
word2id_nltk = tf_vectorizer_nltk.vocabulary_

In [15]:
id2word_nltk = {}
for key in word2id_nltk:
    id2word_nltk[word2id_nltk[key]] = key

In [16]:
id2word_nltk[900]

'sources'

## LDA

### gensim

In [21]:
corpus_sklearn = gensim.matutils.Sparse2Corpus(tfidf_sklearn)

In [18]:
corpus_nltk = gensim.matutils.Sparse2Corpus(tf_nltk)

In [22]:
lda_gensim = models.ldamodel.LdaModel(corpus=corpus_sklearn, id2word=id2word_sklearn, num_topics=100)

IndexError: index 1339 is out of bounds for axis 1 with size 1142

In [23]:
lda_gensim_nltk = models.ldamodel.LdaModel(corpus=corpus_nltk, id2word=id2word_nltk, num_topics=100, update_every=1, chunksize=10000, passes=1)

IndexError: index 1041 is out of bounds for axis 1 with size 1017

In [124]:
lda_gensim_nltk.print_topics(20)

[(46,
  '0.016*"comsat" + 0.014*"disdmutase" + 0.014*"dated" + 0.013*"diverting" + 0.012*"0635" + 0.011*"cabbage" + 0.010*"describes" + 0.010*"064" + 0.007*"amd" + 0.006*"edwin"'),
 (70,
  '0.013*"304" + 0.009*"1040" + 0.009*"bbusx" + 0.008*"calmness" + 0.008*"coop" + 0.007*"emirates" + 0.006*"estates" + 0.006*"ccimf" + 0.006*"drug" + 0.006*"contel"'),
 (21,
  '0.026*"acted" + 0.021*"desire" + 0.015*"bossa" + 0.014*"avaialble" + 0.013*"avaition" + 0.013*"coalition" + 0.011*"abundance" + 0.011*"acquiring" + 0.008*"admac" + 0.008*"bancorporation"'),
 (37,
  '0.011*"chpk" + 0.008*"amendend" + 0.008*"boycotts" + 0.008*"bedell" + 0.008*"constitutional" + 0.007*"beach" + 0.007*"beshir" + 0.007*"donated" + 0.007*"envases" + 0.006*"bulletin"'),
 (8,
  '0.023*"303" + 0.021*"auckland" + 0.016*"dragging" + 0.015*"appreciated" + 0.014*"167a" + 0.012*"discrepancies" + 0.012*"308" + 0.008*"coffees" + 0.008*"743" + 0.007*"entitlements"'),
 (62,
  '0.013*"bopa" + 0.013*"balzec" + 0.010*"101" + 0.007*"

In [125]:
lda_gensim_nltk.print_topic(50)

'0.009*"dwg" + 0.007*"colorants" + 0.007*"ard" + 0.007*"artificially" + 0.006*"bearishness" + 0.006*"beck" + 0.006*"beet" + 0.006*"colo" + 0.006*"1315" + 0.006*"1951"'

### pythonhosted lda

In [24]:
X = lda.datasets.load_reuters()

In [25]:
X_vocab = lda.datasets.load_reuters_vocab()
X_titles = lda.datasets.load_reuters_titles()

In [26]:
lda_python = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [27]:
lda_python.fit(X)

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -

<lda.lda.LDA at 0x7f3a5ce10828>

In [28]:
topic_word = lda_python.topic_word_

In [29]:
n_top_words = 8

In [30]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(X_vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: british churchill sale million major letters west
Topic 1: church government political country state people party
Topic 2: elvis king fans presley life concert young
Topic 3: yeltsin russian russia president kremlin moscow michael
Topic 4: pope vatican paul john surgery hospital pontiff
Topic 5: family funeral police miami versace cunanan city
Topic 6: simpson former years court president wife south
Topic 7: order mother successor election nuns church nirmala
Topic 8: charles prince diana royal king queen parker
Topic 9: film french france against bardot paris poster
Topic 10: germany german war nazi letter christian book
Topic 11: east peace prize award timor quebec belo
Topic 12: n't life show told very love television
Topic 13: years year time last church world people
Topic 14: mother teresa heart calcutta charity nun hospital
Topic 15: city salonika capital buddhist cultural vietnam byzantine
Topic 16: music tour opera singer israel people film
Topic 17: church catholic be

### sklearn

In [12]:
lda_sklearn_sklearn = LatentDirichletAllocation(n_topics=100, doc_topic_prior=0.01, topic_word_prior=0.1)

In [13]:
lda_sklearn_sklearn.fit(tf_sklearn)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.01,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=100, perp_tol=0.1,
             random_state=None, topic_word_prior=0.1,
             total_samples=1000000.0, verbose=0)

In [14]:
lda_sklearn_sklearn_transformed = lda_sklearn_sklearn.transform(tf_sklearn)

In [15]:
lda_sklearn_sklearn_transformed.shape

(11314, 100)

In [16]:
lda_sklearn_sklearn.components_.shape

(100, 1142)

In [17]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" %topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:- n_top_words - 1:-1]]))
    print()

In [18]:
def print_top_topics(transformed, model, feature_names, n_top_words, n_top_topics):
    for doc_idx, doc in enumerate(transformed):
        print("Document #%d" %doc_idx)
        print(doc)
        for i in doc.argsort()[: - n_top_topics - 1 : -1]:
            topic = model.components_[i]
            print("Topic #%d:" %i)
            print(" ".join(feature_names[j] for j in topic.argsort()[: - n_top_words - 1 : -1]))
        print()

In [35]:
def cosine_similarity_topics(transformed):
    l = len(transformed)
    i = 0
    cosine_sim = []
    k = 0
    while i < l:
        j = i+1
        while j < l:
            cosine_sim.append(cosine_similarity(transformed[i].reshape(1, -1), transformed[j].reshape(1, -1)))
            j += 1
        i += 1

    return cosine_sim

In [19]:
print_top_words(lda_sklearn_sklearn, tf_vectorizer_sklearn.get_feature_names(), 20)

Topic #0:
limited community provides ask killed human god trying fair sure house interface questions makes able 29 vs likely usual supported
Topic #1:
drive disk hard drives tape floppy cable internal supply power external mb switch access controller use 25 format new original
Topic #2:
evidence claim non possible argument true claims doubt clear certain natural arguments exists prove conclusion proof valid cause heard truth
Topic #3:
read error reading normal yes appears ed wonder save normally michael fix write rest class middle necessary return needed exact
Topic #4:
fbi groups individual prove month final tried problems decided wonder obvious driver likely advice chance friend applications best attempt net
Topic #5:
memory includes thanks sounds make sale machine interested com respond offer 16 digital know contact bike effect like try similar
Topic #6:
people say point think did word don agree mean just said fact different question believe saying words says make sense
Topic #7:
al

In [37]:
sen1 = ["There is a sale going on at a shop near my house", "Dan is a very good writer"]
lda_sklearn_sklearn_transformed_sen1 = lda_sklearn_sklearn.transform(tf_vectorizer_sklearn.transform(sen1))

In [38]:
print_top_topics(lda_sklearn_sklearn_transformed_sen1, lda_sklearn_sklearn, tf_vectorizer_sklearn.get_feature_names(), 20, 5)

Document #0
[ 0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.20200003  0.002       0.002       0.002
  0.40200003  0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.20199994  0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.002       0.002       0.002       0.002
  0.002       0.002       0.002       0.

In [39]:
cosine_similarity_topics(lda_sklearn_sklearn_transformed_sen1)

[array([[ 0.02395653]])]