In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import gensim
from gensim import corpora
import numpy as np
import re
from nltk.corpus import stopwords
from pprint import pprint

In [23]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [24]:
with open('4_en.txt', encoding='utf-8') as data:
    docs = data.read().splitlines()
docss = []
for i in range(len(docs)):
    if docs[i] != '':
        docss.append(docs[i])

In [25]:
n_topics = 20
print('Text collection size and median length in symbols:')
print(len(docss), np.median([len(d) for d in docss]))

Text collection size and median length in symbols:
4276 164.0


In [26]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   ngram_range=(2,2),
                                   min_df=5,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(docss)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print()




In [27]:
nmf = NMF(n_components=n_topics)
nmf_doc_topic = nmf.fit_transform(tfidf)
print('NMF doc-topic shape:', nmf_doc_topic.shape)

NMF doc-topic shape: (4276, 20)


In [31]:
# LDA on raw words counts
tf_vectorizer = CountVectorizer(max_df=0.8,
                                ngram_range=(2,2),
                                min_df=5,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(docss)
tf_feature_names = tf_vectorizer.get_feature_names()

In [32]:
lda = LatentDirichletAllocation(n_topics=20)
lda_doc_topic = lda.fit_transform(tf)
print('LDA doc-topic shape:', lda_doc_topic.shape)



LDA doc-topic shape: (4276, 20)


In [33]:
no_top_words = 10
print('\nNMF top terms:')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\nLDA top terms:')
display_topics(lda, tf_feature_names, no_top_words)


NMF top terms:
Topic 0:
don know, know doing, know don, know going, know ve, know happening, really don, know think, know know, just don
Topic 1:
united states, president obama, states going, american people, president united, mexican government, iran deal, donald trump, september 11th, trillions dollars
Topic 2:
hillary clinton, radical islam, clinton campaign, radical islamic, say words, burden hillary, clinton tell, secretary state, islamic terrorism, words radical
Topic 3:
don want, want money, said don, want tell, want know, want don, want want, want people, don need, money don
Topic 4:
make america, america great, great going, right thing, ladies gentlemen, doing know, months ago, want make, white house, going crazy
Topic 5:
ve seen, illegal immigration, seen like, like ve, ships ve, los angeles, know ve, mean ve, common core, happened country
Topic 6:
ve got, got smart, say ve, got ve, got stop, politically correct, nice guy, mean ve, got make, got know
Topic 7:
going happen, h

In [34]:
# gensim LDA - может занять время

stopWords = set(stopwords.words('english'))
tok_collection = []
for d in docss:
    tok_collection.append([w for w in re.split('[\W]+', d) if len(w) > 3 and w not in stopWords])


dictionary = corpora.Dictionary(tok_collection)

corpus = [dictionary.doc2bow(text) for text in tok_collection]

ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=n_topics,
                                           id2word=dictionary)

pprint(ldamodel.print_topics())

[(0,
  '0.022*"want" + 0.015*"trade" + 0.013*"know" + 0.013*"even" + 0.012*"people" '
  '+ 0.012*"fewer" + 0.011*"actually" + 0.010*"easy" + 0.010*"costs" + '
  '0.010*"said"'),
 (1,
  '0.036*"going" + 0.030*"They" + 0.025*"trade" + 0.019*"want" + '
  '0.017*"afraid" + 0.015*"people" + 0.014*"cities" + 0.014*"talk" + '
  '0.013*"report" + 0.012*"jobs"'),
 (2,
  '0.025*"THEY" + 0.021*"TRUMP" + 0.020*"HAVE" + 0.018*"THAT" + 0.018*"GOING" '
  '+ 0.011*"PEOPLE" + 0.010*"WHAT" + 0.008*"Indiana" + 0.008*"people" + '
  '0.008*"SAID"'),
 (3,
  '0.032*"know" + 0.020*"great" + 0.020*"evangelicals" + 0.019*"That" + '
  '0.018*"rate" + 0.018*"Where" + 0.016*"unemployment" + 0.014*"said" + '
  '0.014*"Sharon" + 0.012*"religion"'),
 (4,
  '0.037*"THEY" + 0.033*"HAVE" + 0.030*"GOING" + 0.029*"THAT" + 0.017*"PEOPLE" '
  '+ 0.014*"WHAT" + 0.010*"SAID" + 0.010*"THIS" + 0.010*"WITH" + 0.010*"WANT"'),
 (5,
  '0.022*"people" + 0.021*"love" + 0.019*"know" + 0.016*"They" + '
  '0.014*"dollars" + 0.014*"going