In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import csv
from sklearn import svm
import sklearn.feature_extraction.text
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
from sklearn import preprocessing
from sklearn.decomposition import NMF, LatentDirichletAllocation
import re
import string
import graphviz
import sklearn.datasets

# Turn matplotlib's interactive mode off so figures are only drawn when
# explicitly shown.
plt.ioff()
# Passing None removes pandas' limit on the number of columns displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Fetch the 20 Newsgroups corpus. No `subset` is passed, so the library
# default applies. Headers, footers and quoted replies are stripped so that
# only the message bodies remain.
text_dataset = sklearn.datasets.fetch_20newsgroups(
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

# `.data` holds the raw document strings; later cells work on this list.
text_data = text_dataset.data
print('Number of Documents: {}'.format(len(text_data)))

Number of Documents: 11314


In [3]:
# Display one raw document (index 50) to see what the cleaned text looks like.
text_data[50]

u'I am trying to write an image display program that uses\nthe MIT shared memory extension.  The shared memory segment\ngets allocated and attached to the process with no problem.\nBut the program crashes at the first call to XShmPutImage,\nwith the following message:\n\nX Error of failed request:  BadShmSeg (invalid shared segment parameter)\n  Major opcode of failed request:  133 (MIT-SHM)\n  Minor opcode of failed request:  3 (X_ShmPutImage)\n  Segment id in failed request 0x0\n  Serial number of failed request:  741\n  Current serial number in output stream:  742\n\nLike I said, I did error checking on all the calls to shmget\nand shmat that are necessary to create the shared memory\nsegment, as well as checking XShmAttach.  There are no\nproblems.\n\nIf anybody has had the same problem or has used MIT-SHM without\nhaving the same problem, please let me know.\n\nBy the way, I am running OpenWindows 3.0 on a Sun Sparc2.'

In [4]:
# Size of the n-grams used as features; the same value is used for the lower
# and upper bound, so with 1 the features are single words.
ngram_size = 1

# Bag-of-words vectorizer:
#   * min_df=0.01 drops terms that appear in fewer than 1% of the documents,
#   * max_df=0.9 drops terms that appear in more than 90% of the documents,
#   * stop_words='english' removes scikit-learn's built-in English stop words.
vect = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(ngram_size, ngram_size),
    min_df=0.01,
    max_df=0.9,
    stop_words='english',
)

# Learn the vocabulary and build the document-term count matrix in a single
# pass over the corpus (equivalent to fit() followed by transform(), but the
# documents are only tokenized once).
bow = vect.fit_transform(text_data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Support both so the cell runs on old and
# new installations; `vocab` stays a plain list either way.
if hasattr(vect, 'get_feature_names_out'):
    vocab = list(vect.get_feature_names_out())
else:
    vocab = vect.get_feature_names()

In [5]:
def print_topics(lda_model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Each topic is written as ``Topic #<index>: <word> , <word> , ...`` with
    the words ordered from highest to lowest weight, followed by one blank
    line.

    Parameters
    ----------
    lda_model : object
        Fitted model exposing ``components_``, an iterable with one array of
        per-feature weights for each topic.
    feature_names : sequence of str
        Feature names indexed consistently with the entries of each topic
        array.
    n_top_words : int
        Number of words to print per topic.
    """
    for topic_index, topic in enumerate(lda_model.components_):
        # argsort() is ascending, so a reversed slice of its tail yields the
        # indices of the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        words = " , ".join(feature_names[i] for i in top_indices)
        print("Topic #{}: {}".format(topic_index, words))
        # A bare `print` only emits a newline as a Python 2 statement; under
        # Python 3 it is a no-op expression. print("") writes the blank
        # separator line on both.
        print("")

In [6]:
# Number of topics to learn, and how many words to list per topic when the
# topics are printed.
n_components = 15
n_top_words = 25

# LDA topic model with the 'online' learning method; random_state is fixed so
# that repeated runs produce the same topics.
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=20,
    learning_method='online',
    learning_offset=5.0,
    random_state=0,
)

# Fit on the bag-of-words matrix. Kept as the trailing expression so the
# fitted estimator's repr is shown as the cell output.
lda.fit(bow)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=5.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_components=15, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [7]:
print("Topics in LDA model:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Support both so the cell runs on old and
# new installations; the result is a plain list either way.
if hasattr(vect, 'get_feature_names_out'):
    tf_feature_names = list(vect.get_feature_names_out())
else:
    tf_feature_names = vect.get_feature_names()
# List the n_top_words highest-weighted vocabulary terms of every topic.
print_topics(lda, tf_feature_names, n_top_words)

Topics in LDA model:
Topic #0: states , american , israel , jews , war , national , air , world , israeli , health , united , press , anti , members , april , medical , force , white , land , washington , house , america , south , action , jewish

Topic #1: said , did , didn , time , years , right , went , home , got , came , left , just , ago , told , know , going , took , let , says , started , saw , remember , year , later , say

Topic #2: question , true , does , answer , different , problem , point , case , use , simply , used , work , correct , read , argument , non , note , value , questions , water , means , cause , rules , way , time

Topic #3: file , windows , use , program , files , software , window , ftp , version , dos , available , code , image , pc , graphics , server , mail , pub , info , application , using , display , thanks , set , format

Topic #4: people , mr , state , president , person , day , government , country , books , children , book , rights , women , day

In [9]:
# Pick one document and run it through the fitted vectorizer and LDA model.
test_document = text_data[10]
# transform() expects an iterable of documents, hence the one-element list.
test_document_bow = vect.transform([test_document])
# Only one document was passed in, so row 0 holds its per-topic weights.
topic_predictions = lda.transform(test_document_bow)[0]
print("Document: '{}'\n".format(test_document))
print("Topic Distribution:\n")
# Pair every weight with its topic index so the output reads (topic, weight).
print(list(enumerate(topic_predictions)))

Document: 'I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs
very well, paint is the bronze/brown/orange faded out, leaks a bit of oil
and pops out of 1st with hard accel.  The shop will fix trans and oil 
leak.  They sold the bike to the 1 and only owner.  They want $3495, and
I am thinking more like $3K.  Any opinions out there?  Please email me.
Thanks.  It would be a nice stable mate to the Beemer.  Then I'll get
a jap bike and call myself Axis Motors!

-- 
-----------------------------------------------------------------------
"Tuba" (Irwin)      "I honk therefore I am"     CompuTrac-Richardson,Tx
irwin@cmptrc.lonestar.org    DoD #0826          (R75/6)'

Topic Distribution:

[(0, 0.0035087719312709448), (1, 0.003508773820594445), (2, 0.0035087761618828495), (3, 0.0035087753019973512), (4, 0.0035087725307880262), (5, 0.0035087760903129359), (6, 0.003508776751601177), (7, 0.80097593994192495), (8, 0.0035087747573197487), (9, 0.0035087746699556275), (10, 0.0983