In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [4]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [5]:
# Removing stop words before calculating TF-IDF using scikitlearn's set of stop words
from sklearn.feature_extraction import text
import nltk
from nltk.corpus import stopwords

myStopWords = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

In [25]:
dataset =  fetch_20newsgroups(shuffle=True,subset='all', random_state=1,remove=('headers', 'footers', 'quotes'))
documents=dataset.data
categories = dataset.target

In [26]:
print (documents[0])
print (len(documents))


18846


In [None]:
n_features = 5000
# with lemmatization
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in CountVectorizer().build_tokenizer()(doc)]

vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=0.85, max_features=n_features, stop_words=myStopWords)
vectorizedData = vectorizer.fit_transform(documents)
transformer = TfidfTransformer()
transformedData = transformer.fit_transform(vectorizedData)

In [None]:
no_topics = 20

In [None]:
# NMF is able to use tf-idf
tfidf = transformedData
tfidf_feature_names = vectorizer.get_feature_names()

In [None]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf = vectorizedData
tf_feature_names = vectorizer.get_feature_names()

In [None]:
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [None]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='batch', learning_offset=50.,random_state=0).fit(tf)

In [None]:
no_top_words = 15
display_topics(nmf, tfidf_feature_names, no_top_words)
print()
display_topics(lda, tf_feature_names, no_top_words)

In [None]:
test_vectorizer = CountVectorizer(vocabulary=tf_feature_names, tokenizer=LemmaTokenizer(), stop_words=myStopWords)
test_vectors = test_vectorizer.fit_transform(documents)
predict = lda.transform(test_vectors)
print(predict.shape)

for i in range(5):
    print(predict[i])

In [None]:
#method 1 
size_training= len(documents)
train_output=[]
# print (train_output[0])
sz=0
for i in categories:
    g_list=[0]*20
    g_list[i]=1
    train_output.append(g_list)
    
print (train_output[0:5])

In [None]:
test_dataset =  fetch_20newsgroups(shuffle=True,subset='test', random_state=1,remove=('headers', 'footers', 'quotes'))
test_doc=test_dataset.data
test_cat= test_dataset.target

In [None]:
test_vectorizedData = vectorizer.fit_transform(test_doc)
test_transformedData = transformer.fit_transform(vectorizedData)

In [None]:
test_tfidf = test_transformedData
tfidf_feature_names = vectorizer.get_feature_names()
test_tf = test_vectorizedData
tf_feature_names = vectorizer.get_feature_names()
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(test_tfidf)
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='batch', learning_offset=50.,random_state=0).fit(test_tf)
no_top_words = 15
display_topics(nmf, tfidf_feature_names, no_top_words)
#print()
display_topics(lda, tf_feature_names, no_top_words)
test_vectorizer = CountVectorizer(vocabulary=tf_feature_names, tokenizer=LemmaTokenizer(), stop_words=myStopWords)
test_vectors = test_vectorizer.fit_transform(test_doc)
test_predict = lda.transform(test_vectors)
#print(predict.shape)

#for i in range(5):
#    print(predict[i])

In [1]:
#method 1 
test_output=[]
# print (train_output[0])
sz=0
for i in test_cat:
    g_list=[0]*20
    g_list[i]=1
    test_output.append(g_list)
    
print (test_output[0:5])

NameError: name 'test_cat' is not defined

In [None]:
# time to train dataset
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
print (gnb.fit(predict,train_output).score(test_predict,test_output))
