In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus. Headers, footers and
# quoted replies are stripped via `remove=` so that only the message body text
# is kept in `newsgroups_train.data`.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [None]:
# Build the stop-word collection (scikit-learn's English stop words merged with NLTK's) used to remove stop words before calculating TF-IDF
from sklearn.feature_extraction import text
import nltk
from nltk.corpus import stopwords

# NLTK ships its corpora separately from the package; on a fresh environment
# `stopwords.words` raises LookupError until the corpus has been downloaded.
try:
    nltkStopWords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    nltkStopWords = stopwords.words('english')

# Merge scikit-learn's and NLTK's English stop words. The result is a sorted
# *list* rather than a frozenset: recent scikit-learn releases validate the
# `stop_words` parameter and only accept 'english', a list, or None. Sorting
# also makes the printed output deterministic between runs.
myStopWords = sorted(text.ENGLISH_STOP_WORDS.union(nltkStopWords))

print(myStopWords)

In [None]:
# without lemmatization
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(max_df=0.85, stop_words=myStopWords)
# vectors = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
# with lemmatization
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    """Callable tokenizer that splits a document and lemmatizes each token.

    Intended to be passed as the ``tokenizer=`` argument of a scikit-learn
    vectorizer. Tokens are produced by CountVectorizer's default tokenizer and
    each one is then run through NLTK's ``WordNetLemmatizer``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Build the default tokenizer once. Creating a new CountVectorizer and
        # recompiling its token regex on every call is loop-invariant work
        # repeated for each document in the corpus.
        self.tokenize = CountVectorizer().build_tokenizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens found in the string ``doc``."""
        return [self.wnl.lemmatize(t) for t in self.tokenize(doc)]

# Count unigrams and bigrams of lemmatized tokens, dropping stop words and any
# term that appears in more than 85% of the documents.
# `stop_words` is passed as a list: recent scikit-learn releases validate this
# parameter and reject other collection types such as set/frozenset.
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    max_df=0.85,
    stop_words=list(myStopWords),
    ngram_range=(1, 2),
)
vectors = vectorizer.fit_transform(newsgroups_train.data)

# Re-weight the raw counts with TF-IDF. NOTE: `vectors` is rebound here, so
# from this point on it holds TF-IDF weights rather than raw term counts.
transformer = TfidfTransformer()
vectors = transformer.fit_transform(vectors)

In [None]:
# # from pandas import DataFrame
# # print(DataFrame(vectors.A, columns=vectorizer.get_feature_names()).to_string())

# # print(vectorizer.vocabulary_)
# # print(len(vectorizer.vocabulary_))
# cnt = 0
# for k,v in vectorizer.vocabulary_.items():
#     print(k+": "+str(v))
#     cnt += 1
#     if cnt == 100:
#         break


In [None]:
# Using Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Held-out split, stripped of the same metadata as the training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

# Lemmatized pipeline: push the test documents through the already-fitted
# count vectorizer and then through the already-fitted TF-IDF transformer.
# (The non-lemmatized TfidfVectorizer variant needs only the vectorizer step.)
vectors_test = transformer.transform(vectorizer.transform(newsgroups_test.data))

# Fit Multinomial Naive Bayes on the training matrix and score the test
# predictions with the macro-averaged F1.
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro')

In [None]:
import numpy as np
def show_top10(classifier, vectorizer, categories, top_n=15):
    """Print the highest-weighted features of each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights, either as
        ``feature_log_prob_`` (Naive Bayes) or as ``coef_`` (legacy Naive
        Bayes releases / linear models). Row ``i`` is used for category ``i``.
    vectorizer : fitted vectorizer
        Supplies the feature names through ``get_feature_names_out()`` or,
        on older scikit-learn releases, ``get_feature_names()``.
    categories : iterable of str
        Category labels, in the same order as the classifier's weight rows.
    top_n : int, default 15
        How many features to print per category. The default keeps the
        historical output of this function (15, despite its name).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.

    Features are printed in ascending weight order, so the strongest feature
    of a category is the last word on its line.
    """
    if top_n < 1:
        # A slice of [-0:] would silently select *every* feature.
        raise ValueError("top_n must be a positive integer")

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # MultinomialNB no longer exposes coef_ in recent scikit-learn releases;
    # feature_log_prob_ carries the per-class weights instead.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        top_indices = np.argsort(weights[i])[-top_n:]
        print("%s: %s" % (category, " ".join(feature_names[top_indices])))
        print()
# Print the highest-weighted features of the fitted classifier `clf` for each
# newsgroup, labelling each line with the dataset's target names.
show_top10(clf, vectorizer, newsgroups_train.target_names)

In [None]:
# Using gensim LDA for topic modelling
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

# scikit-learn matrices are laid out as (documents x terms), whereas
# Sparse2Corpus assumes by default that documents are *columns*. Without
# documents_columns=False every term would be treated as a document and the
# document indices would be looked up as term ids in `id2word`.
# NOTE(review): `vectors` holds TF-IDF weights at this point, not raw counts;
# LDA is usually fitted on counts - confirm this is intended.
gensimCorpus = Sparse2Corpus(vectors, documents_columns=False)

# Invert the vectorizer vocabulary (term -> column index) into the
# id -> term mapping that gensim expects.
id2word_newsgroups = {index: term for term, index in vectorizer.vocabulary_.items()}

# Fix the random state so the learned topics are reproducible between runs.
lda = LdaModel(corpus=gensimCorpus, num_topics=20, id2word=id2word_newsgroups, random_state=0)

In [None]:
# Display 20 topics of the fitted LDA model (the model was built with
# num_topics=20) as the cell's output.
lda.print_topics(20)