In [17]:
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_lg
import nltk
from nltk.corpus import stopwords
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn  
from gensim import corpora, models, similarities

# Load the large English pipeline and collect NLTK's English stop words.
nlp = spacy.load("en_core_web_lg")
stop_list = set(stopwords.words('english'))

# Merge the NLTK words into the language defaults.
nlp.Defaults.stop_words.update(stop_list)

# Flag every stop word on its vocabulary entry. The explicit union is iterated
# so that the NLTK words are flagged even if STOP_WORDS is not the very same
# set object as nlp.Defaults.stop_words (previously this relied on that aliasing).
for word in STOP_WORDS | stop_list:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def lemmatizer(doc):
    """Pipeline component: rebuild ``doc`` from the lemmas of its tokens.

    Tokens whose lemma is the literal ``'-PRON-'`` are dropped (this is the
    placeholder lemma spaCy 2.x gives pronouns). The remaining lemmas are
    joined with single spaces and handed to ``nlp.make_doc``, whose result
    is returned.
    """
    kept_lemmas = (token.lemma_ for token in doc if token.lemma_ != '-PRON-')
    return nlp.make_doc(u' '.join(kept_lemmas))
    
def remove_stopwords(doc):
    """Pipeline component: return the text of every token worth keeping.

    A token is kept when neither its ``is_stop`` nor its ``is_punct`` flag is
    set. The result is a plain ``list`` of strings (not a Doc), in the
    original token order.
    """
    # Truthiness tests replace the former `!= True` comparisons (PEP 8);
    # for boolean flags the outcome is the same.
    return [token.text for token in doc if not (token.is_stop or token.is_punct)]

# Register the custom components (spaCy 2.x add_pipe signature): lemmatise
# right after NER, then strip stop words / punctuation as the final step.
nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

read_data = pd.read_csv('data.csv')
# Empty CSV cells are read as NaN (a float), which nlp() cannot process;
# drop those rows and make sure every remaining value is a string.
doc = read_data['val'].dropna().astype(str)

# The last pipeline component returns a list of token texts, so nlp(text)
# yields that list here; join it back into one cleaned string per document.
doc_list = [' '.join(nlp(text)) for text in doc]

In [18]:
# Settings for the topic-modelling code below.
n_samples = 2000  # NOTE(review): not referenced in the visible code — confirm it is still needed
n_features = 1000  # passed as max_features to both vectorizers
n_components = 5  # n_components given to the NMF and LDA models
n_top_words = 20  # words shown per topic by print_top_words

def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted features.

    model         -- object exposing ``components_``; each row is treated as
                     one topic's weights.
    feature_names -- sequence mapping a column position to its word.
    n_top_words   -- how many of the largest-weight entries to list per topic.

    A single empty line is printed once all topics have been listed.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + " ".join(words))
    print()


data_samples = doc_list

# Both vectorizers are configured identically; only the weighting differs.
_vectorizer_options = dict(max_df=0.95, min_df=2,
                           max_features=n_features,
                           stop_words='english')

# tf-idf features (input to the two NMF fits).
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
# NOTE(review): get_feature_names() and NMF's `alpha` argument belong to older
# scikit-learn releases — confirm the installed version before upgrading.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Raw term counts (input to LDA).
tf_vectorizer = CountVectorizer(**_vectorizer_options)
tf = tf_vectorizer.fit_transform(data_samples)
tf_feature_names = tf_vectorizer.get_feature_names()

# NMF with the default (Frobenius) loss.
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print_top_words(nmf, tfidf_feature_names, n_top_words)

# NMF with the Kullback-Leibler loss; `nmf` is rebound to this second model.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
          alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)
print_top_words(nmf, tfidf_feature_names, n_top_words)

# LDA on the count matrix.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.000s.
Extracting tf-idf features for NMF...
done in 0.003s.
Extracting tf features for LDA...
done in 0.002s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.022s.

Topics in NMF model (Frobenius norm):
Topic #0: number multiply add digit subtract double single divide total able need space operation order division word door finding foot footage
Topic #1: multiplication skill determine double digit good geometry fraction formula footage foot finding equation door division divide word help decipher compute
Topic #2: subtraction skill digit length width area word finding division door double equation foot footage formula fraction geometry divide decipher determine
Topic #3: addition skill word foot division door double equation finding footage digit formula fraction geometry good height divide determine know decipher
Topic #4: area subtract door know rectangle determine wall total formula finding pe