In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import json
import urllib.request

## Online
# Solr select endpoint, restricted to the fields this notebook reads.
solr_root = 'http://docker:8983/solr/crawldb/select?fl=url,extracted_text,content-type_t_hd&'
# Match every document (q=*:*) and request the first 200 rows.
# The earlier string 'q=rows=200&start=1' passed "rows=200" as the query text,
# so the row limit was never applied, and start=1 skipped the first result
# (Solr's `start` offset is zero-based).
solr_query = 'q=*:*&rows=200&start=0&wt=json'
solr_url = solr_root + solr_query
req = urllib.request.Request(solr_url)
# parsing response; the context manager closes the HTTP connection after reading
with urllib.request.urlopen(req) as response:
    r = response.read()
json_response = json.loads(r.decode('utf-8'))
solr_documents = json_response['response']['docs']

print("Processing {0} documents. \n".format(len(solr_documents)))

* **Tokenization & Stop Words & Wordnet Lemmatizer** 

In [None]:
## we need a tokenizer
tokenizer = RegexpTokenizer(r'\w+')
## we need a lemmatizer (the variable keeps its historical name `stemmer`)
stemmer = WordNetLemmatizer()
## our custom stop words (all lower case)
my_stop_words = {
                    'http', 'www', 'edu', 'org', 'com', 'rda', 'data', 'researcher', 'event', 'service',
                    'group', 'research', 'community', 'use', 'work', 'member', 'case', 'science',
                    'meeting', 'organisational', 'news', 'plenary', 'recommendation', 'project', 'standard',
                    'statement', 'school', 'university', 'membership', 'output', '2017', 'brokering',
                    'stakeholder', 'repository', 'user', 'citation', 'chair', 'framework', 'information',
                    'metadata', 'content', 'sharing', 'pid', 'type', 'record'
                }
stop_words = my_stop_words.union(ENGLISH_STOP_WORDS)
# document list will contain our corpus after cleaning it.
document_list = []
# pairs is a list of the urls and the size of their content
pairs = []
# just the documents urls
urls = []

# Tokens are lower-cased before the stop-word check. Without that, upper- or
# mixed-case variants such as "PID" slipped past the (lower-case) stop-word set
# and later showed up as topic terms.

for item in solr_documents:
    # If we apply NER it should be the first step.
    # A document without extracted text is kept as an empty document so that
    # document_list, pairs and urls stay index-aligned.
    text = item.get('extracted_text') or ''
    # NOTE(review): assumes a multi-valued Solr field arrives as a list of strings - confirm.
    if isinstance(text, (list, tuple)):
        text = ' '.join(text)
    # We tokenize words and lower case them
    tokens = [token.lower() for token in tokenizer.tokenize(text)]
    # We lemmatize each token, keeping the raw token next to its lemma
    stemmed_tokens = [(token, stemmer.lemmatize(token)) for token in tokens]
    # Keep a lemma only if neither it nor the raw token is a stop word (so a stop
    # word is dropped even when lemmatization changes its spelling) and its
    # length is >2 and <25.
    document = [lemma for token, lemma in stemmed_tokens
                if token not in stop_words
                and lemma not in stop_words
                and 2 < len(lemma) < 25]
    # To debug uncomment the next line
    #print("{0}\n Document size before stop words: {1}, after: {2} ".format(item['url'],len(stemmed_tokens),len(document)))
    document_list.append(document)
    pairs.append((item['url'],len(document)))
    urls.append(item['url'])
    
# Helper that prints the highest-weighted terms of every topic
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its top-weighted feature names.

    model: object exposing ``components_``, iterated as one weight vector per topic.
    feature_names: sequence mapping a feature index to its term.
    no_top_words: how many of the highest-weighted terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in ranked]
        print("Topic %d:" % topic_number)
        print(" " + " ".join(top_terms))
                

* **Run NMF and LDA on the corpus** 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_features = 1000 # Number of representative tokens 
no_topics = 10 # can be changed according to data and need
minimum_likelihood = 0.10 # A topic whose weight in a document exceeds this value is considered present

# The vectorizers expect one string per document, so re-join the cleaned tokens
documents = [' '.join(doc) for doc in document_list]

# Make sure to have enough data in the Solr index in order to avoid an Invalid Index error 
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Run NMF once: fit_transform fits the estimator in place and returns the
# document-topic weights. The previous code trained two identically configured
# models (same random_state), doubling the work for the same result.
nmf_components = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf_documents = nmf_components.fit_transform(tfidf)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()
# Run LDA once, for the same reason as NMF above
lda_components = LatentDirichletAllocation(n_components=no_topics, max_iter=10, learning_method='online', learning_offset=50.,random_state=0)
lda_documents = lda_components.fit_transform(tf)


* **Building Topic Document Matrix for LDA and NMF**

In [None]:
def _build_topic_matrices(document_topics, document_urls, topic_count, threshold):
    """Build the document->topics and topic->documents views for one model.

    document_topics: iterable with one weight vector per document; each vector
        must support ``argsort()`` and integer indexing.
    document_urls: sequence of URLs, index-aligned with ``document_topics``.
    topic_count: number of topics, i.e. the length of the returned per-topic list.
    threshold: a topic is kept for a document only if its weight is strictly
        greater than this value.

    Returns a tuple ``(matrix, topic_documents)``:
      * ``matrix`` - one ``{'url': ..., 'topics': [(weight, topic_index), ...]}``
        dict per document, topics ordered by ascending weight;
      * ``topic_documents`` - one list per topic holding ``(url, weight)`` pairs.
    Weights are rounded to four decimals.
    """
    matrix = []
    topic_documents = [[] for _ in range(topic_count)]
    for idx, doc in enumerate(document_topics):
        url = document_urls[idx]
        topics = [(float("{0:.4f}".format(doc[i])), i)
                  for i in doc.argsort() if doc[i] > threshold]
        for weight, topic_idx in topics:
            topic_documents[topic_idx].append((url, weight))
        matrix.append({'url': url, 'topics': topics})
    return matrix, topic_documents


# The same threshold is applied to both models.
lda_matrix, lda_topic_documents = _build_topic_matrices(
    lda_documents, urls, no_topics, minimum_likelihood)
nmf_matrix, nmf_topic_documents = _build_topic_matrices(
    nmf_documents, urls, no_topics, minimum_likelihood)

Now we have two matrices for each method. For LDA, `lda_topic_documents` is a list with one entry per topic; each entry holds the documents that the model predicted belong to that topic. The second matrix, `lda_matrix`, is the inverse: a list of the documents, each with the topics it talks about and their probabilities. `nmf_topic_documents` and `nmf_matrix` have the same structure for NMF.

In [None]:
# Let's print the topics created by NMF (terms come from the tf-idf vocabulary)
print("\nNMF TOPICS\n ")
display_topics(nmf_components, tfidf_feature_names, no_top_words=8)

# Now we'll print the topics created by LDA. This must use the LDA model with
# the count-vectorizer vocabulary; it previously passed nmf_components, which
# printed NMF topics under the LDA heading.
print("\nLDA TOPICS\n ")
display_topics(lda_components, tf_feature_names, no_top_words=8)


* **We can also see what the models predicted for each topic**

In [None]:
# Topic to list for each model; each index must be smaller than no_topics
lda_topic_index = 1
nmf_topic_index = 9

# can be changed according to data and need, cannot exceed the number of docs processed
document_index = 1

def getkey(doc):
    """Return the weight (second element) of a (url, weight) pair, used as sort key."""
    return doc[1]

# Documents assigned to the chosen NMF topic, ordered by ascending weight
print("NMF Topic {} Contains: \n".format(nmf_topic_index))
for doc in sorted(nmf_topic_documents[nmf_topic_index],key=getkey):
    print (doc)

# Same listing for the chosen LDA topic (lda_topic_index was defined but never used before)
print("\nLDA Topic {} Contains: \n".format(lda_topic_index))
for doc in sorted(lda_topic_documents[lda_topic_index], key=getkey):
    print(doc)

# Print a document and their topics:
print("\nDocument: {0}".format(urls[document_index]))
print(" Topics According to NMF: {0}".format(nmf_matrix[document_index]['topics']))
print(" Topics According to LDA: {0}".format(lda_matrix[document_index]['topics']))