In [None]:
# Cell 1: Import requirements

import json
import os
import string
import urllib
import urllib.parse
import urllib.request
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML, display
%matplotlib inline

# NLP libraries
import spacy
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore, LdaModel

In [None]:
# Cell 2: Querying Solr and building our corpus out of the matching documents 

# terms = ['ice', 'climate'] to include only abstracts with specified terms
terms = ['*']
years = ['2018','2017']
entities = ['*']
sessions = ['NH']

# Label describing this query; a later cell embeds it in an output file name.
# Every value is separated by '_' and the match-all marker '*' is spelled out,
# because '*' is not a valid file-name character on every platform.
parameters = '_'.join(years + terms + sessions).replace('*', 'all')

# Return "page_size" documents with each Solr query until complete
page_size = 5000
cursorMark = '*'

solr_documents = []
solr_root = 'http://integration.pdi-solr.labs.nsidc.org/solr/egu/select?indent=on&'


def _as_wildcards(values):
    """Wrap every value in '*' so it matches as a substring.

    An empty list, or a list whose first entry is '*', means "match everything"
    and is returned as ['*'].
    """
    if not values or values[0] == '*':
        return ['*']
    return ['*' + value + '*' for value in values]


terms_wildcard = _as_wildcards(terms)
sessions_wildcard = _as_wildcards(sessions)
entities_wildcard = _as_wildcards(entities)

# Each list becomes "v1 OR field:v2 OR ..."; the leading "field:" is supplied
# by the query template below.
terms_query = '%20OR%20abstract:'.join(terms_wildcard)
years_query = '%20OR%20year:'.join(years)
entities_query = '%20OR%20entities:'.join(entities_wildcard)
sessions_query = '%20OR%20sessions:'.join(sessions_wildcard)
query_string = ('q=(abstract:{})%20AND%20(year:{})'
                '%20AND%20(entities:{})%20AND%20(sessions:{})'
                '&wt=json&rows={}&cursorMark={}&sort=id+asc')

# Cursor-based paging: request pages until Solr hands back the same cursor mark
# that was sent, which is how this loop detects the end of the result set.
while True:
    solr_query = query_string.format(terms_query,
                                     years_query,
                                     entities_query,
                                     sessions_query,
                                     page_size,
                                     # The mark is an opaque string that may hold
                                     # characters such as '+', '/' or '=' which
                                     # would be misread inside a URL.
                                     urllib.parse.quote(cursorMark, safe=''))
    solr_url = solr_root + solr_query
    print('Querying: \n' + solr_url)
    req = urllib.request.Request(solr_url)
    # parsing response; the context manager closes the connection after each page
    with urllib.request.urlopen(req) as response:
        json_response = json.loads(response.read().decode('utf-8'))
    solr_documents.extend(json_response['response']['docs'])
    nextCursorMark = json_response['nextCursorMark']
    if nextCursorMark == cursorMark:
        break
    cursorMark = nextCursorMark

total_found = json_response['response']['numFound']
print("Processing {0} out of {1} total. \n".format(len(solr_documents), total_found))

In [None]:
# Cell 3, remove stop words and create an array of documents

import string

# Corpus-specific tokens that are dropped from every document
my_stop_words = {'et_al','change'}

def remove_stop_words(text):
    """Filter a sequence of tokens.

    Keeps, in their original order, the tokens that are not listed in
    `my_stop_words` and that are longer than two characters.
    """
    return [token for token in text
            if token not in my_stop_words and len(token) > 2]

# One metadata record per Solr document (id, tokens, year, title, category, sessions)
document_list = []
# bigram corpus will contain an array of documents and their tokens, with bigram tokens included
bigram_corpus = []

for doc in solr_documents:
    # The first 'bigrams' value is a whitespace-separated token string
    bigrams = remove_stop_words(doc['bigrams'][0].split())
    # 'sessions' and 'category' may be missing from a document. Per-document
    # names are used so the `sessions` query filter defined in the query cell
    # is not overwritten by this loop.
    doc_sessions = doc['sessions'][0] if 'sessions' in doc else 'NAN'
    doc_category = doc['category'][0] if 'category' in doc else 'NAN'
    document_list.append({ 'id': doc['id'],
                           'text': bigrams,
                           'year': str(doc['year'][0]),
                           'title': doc['title'][0],
                           'category': doc_category.replace('<',''),
                           'sessions': doc_sessions})
    bigram_corpus.append(bigrams)

# Naming the columns keeps the frame well-formed (and the groupby below valid)
# even when the query returned no documents.
df = pd.DataFrame(document_list,
                  columns=['id', 'text', 'year', 'title', 'category', 'sessions'])
# Number of documents per (category, year) pair
axis_category = (df.groupby(['category', 'year'])['category']
                   .count()
                   .rename('count')
                   .to_frame())
print(axis_category)

In [None]:
# Cell 4: Using GENSIM to do topic modelling

from IPython.core.display import display, HTML

# num_passes should be adjusted; the value below is just a guesstimate of when
# convergence will be achieved.
num_passes = 2
num_topics = 20
words_per_topic = 9
print_topics = False
# Seed handed to the LDA model so that reruns are comparable.
# NOTE(review): multi-worker training may still vary slightly between runs.
lda_random_state = 100

filename = 'pyldaviz/' + parameters + '_passes' + str(num_passes) + '_topics' + str(num_topics) + '.html'

# Map every token to an integer id, then turn each document into a bag of words
dictionary = corpora.Dictionary(bigram_corpus)
lda_corpus = [dictionary.doc2bow(text) for text in bigram_corpus]

lda_model = LdaMulticore(lda_corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         alpha=0.6,
                         passes=num_passes,
                         workers=2,
                         random_state=lda_random_state
                        )

topics = lda_model.print_topics(num_topics=num_topics, num_words=words_per_topic)
if print_topics:
    print ("Topic List: \n")
    for topic_id, topic_terms in topics:
        # Topics are shown 1-based here
        print('Topic ' + str(int(topic_id) + 1) + ': ', topic_terms)

# NOTE(review): this silences every warning for the rest of the session,
# not only the ones raised while preparing the visualisation.
warnings.filterwarnings('ignore')

import pyLDAvis
try:
    # Module name used by pyLDAvis 3.x
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    # Module name used by older pyLDAvis releases
    import pyLDAvis.gensim as pyldavis_gensim

pyLDAvis.enable_notebook()
ldaviz = pyldavis_gensim.prepare(corpus=lda_corpus,
                                 topic_model=lda_model,
                                 dictionary=dictionary,
                                 sort_topics=False)

# Write the standalone HTML first so the link displayed below points at an
# existing file; create the target folder when it is missing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
pyLDAvis.save_html(ldaviz, filename)

print ("\nPyLDAVis: \n")
print('link to file: ')
display(HTML('<a href="{}" target="_blank">PyLDAviz</a> '.format(filename)))
pyLDAvis.display(ldaviz)

In [None]:
# Cell 5: listing topic terms ranked using lambda

topic = '1'
topic_lambda = 0.6
terms_number = 30

# Rank the terms of the selected topic by relevance and keep the best ones:
#   relevance = topic_lambda * logprob + (1 - topic_lambda) * loglift
# as implemented on https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/_prepare.py
# The helper column 'r' only drives the ordering and is removed afterwards.
df = (
    ldaviz.topic_info
    .loc[lambda info: info.Category == 'Topic' + topic]
    .assign(r=lambda rows: topic_lambda * rows['logprob'] + (1 - topic_lambda) * rows['loglift'])
    .sort_values('r', ascending=False)
    .drop(columns='r')
    .head(terms_number)
)

df

In [None]:
# Cell 6: Listing papers containing a particular ngram

# n-grams (unigrams or bigrams) a paper has to contain to be listed
terms = {'carbon_cycle'}

# maximum number of papers to list
top_n = 10

def createLink(doc):
    """Build the URL of an abstract's PDF from the document's 'year' and 'id' entries."""
    return ''.join(('https://meetingorganizer.copernicus.org/EGU',
                    str(doc['year']),
                    '/',
                    doc['id'],
                    '.pdf'))

# Keep the documents whose tokens contain every requested n-gram
matches = [doc for doc in document_list if terms.issubset(doc['text'])]

# Show a link to each of the first `top_n` matching abstracts
for doc in matches[0:top_n]:
    display(HTML('<br>Abstract <a href="{}" target="_blank">{}</a> '.format(
        createLink(doc),
        doc['id'])))


### Now that we have a trained model we can classify a new unseen document.

In [None]:
# Cell 7: Classifying an unseen document using our GENSIM model

# A mocked-up document is used here for practical purposes; the text to classify
# could just as well be fetched from Solr or another store.
# Eventually all of this should be exposed as a web service.
# taken from https://meetingorganizer.copernicus.org/EGU2018/EGU2014-2415.pdf

unseen_document = """
Waves  in  the  Southern  Ocean  are  the  largest  in  the  planet.  In  the  Southern  Hemisphere,  the  absence  of  large
landmasses at high latitudes allows the wind to feed energy into the ocean over a virtually unlimited fetch. The
enormous amount air-sea momentum exchanged over the Southern Ocean plays a substantial role on the global
climate. However, large biases affect the estimation of wave regime around the Antarctic continent making climate
prediction susceptible to uncertainty.
 """

# NOTE(review): the text is only split on whitespace, so case and punctuation are
# kept and no bigrams are formed - confirm this matches how the training tokens
# were produced, otherwise few tokens will be found in the dictionary.
parsed_doc = unseen_document.split()
vec = dictionary.doc2bow(parsed_doc)
# Shift topic ids by one so they are reported 1-based
predicted_topics = [(topic_id + 1, weight) for topic_id, weight in lda_model[vec]]
print(predicted_topics)

### Benchmarking Model Coherence 

In [None]:
# Cell 8: Plotting model coherence; this takes some time depending on the number of iterations and the model used.


# lda_model.log_perplexity(lda_corpus)

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.models import CoherenceModel


def compute_coherence_values(dictionary, corpus, limit, start=2, step=3,
                             texts=None, passes=5, workers=8, random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LdaMulticore model is trained for every topic count in
    range(start, limit, step) and scored with a CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Max num of topics (exclusive)
    start : First number of topics to try
    step : Increment between two consecutive numbers of topics
    texts : List of input texts (tokenised documents) used by the c_v measure.
            When omitted, the notebook-level `bigram_corpus` is used.
    passes : Number of training passes for every model
    workers : Number of worker processes for every model
    random_state : Optional seed forwarded to every model

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    if texts is None:
        # Backward compatibility: callers that do not pass `texts` keep scoring
        # against the notebook-level token lists.
        texts = bigram_corpus

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print("Processing {0} topics \n".format(num_topics))

        model = LdaMulticore(corpus,
                             num_topics=num_topics,
                             id2word=dictionary,
                             passes=passes,
                             workers=workers,
                             random_state=random_state
                            )

        model_list.append(model)
        # Score against the arguments of this function rather than notebook globals
        coherencemodel = CoherenceModel(model=model,
                                        corpus=corpus,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Sweep bounds, defined once so the trained models and the x axis of the plot
# are always built from the same values.
start = 3
limit = 30
step = 5

model_list, coherence_values = compute_coherence_values(dictionary=dictionary,
                                                        corpus=lda_corpus,
                                                        start=start,
                                                        limit=limit,
                                                        step=step)

# One coherence score per number of topics tried
x = range(start, limit, step)
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The label is attached to the line itself; a bare string passed to legend()
# is treated as a sequence of one-character labels.
plt.legend(loc='best')
plt.show()
