# Set up our libraries

In [1]:
import time
nb_start_time = time.time()

import pandas as pd
import numpy as np
import re
import nltk
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download('stopwords') #download the latest stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bryant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# import our data

df = pd.read_csv('mergedProjectsAbstracts.csv',encoding='utf-8-sig')

print(df.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(1040239, 26)


# Part 1. Project Terms Contain 'Opioid'

The fedreporter data have a project_terms field. A subset of projects
contain 'opioid' in that field. Let's make a flag for that.


In [3]:
st = time.time()

df['projectTermsFlag'] = pd.to_numeric(np.where(
    df['PROJECT_TERMS'].str.contains("opioid", case=False, na=False), 1, ''))

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - st)))

00:00:17


In [4]:
print('% flagged: ' + str(100*df['projectTermsFlag'].value_counts()[1]/len(df)))

% flagged: 0.8569184581620185


# Part 2. 'Wiki Approach'

If we read through the [opioid wikipedia page](https://en.wikipedia.org/wiki/Opioid) we can extract words to highlight in project abstracts that might indicate it is a project related to opioids.

In [None]:
# terms/n-grams gleaned from wikipedia
opioid_terms = ['opioid','opiate','morphine','heroin',
                'percocet','vicoprofen','dextromethorphan','loperamide',
                'naloxegol','hydrocodone','oxycodone','fentanyl',
                'naloxone','analgesics','carfentanil','benzodiazepines',
                'narcotic','opium','cocaine','codeine',
                'pain relief','cancer pain','anesthesia','chronic pain',
                'nerve pain','fibromyalgia','overdose','addiction',
                'withdrawal','dependence','recreational use','euphoria',
                'tolerance','controlled substance','over-prescription',
                'peripheral nervous system','psychoactive','agonist',
                'antagonist','blood-brain']

In [None]:
# clean up the abstracts for text matching
st = time.time()

# a regex pattern to help eliminate punctuation
nonchars = re.compile( r'\W+|\d+' )

# function to elim punctuation and set to lower case
def clean(text):
    return re.sub(nonchars, " ", text).lower()

# cut the end coding that interferes with cleaning function
# df = df[0:-1]

# drop nulls, as they don't help our analysis
df_denull = df[df.ABSTRACT.notnull()]

# may have to drop weird end coding that interferes with cleaning function
# df_denull = df_denull[0:-1]

df_denull['cleanText'] = df_denull['ABSTRACT'].apply(clean)

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - st)))

00:04:34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
st = time.time()

# Count the appearances of our defined terms in each abstract
def countTerm(text):
    return len(re.findall(term,text))

for term in opioid_terms:
    df_denull[term] = df_denull['cleanText'].apply(countTerm)
    
# sum of all term frequencies by abstract
df_denull['sumTermCounts'] = df_denull[opioid_terms].sum(axis=1)

# set term threshold
wikiThreshold = 2;

def wikiFlag(row):
    if row['sumTermCounts'] > wikiThreshold:
        return 1
    else:
        return None
    
df_denull['wikiTermsFlag'] = df_denull.apply(wikiFlag, axis=1)

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - st)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
df_denull['wikiTermsFlag'].dtype

# Part 3. Text analysis (Topic modeling)

LDA (Latent Dirichlet Allocation) is a model used for discovering abstract topics from a collection of documents. These 'latent' topics can be discovered based on observed data -- words in the documents, in this case.

To surface these topics, we create a matrix where each document is a row and each column is a word in the corpus vocabulary. The corpus vocabulary is the universe of words present in any one or more documents in the corpus (minus chosen 'stopwords' that we consider to provide little information).

Each cell of the matrix would be a count of that word in that document. A variant increases the level of sophistication by using a normalized version of these counts known as TF-IDF. TF stands for term-frequency and TF-IDF is term-frequency times inverse document-frequency. In other words, we are not only looking for how often a word appears in a given document, but also whether this particular word is distinct across all the collections of documents (corpus). For example, intuitively we understand that words like "often" or "use" are more frequently encountered, but they are less informative (more semantically-vacuous) if we want to discern a particular topic of a document, as they might be frequently encounter across all text documents in a corpus. On the other hand, words which we will see less frequently across a collection of document might indicate that those words are specific to a particular document, and, therefore, constitute a basis for a topic.

We provide the model with this matrix and how many topics we want it to use. Think of it like a k-means clustering analog. The model will then iterate a specified number of times considering two distributions; 1) which words in the vocabulary are more or less probable to belong in a given topic and 2) which topic is more or less probable for a given document.

The main assumptions, if this all went over your head:
* each document consists of a mixture of topics, and
* each topic consists of a collection of words.


In [None]:
# set up our corpus (limit number of rows while perfecting code)
df_modeling = df_denull[:5000]
# df_modeling = df_denull # to run on whole thing...

# Get only the text of abstracts
corpus = df_modeling['ABSTRACT'].values.tolist()

In [None]:
# stop words and stemming them
stemmer = nltk.SnowballStemmer('english')
eng_stopwords = stopwords.words('english')
add_stopwords = ['understand','method']
comb_stopwords = eng_stopwords + add_stopwords
stemmed_stopwords = []

for w in comb_stopwords:
    stemmed_stopwords.append(stemmer.stem(w))

In [None]:
start_time = time.time()
'''
Before we can apply LDA, we need to create a vocabulary with all the words in our data
We specify to only include those words that appear in less than 10% of the document 
and appear in at least 0.5% of documents. 
docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

The exercise of running the vectorizer on each year independently, then only
keeping ngrams that all 10 years of abstracts possess is all geared towards breaking up
the computation into manageable chunks that won't crash the kernel.
'''

# prepare our vectorizer
analyzer = CountVectorizer().build_analyzer()

# our stemming function
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words,
                                  max_df=0.10,
                                  min_df=0.005,
#                                   max_features = 1500, # useful for limiting coderun duration
                                  ngram_range = (0,2),
                                  stop_words=stemmed_stopwords)

doc_term_matrix = stem_vectorizer.fit_transform(corpus)
doc_term_features = stem_vectorizer.get_feature_names()

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))


In [None]:
'''
Using below code, we can print out the many, many words excluded due to:
- occurred in too many documents (max_df)
- occurred in too few documents (min_df)
- were cut off by feature selection (max_features)
'''

# print(count_vect.stop_words_)

In [None]:
start_time = time.time()

# Use LDA to create topics.

LDA = LatentDirichletAllocation(n_components=200, random_state=1)  
LDA.fit(doc_term_matrix)

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))


In [None]:
# initialize list for topics
topicList = []

# fill out topics list as top 20 words in each topic
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
    
topicListDf = pd.DataFrame(topicList)
# topicListDf.head()

In [None]:
start_time = time.time()

'''
Assign the probability of all the topics to each document, then
add a column to the original data frame that will store the highest-scoring
topic for that abstract.
'''

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  

'''
take the column number associated with the highest value in a given row, store in our analytical dataframe
'''
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)

'''
store the valence of that prime topic for that abstract as well
'''
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

print("Shape of topic_values: ",(topic_values.shape))

print(time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))


In [None]:
# which topics are most common among projects tagged explicitly?
pd.DataFrame(df_modeling[df_modeling.wikiTermsFlag == 1].primeTopicId.value_counts())[:10]

In [None]:
# what does the top topic consist of?
topicList[77]

In [None]:
print('Time to run whole notebook: ')
elapsed_time = time.time() - nb_start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))