In [1]:
import time
import pandas as pd
import numpy as np
import re
import nltk
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

nltk.download('stopwords') #download the latest stopwords

notebook_start_time = time.time()

[nltk_data] Downloading package stopwords to
[nltk_data]     /nfshome/bdr299/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
# cd to the directory with data
# %cd '/path/to/your/data'

In [3]:
notebook_start_time = time.time()

# # import csv, if previous cells weren't run this session
start_time = time.time()

file = 'opioidRQ3_constructedDataset.csv'
df = (pd.read_csv(file,skipinitialspace=True,encoding='utf-8',
                 dtype={'PROJECT_ID': object,
                        'PROJECT_TERMS': object,
                        'PROJECT_TITLE': object,
                        'DEPARTMENT': str,
                        'AGENCY': str,
                        'PROJECT_START_DATE': str,
                        'PROJECT_END_DATE': str,
                        'ORGANIZATION_CITY': str,
                        'CFDA_CODE': str,
                        'FY': int,
                        'FY_TOTAL_COST': float,
                        'FY_TOTAL_COST_SUB_PROJECTS': float                     
                       }))

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

  interactivity=interactivity, compiler=compiler, result=result)


00:02:41


In [4]:
start_time = time.time()

df_modeling = df[(df.FY == 2009) | (df.FY == 2010)]
# df_modeling = df[:50000]
df_modeling = df_modeling[['PROJECT_ID', 'FY', 'ABSTRACT','opioid_num','sumTermCounts','tagCompare']]
corpus = df_modeling.ABSTRACT

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [5]:
start_time = time.time()

#stop words
stemmer = nltk.SnowballStemmer("english")
eng_stopwords = stopwords.words('english')
add_stopwords = ['understand','method']

comb_stopwords = eng_stopwords + add_stopwords

stemmed_stopwords = []

for w in comb_stopwords:
    stemmed_stopwords.append(stemmer.stem(w))
    
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [6]:
start_time = time.time()
'''
Before we can apply LDA, we need to create a vocabulary with all the words in our data
We specify to only include those words that appear in less than 10% of the document 
and appear in at least 5% of documents. 
docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

The exercise of running the vectorizer on each year independently, then only
keeping ngrams that all 10 years of abstracts possess is all geared towards breaking up
the computation into manageable chunks that won't crash the kernel.
'''

stemmer = nltk.SnowballStemmer("english")
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words,
                                  max_df=0.10,
                                  min_df=0.005,
#                                   max_features = 1500,
                                  ngram_range = (0,2),
                                  stop_words=stemmed_stopwords)

doc_term_matrix = stem_vectorizer.fit_transform(corpus)
doc_term_features = stem_vectorizer.get_feature_names()
print('doc_term_matrix',doc_term_matrix)
# print('doc_term_features',doc_term_features)

# Using below code, we can print out the many, many words excluded due to:
# - occurred in too many documents (max_df)
# - occurred in too few documents (min_df)
# - were cut off by feature selection (max_features)

# print(count_vect.stop_words_)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

('doc_term_matrix', <219965x2666 sparse matrix of type '<type 'numpy.int64'>'
	with 13335995 stored elements in Compressed Sparse Row format>)
00:22:43


In [7]:
start_time = time.time()

'''
Use LDA to create topics.
'''
LDA = LatentDirichletAllocation(n_components=1000, random_state=1)  
LDA.fit(doc_term_matrix)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



04:25:03


In [50]:
start_time = time.time()

# initialize list for topics
topicList = []

for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
    
topicListDf = pd.DataFrame(topicList)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:00:00


In [None]:
start_time = time.time()

'''
Assign the probability of all the topics to each document, then
add a column to the original data frame that will store the highest-scoring
topic for that abstract.
'''

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  

'''
take the column number associated with the highest value in a given row, store in our analytical dataframe
'''
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)

'''
store the valence of that prime topic for that abstract as well
'''
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

print("Shape of topic_values: ",(topic_values.shape))

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


In [21]:
# which topics are most common among projects tagged explicitly?
pd.DataFrame(df_modeling[df_modeling.tagCompare == 'both'].primeTopicId.value_counts())[:10]

Unnamed: 0,primeTopicId
303,145
764,91
675,84
309,52
115,35
384,32
423,30
504,28
297,27
862,26


In [None]:
# what does the top topic consist of?
topicList[303]

In [35]:
df_modeling.shape

(219965, 8)

In [36]:
df_modeling[(df_modeling.tagCompare == 'both') & (df_modeling.primeTopicId == 303)].sort_values('primeTopicValence',ascending=False)


Unnamed: 0,PROJECT_ID,FY,ABSTRACT,opioid_num,sumTermCounts,tagCompare,primeTopicId,primeTopicValence
178536,353097,2010,DESCRIPTION (provided by applicant): This is a...,1.0,45,both,303,0.224191
71485,286376,2009,DESCRIPTION (provided by applicant): This is a...,1.0,45,both,303,0.224191
174519,347653,2010,DESCRIPTION (provided by applicant): In this...,1.0,27,both,303,0.223223
71641,286895,2009,DESCRIPTION (provided by applicant): In this...,1.0,27,both,303,0.223223
31590,218592,2009,DESCRIPTION (provided by applicant): A little ...,1.0,18,both,303,0.211776
135044,286997,2010,DESCRIPTION (provided by applicant): A little ...,1.0,18,both,303,0.211776
178875,353719,2010,DESCRIPTION (provided by applicant): the propo...,1.0,23,both,303,0.205366
146427,302242,2010,DESCRIPTION (provided by applicant): Opioids a...,1.0,35,both,303,0.204698
35243,223691,2009,DESCRIPTION (provided by applicant): Opioids a...,1.0,35,both,303,0.204698
131800,282186,2010,This proposal for a K23 Mentored Patient-Orie...,1.0,27,both,303,0.203818


In [37]:
df_modeling.ABSTRACT[178536]

u"DESCRIPTION (provided by applicant): This is a new R01 application, submitted in response to RFA-DA-09-017, to study the neurobiology of pain, the neuropharmacology of analgesia, and the interactions between analgesic and abuse-related effects of opioids and other drugs in rats. Pain is a significant public health problem, and opioid analgesics constitute a principal class of drugs used to treat pain. However, the use of existing opioids is limited by side effects that include high abuse liability, and efforts to develop strong analgesics with reduced abuse liability have met with limited success. We and others have argued that improved progress in pain management and analgesic drug development may benefit from research on the neurobiology and neuropharmacology of the affective components of pain. This application is founded on the premises that (1) a cardinal and clinically significant sign of pain is depression of both behavior and mood, and (2) a key goal in pain treatment is a re

In [47]:
df_modeling[(df_modeling.tagCompare == 'neither') & 
           (df_modeling.primeTopicId == 303)].sort_values('primeTopicValence',ascending=False)

Unnamed: 0,PROJECT_ID,FY,ABSTRACT,opioid_num,sumTermCounts,tagCompare,primeTopicId,primeTopicValence
219592,723505,2010,This funding is in support of informatics acti...,,0,neither,303,0.667000
214169,595657,2010,TO UPDATE ANIMAL HEALTH COMPONENT ONLY,,0,neither,303,0.500500
1453,105374,2009,University of Central Florida Improved Aquariu...,,0,neither,303,0.333500
92471,350100,2009,Informatics CoreCore functions1. Register and ...,,0,neither,303,0.333444
201,104116,2009,The Center for Intelligent Information Retriev...,,0,neither,303,0.287442
246,104161,2009,The Center for Intelligent Information Retriev...,,0,neither,303,0.287442
195,104110,2009,The Center for Intelligent Information Retriev...,,0,neither,303,0.287442
105,104020,2009,"FY09 EARMARK ENTITLED, `PLANETARIUM DIGITIZATI...",,0,neither,303,0.285857
124125,131634,2010,We propose to advance the readiness of the thr...,,0,neither,303,0.285857
86133,334649,2009,"Core D, Bioinformatics/Biostatistics is design...",,0,neither,303,0.252240


In [49]:
'''
Reading an example abstract highlighted in topic 828, but missed by both the explicit and wiki methods.
It seems relevant to opioids, indicating that the topic modeling helped include relevant abstracts
missed by our other two methods.
'''
df_modeling.ABSTRACT[201]


u'The Center for Intelligent Information Retrieval at UMass Amherst, the Perseus Digital Library Project at Tufts, and the Internet Archive are investigating large-scale information extraction and retrieval technologies for digitized book collections. To provide effective analysis and search for scholars and the general public, and to handle the diversity and scale of these collections, this project focuses on improvements in seven interlocking technologies: improved OCR accuracy through word spotting, creating probabilistic models using joint distributions of features, and building topic-specific language models across documents; structural metadata extraction, to mine headers, chapters, tables of contents, and indices; linguistic analysis and information extraction, to perform syntactic analysis and entity extraction on noisy OCR output; inferred document relational structure, to mine citations, quotations, translations, and paraphrases; latent topic modeling  through time, to improv

In [None]:
# export csv with progress so far
start_time = time.time()

# df_modeling.to_csv('opioidAnalyticalData_09_topics.csv')

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [45]:
# was opioid in vocabulary?
matching = [s for s in doc_term_features if "opioid" in s]
matching

[u'opioid']

In [None]:
elapsed_time = time.time() - notebook_start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))