In [14]:
# adapted from Julia Lane course and https://stackabuse.com/python-for-nlp-topic-modeling/

import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK stopwords corpus used below (reports "already up-to-date" when a local copy exists).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bryant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the constructed project-level dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): this targets the warning the cell printed on load, which looks like the
# pandas mixed-dtype warning -- confirm it is gone after re-running.
file = 'opioidRQ3_constructedDataset.csv'
df = pd.read_csv(file, skipinitialspace=True, encoding='utf-8', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# One data frame per fiscal year, 2009 through 2018 (df09 ... df18).
(df09, df10, df11, df12, df13,
 df14, df15, df16, df17, df18) = (
    df[df.FY == fiscal_year] for fiscal_year in range(2009, 2019)
)

In [4]:
# Baseline stopword list: NLTK's English stopwords.
eng_stopwords = stopwords.words('english')

# Optional corpus-specific additions. Currently not applied: the vectorizer's max_df
# cutoff is expected to take care of overly common terms for us.
domain_stopwords = [
    'experiments', 'exploration', 'exploratory', 'explore', 'experiment',
    'findings', 'financial', 'experimental', 'finally', 'far',
    'five', 'find', 'extent',
]

# To apply the domain list as well, use: modified_stopwords = eng_stopwords + domain_stopwords
modified_stopwords = eng_stopwords

In [5]:
# '''
# This section lays out what is happening on small chunks of data and is for explanation purposes only
# '''
# # subset to a few rows on three years' of data for fast run
# df09_small = df09[0:500]
# df10_small = df10[0:500]
# df11_small = df11[0:500]

# '''
# define a function that breaks each abstract into words/n-grams. 
# Each row is an abstract, each column a ngram, and each cell 
# indicates whether the ngram of that column is present in that row's abstract
# '''
# count_vect = CountVectorizer(
#     max_df=0.05, 
#     min_df=10, 
#     ngram_range = (0,2),
# #     max_features = 500,
#     stop_words=modified_stopwords)

# # run the function 2009 and turn resulting sparse matrix into a dataframe
# doc_term_matrix_09 = count_vect.fit_transform(df09_small['ABSTRACT'].values.astype('U'))
# doc_term_df_09 = pd.DataFrame(doc_term_matrix_09.toarray(),columns=count_vect.get_feature_names(),index=df09_small.PROJECT_ID)

# doc_term_matrix_10 = count_vect.fit_transform(df10_small['ABSTRACT'].values.astype('U'))
# doc_term_df_10 = pd.DataFrame(doc_term_matrix_10.toarray(),columns=count_vect.get_feature_names(),index=df10_small.PROJECT_ID)

# doc_term_matrix_11 = count_vect.fit_transform(df11_small['ABSTRACT'].values.astype('U'))
# doc_term_df_11 = pd.DataFrame(doc_term_matrix_11.toarray(),columns=count_vect.get_feature_names(),index=df11_small.PROJECT_ID)

# # # append the resulting dataframes together
# result_raw = doc_term_df_09.append([doc_term_df_10, doc_term_df_11],sort=False)

# # # keep only n-grams that appear in every year of data
# result_ready = result_raw.dropna(axis=1)

# # # run LDA
# LDA = LatentDirichletAllocation(n_components=100, random_state=1)  
# LDA.fit(result_ready) 

# '''
# by design, each topic consists of all words in the vocabulary, along with
# probability values. Here we print the 15 words with the highest probability value within each topic.
# '''
# # initialize list for topics
# topicList = []

# for i,topic in enumerate(LDA.components_):  
#     ithTopic = [result_ready.columns[i] for i in topic.argsort()[-15:]]
#     topicList.append(ithTopic)
    
# pd.DataFrame(topicList)



In [6]:
# Actual run.
#
# Before we can apply LDA, we need to create a vocabulary of all the words in our data.
# We only include words/n-grams that appear in at most 10% of the documents (max_df)
# and in at least 0.5% of the documents (min_df).
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#
# The vectorizer is fit on each year independently, and only the n-grams that all
# 10 years of abstracts possess are kept. This breaks the computation into
# manageable chunks that won't crash the kernel.

# define our vectorizer
count_vect = CountVectorizer(
    max_df=0.10,
    min_df=0.005,
    max_features=1500,
    # unigrams and bigrams; an n-gram has at least one word, so the lower bound is 1
    ngram_range=(1, 2),
    stop_words=modified_stopwords)


def vectorize_chunk(chunk):
    """Fit count_vect on one year's abstracts and return a dense document-term frame.

    Rows are indexed by the chunk's PROJECT_ID; columns are the n-grams of the
    vocabulary fitted on that chunk.
    """
    doc_term_matrix = count_vect.fit_transform(chunk['ABSTRACT'].values.astype('U'))
    # newer scikit-learn replaced get_feature_names() with get_feature_names_out()
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out()
    else:
        feature_names = count_vect.get_feature_names()
    return pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names, index=chunk.PROJECT_ID)


# the latter 9 years; 2009 seeds the base dataframe that the loop extends
chunks = [df10, df11, df12, df13, df14, df15, df16, df17, df18]
base_doc_term_df = vectorize_chunk(df09)

# vectorize and append rest of the years
for chunk in chunks:
    doc_term_df = vectorize_chunk(chunk)
    # only keep words/n-grams that appear in every year of data so far; selecting the
    # shared columns before concatenating avoids building a NaN-padded union frame
    in_both = base_doc_term_df.columns.isin(doc_term_df.columns)
    shared_ngrams = base_doc_term_df.columns[in_both]
    result_ready = pd.concat(
        [base_doc_term_df[shared_ngrams], doc_term_df[shared_ngrams]], sort=False)
    # save result as base in preparation for next loop
    base_doc_term_df = result_ready
    print(base_doc_term_df.shape)
    
    

(219965, 1450)
(312931, 1376)
(399337, 1348)
(486146, 1324)
(572738, 1297)
(661267, 1272)
(750764, 1256)
(836570, 1236)
(928683, 1205)


In [7]:
# Use LDA to create topics: build an 800-component model with a fixed random_state
# and fit it on the combined document-term frame.
LDA = LatentDirichletAllocation(
    n_components=800,
    random_state=1,
)
LDA.fit(base_doc_term_df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=800, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=1, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [8]:
# For every fitted topic, collect the 15 n-grams with the largest component values
# (argsort is ascending, so the strongest term comes last in each list).
vocabulary = result_ready.columns
topicList = [
    [vocabulary[term_idx] for term_idx in topic_weights.argsort()[-15:]]
    for topic_weights in LDA.components_
]

topics_full_run = pd.DataFrame(topicList)

In [9]:
# Assign the probability of all the topics to each document, then add a column
# to the original data frame that stores the highest-scoring topic for that abstract.
topic_values = LDA.transform(base_doc_term_df)

# The document-term frame was stacked year by year, while the assignment below is
# purely positional. Verify that both tables list the same projects in the same order,
# otherwise topics would silently be attached to the wrong abstracts.
doc_term_project_ids = base_doc_term_df.index.values
if (len(doc_term_project_ids) != len(df)
        or not (doc_term_project_ids == df['PROJECT_ID'].values).all()):
    raise ValueError(
        'Row order of base_doc_term_df does not match df; '
        'sort df by fiscal year (or align on PROJECT_ID) before assigning topics.')

df['primeTopicId'] = topic_values.argmax(axis=1)


In [10]:
# Ten most frequent prime topics among the projects explicitly tagged as opioid-related.
df.loc[df.opioid_num == 1, 'primeTopicId'].value_counts().head(10).to_frame()

Unnamed: 0,primeTopicId
547,1269
575,282
398,278
562,225
308,219
741,216
221,185
153,182
125,153
303,113


In [15]:
# Which topic is affiliated with the most projects tagged with opioid project term?
# The topic id is looked up from the data rather than hard-coded, so the answer
# stays correct if the model is refit and the topic numbering changes.
opioid_topic_counts = df.loc[df.opioid_num == 1, 'primeTopicId'].value_counts()
top_opioid_topic_id = opioid_topic_counts.idxmax()
topicList[top_opioid_topic_id]

['modulation',
 'report',
 'poorly',
 'without',
 'condition',
 'treating',
 'therapies',
 'clinically',
 'problem',
 'reduce',
 'often',
 'treat',
 'treatments',
 'chronic',
 'pain']

In [1]:
# let's skim all the projects in both that topic and tagged with opioid in the project terms
# df[(df.opioid_num == 1) & (df.primeTopicId == 547)]

In [24]:
# Show the abstract text stored in the row labelled 17606.
df.loc[17606, 'ABSTRACT']

'DESCRIPTION (provided by applicant): Pain-related disorders cause an incalculable toll in human suffering and present a significant economic problem. The development of new treatments for these disorders is hindered by a lack of information about the basic neural mechanisms that process pain-related information. To date, investigations of the dynamic activation of mechanisms that facilitate pain have provided substantial insights into the neurophysiology and neuropharmacology of chronic pain. However, understanding of the dynamic response properties of inhibitory mechanisms has remained limited, despite the fact that disruption of inhibition may also contribute substantially to chronic pain. A recently identified analgesic phenomenon, offset analgesia, provides a powerful tool for the study of dynamic activation of inhibitory mechanisms. A series of psychophysical studies in humans subjects will systematically delineate the neurophysiological and neuropharmacological mechanisms that i

In [17]:
# Preview the first rows (primeTopicId is now the last column) before saving to a csv.
df.head()

Unnamed: 0.1,Unnamed: 0,PROJECT_ID,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,naloxone,narcotic,opium,cocaine,codeine,pain,analgesics,sumTermCounts,tagCompare,primeTopicId
0,0,103915,base; Cities; Learning; Mission; next generati...,EDUCATION IN ACTION NASA EXCHANGE CITY LEARNIN...,NASA,NASA,,NNX09AR64G,10/1/2009,9/30/2010,...,0,0,0,0,0,0,0,0,neither,35
1,1,103916,Development; Future; programs; Science; Techno...,EDUCATIONAL ADVANCEMENT ALLIANCE INC MATH SCIE...,NASA,NASA,,NNX09AQ21G,8/1/2009,7/31/2010,...,0,0,0,0,0,0,0,0,neither,565
2,2,103917,Development; Educational process of instructin...,"CUBRC, INC FY09 EARMARK ENTITLED, ''TO CONTINU...",NASA,NASA,,NNX09AT31G,10/1/2010,9/30/2011,...,0,0,0,0,0,0,0,0,neither,32
3,3,103918,Joints; Life; programs; Request for Proposals;...,UNIVERSITY CORPORATION FOR ATMOSPHERIC RESEARC...,NASA,NASA,,NNX09AW48A,10/1/2009,9/30/2011,...,0,0,0,0,0,0,0,0,neither,113
4,4,103919,Area; base; Computer Architectures; design; De...,PLANNING FUTURE RESEARCH IN NETWORK SCIENCE AN...,NSF,NSF,,0962520,6/1/2009,2/28/2011,...,0,0,0,0,0,0,0,0,neither,46


In [18]:
# Save the dataset with the new primeTopicId column.
# index=False: the row index carries no information, and writing it adds another
# 'Unnamed: 0'-style column every time the file is saved and read back
# (the loaded frame already has 'Unnamed: 0' and 'Unnamed: 0.1').
df.to_csv('opioidRQ5_constructedDataset.csv', index=False)

In [21]:
# Write the topic/term table to disk: one row per topic, one column per listed term.
topic_table = pd.DataFrame(topicList)
topic_table.to_csv('opioidRQ5_topicList.csv')

In [19]:
# Build a stemmed copy of NLTK's English stopword list.
# NOTE(review): the original cell used `stemmer` without defining it anywhere in the
# notebook (NameError on a fresh kernel). A PorterStemmer is assumed here -- confirm
# this is the stemmer that was intended.
stemmer = PorterStemmer()

eng_stopwords = stopwords.words('english')
stemmed_stopwords = [stemmer.stem(w) for w in eng_stopwords]