In [23]:
# adapted from Julia Lane course and https://stackabuse.com/python-for-nlp-topic-modeling/

import pandas as pd
import numpy as np
import re
import nltk
import time
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords

nltk.download('stopwords') #download the latest stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bryant/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
start_time = time.time()
# 10 years of project data
fiscal_years = ['2010','2011','2012','2013','2014','2015','2016','2017','2018']
prefix = 'FedRePORTER_PRJ_C_FY'
suffix = '.csv'

# initialize dataframe with fy09 data
file = 'FedRePORTER_PRJ_C_FY2009.csv'
print('Reading in ' + file)
projects_df = (pd.read_csv(file,skipinitialspace=True,encoding='utf-8'))

# concatenate 10 years of data
for year in fiscal_years:
    file = prefix + year + suffix
    print('Reading in ' + file)
    projects_df = projects_df.append(pd.read_csv(file, skipinitialspace=True, encoding='utf-8'), ignore_index=True)

# new variable is 1 for rows with opioid in project term column
projects_df['opioid'] = np.where(
    projects_df['PROJECT_TERMS'].str.contains("opioid",case=False, na=False), 1, '')

# create a numeric version of our flag
projects_df['opioid_num'] = pd.to_numeric(projects_df['opioid'])

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))


In [None]:
start_time = time.time()

# 10 years of abstracts data
fiscal_years = ['2010','2011','2012','2013','2014','2015','2016','2017','2018']
prefix = 'FedRePORTER_PRJABS_C_FY'
suffix = '.csv'

# initialize dataframe with fy09 data
file = 'FedRePORTER_PRJABS_C_FY2009.csv'
print('Reading in ' + file)
abstracts_df = (pd.read_csv(file,skipinitialspace=True,encoding='utf-8'))

for year in fiscal_years:
    file = prefix + year + suffix
    print('Reading in ' + file)
    abstracts_df = abstracts_df.append(pd.read_csv(file, skipinitialspace=True, encoding='utf-8'), ignore_index=True)

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [None]:
# Affilated terms from https://en.wikipedia.org/wiki/Opioid
opioid_terms = ['opioid','morphine','pain relief','anesthesia',
                'overdose','addiction','withdrawal',
                'controlled substance','over-prescription','heroin',
               'opiate','hydrocodone','oxycodone','fentanyl','naloxone',
               'narcotic','opium','cocaine','codeine','pain',
                'analgesics']

In [None]:
start_time = time.time()

# keep only variables needed and remove missing values
abstracts_10yrs = abstracts_df.dropna()

# cut the end coding that interferes with cleaning script
abstracts_10yrs = abstracts_10yrs[0:-1]

#get rid of the punctuations and set all characters to lowercase
nonchars = re.compile( r'\W+|\d+' )

def clean(text):
    return re.sub(nonchars, " ", text).lower()

abstracts_10yrs['cleanText'] = abstracts_10yrs['ABSTRACT'].apply(clean)

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [None]:
start_time = time.time()

# Count the appearances of our defined terms in each abstract
def countTerm(text):
    return len(re.findall(term,text))

for term in opioid_terms:
    abstracts_10yrs[term] = abstracts_10yrs['cleanText'].apply(countTerm)
    
# sum of all term frequencies by abstract
abstracts_10yrs['sumTermCounts'] = abstracts_10yrs[opioid_terms].sum(axis=1)

print('The number of abstracts with three or more opioid terms is: ' +
     str(abstracts_10yrs[abstracts_10yrs['sumTermCounts']>2].shape[0]))

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [None]:
start_time = time.time()

# merge projects and abstracts by PROJECT_ID
merged_df_10yrs = pd.merge(projects_df, abstracts_10yrs, on='PROJECT_ID')

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [None]:
# define compareTags function

wikiThreshold = 2;

def compareTags(row):
    if (row['opioid_num'] == 1) & (row['sumTermCounts'] > wikiThreshold):
        return 'both'
    if row['opioid_num'] == 1:
        return 'explicitOnly'
    if row['sumTermCounts'] > wikiThreshold:
        return 'wikiOnly'
    return 'neither'

In [None]:
start_time = time.time()

# compare 10yr data tags
merged_df_10yrs['tagCompare'] = merged_df_10yrs.apply(compareTags, axis=1)

print('Number of projects')
merged_df_10yrs['tagCompare'].value_counts()

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [None]:
print('Cost sums')
print('Neither:       ' + str(merged_df_10yrs.FY_TOTAL_COST[merged_df_10yrs.tagCompare == 'neither'].sum()))
print('Both:          ' + str(merged_df_10yrs.FY_TOTAL_COST[merged_df_10yrs.tagCompare == 'both'].sum()))
print('Explicit only: ' + str(merged_df_10yrs.FY_TOTAL_COST[merged_df_10yrs.tagCompare == 'explicitOnly'].sum()))
print('Wiki only:     ' + str(merged_df_10yrs.FY_TOTAL_COST[merged_df_10yrs.tagCompare == 'wikiOnly'].sum()))

In [None]:
# export csv with progress so far
start_time = time.time()
merged_df_10yrs.to_csv('part1_OpioidAnalyticalData.csv')

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

In [18]:
# # import csv, if previous cells weren't run this session
start_time = time.time()

file = 'opioidRQ3_constructedDataset.csv'
df = (pd.read_csv(file,skipinitialspace=True,encoding='utf-8',
                 dtype={'PROJECT_ID': object,
                        'PROJECT_TERMS': object,
                        'PROJECT_TITLE': object,
                        'DEPARTMENT': str,
                        'AGENCY': str,
                        'PROJECT_START_DATE': str,
                        'PROJECT_END_DATE': str,
                        'ORGANIZATION_CITY': str,
                        'CFDA_CODE': str,
                        'FY': int,
                        'FY_TOTAL_COST': float,
                        'FY_TOTAL_COST_SUB_PROJECTS': float                     
                       }))

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

'00:05:26'

In [20]:
# re-chunk the data for topic modeling
start_time = time.time()

df09 = df[df.FY == 2009]
df10 = df[df.FY == 2010]
df11 = df[df.FY == 2011]
df12 = df[df.FY == 2012]
df13 = df[df.FY == 2013]
df14 = df[df.FY == 2014]
df15 = df[df.FY == 2015]
df16 = df[df.FY == 2016]
df17 = df[df.FY == 2017]
df18 = df[df.FY == 2018]

elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

'00:00:41'

In [21]:
# prepare stopwords
eng_stopwords = stopwords.words('english')

# we can add our own stopwords here, but max_df should handle it for us...
domain_stopwords = ['experiments','exploration','exploratory','explore','experiment','findings','financial',
                   'experimental','finally','far','five','find','extent']

# modified_stopwords = eng_stopwords + domain_stopwords
modified_stopwords = eng_stopwords

In [None]:
'''
Actual run.

Before we can apply LDA, we need to create vocabulary of all the words in our data
We specify to only include those words that appear in less than 10% (max_df) of the document 
and appear in at least 5% (min_df) of documents. For computation reasons, limit to 'max_features.'
docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

The exercise of running the vectorizer on each year independently, then only
keeping ngrams that all 10 years of abstracts possess is all geared towards breaking up
the computation into manageable chunks that won't crash the kernel.
'''
# Create a Snowball stemmer 
stemmer = SnowballStemmer('english')

# define our vectorizer
count_vect = CountVectorizer(
    max_df=0.10,
    min_df=0.005, 
    max_features = 1500,
#     ngram_range = (0,2),
    stop_words=modified_stopwords)

# create a list of the latter 9 dataframes
chunks = [df10,df11,df12,df13,df14,df15,df16,df17,df18]

'''
in our eventual loop, we will be appending to a base dataframe, 
so we initialize it with 2009 data and append from there
'''
chunk = df09

# # base dataframe
# vectorize
doc_term_matrix = count_vect.fit_transform(chunk['ABSTRACT'].values.astype('U'))
# convert to pd
base_doc_term_df = pd.DataFrame(doc_term_matrix.toarray(),columns=count_vect.get_feature_names(),index=chunk.PROJECT_ID)

# vectorize and append rest of the years
for chunk in chunks:
    # vectorize
    doc_term_matrix = count_vect.fit_transform(chunk['ABSTRACT'].values.astype('U'))
    # convert to pd
    doc_term_df = pd.DataFrame(doc_term_matrix.toarray(),columns=count_vect.get_feature_names(),index=chunk.PROJECT_ID)
    # append
    result_raw = base_doc_term_df.append(doc_term_df,sort=False)
    # only keep only words/n-grams that appear in every year of data
    result_ready = result_raw.dropna(axis=1)
    # save result as base in preparation for next loop
    base_doc_term_df = result_ready
    print(base_doc_term_df.shape)
    

In [35]:
chunk = df09[0:500].ABSTRACT
       
# Create a Snowball stemmer 
stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(chunk):
    return (stemmer.stem(w) for w in analyzer(chunk))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words,
                                  max_df=0.10,
                                  min_df=0.005,
                                  max_features = 1500,
                                  ngram_range = (0,2),
                                  stop_words=modified_stopwords
                                 )
print(stem_vectorizer.fit_transform(chunk))
print(stem_vectorizer.get_feature_names())


  (0, 800)	1
  (0, 187)	1
  (0, 1292)	1
  (0, 910)	1
  (0, 715)	1
  (0, 864)	1
  (0, 756)	1
  (0, 229)	1
  (0, 507)	1
  (0, 897)	3
  (0, 39)	1
  (1, 583)	1
  (1, 522)	1
  (1, 1491)	1
  (1, 578)	1
  (1, 825)	1
  (1, 698)	1
  (2, 1341)	1
  (2, 1008)	1
  (2, 571)	2
  (2, 481)	1
  (2, 436)	1
  (2, 608)	1
  (2, 698)	1
  (2, 864)	1
  :	:
  (499, 538)	1
  (499, 1077)	1
  (499, 1025)	1
  (499, 966)	1
  (499, 707)	1
  (499, 994)	1
  (499, 805)	1
  (499, 976)	2
  (499, 292)	1
  (499, 454)	1
  (499, 1016)	1
  (499, 530)	2
  (499, 1357)	2
  (499, 1041)	2
  (499, 119)	1
  (499, 383)	1
  (499, 911)	1
  (499, 261)	1
  (499, 1236)	1
  (499, 429)	1
  (499, 1111)	5
  (499, 120)	1
  (499, 822)	1
  (499, 1414)	1
  (499, 910)	1
['000', '08', '09', '10', '100', '1000', '12', '13', '14', '15', '20', '2010', '21st', '25', '30', '3d', '40', '589', '64257', 'abil', 'abl', 'abov', 'abstract', 'abund', 'academ', 'academi', 'acceler', 'accept', 'access', 'accomplish', 'account', 'accumul', 'accur', 'accuraci', 'ac

In [59]:
start_time = time.time()

chunks = [df09,df10,df11]
chunk = chunks[0].ABSTRACT
       
# Create a Snowball stemmer 
stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(chunk):
    return (stemmer.stem(w) for w in analyzer(chunk))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words,
                                  max_df=0.10,
                                  min_df=0.005,
                                  max_features = 1500,
                                  ngram_range = (0,2),
                                  stop_words=modified_stopwords
                                 )

# # base dataframe
# vectorize
doc_term_matrix = stem_vectorizer.fit_transform(chunk)

# convert to pd
base_doc_term_df = pd.DataFrame(doc_term_matrix.toarray(),columns=stem_vectorizer.get_feature_names(),index=chunks[0].PROJECT_ID)

# vectorize and append rest of the years
for chunk in chunks:
    chunk = chunk.ABSTRACT
    # vectorize
    doc_term_matrix = stem_vectorizer.fit_transform(chunk)
    # convert to pd
    doc_term_df = pd.DataFrame(doc_term_matrix.toarray(),columns=stem_vectorizer.get_feature_names(),index=chunks[0].PROJECT_ID)
    # append
    result_raw = base_doc_term_df.append(doc_term_df,sort=False)
    # only keep only words/n-grams that appear in every year of data
    result_ready = result_raw.dropna(axis=1)
    # save result as base in preparation for next loop
    base_doc_term_df = result_ready
    print(base_doc_term_df.shape)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

# base_doc_term_df.head()


(227960, 1500)


ValueError: Shape of passed values is (1500, 105985), indices imply (1500, 113980)