In [1]:
import time
nb_start_time = time.time()

import pandas as pd
import numpy as np
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# working on Prince or locally?
%pwd

'/scratch/bdr299/myjupyter'

In [3]:
# bring in project data
z = urlopen('https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2009.zip')
zipProjects = ZipFile(BytesIO(z.read())).extract('FedRePORTER_PRJ_C_FY2009.csv')
pr_09 = pd.read_csv(zipProjects, skipinitialspace=True, encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# bring in abstracts data
z = urlopen('https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2009.zip')
zipAbstracts = ZipFile(BytesIO(z.read())).extract('FedRePORTER_PRJABS_C_FY2009.csv')
ab_09 = pd.read_csv(zipAbstracts, skipinitialspace=True, encoding='utf-8')

In [5]:
# merge projects and abstracts by PROJECT_ID
merged_09 = pd.merge(pr_09, ab_09, on='PROJECT_ID')

# drop if abstract missing
merged_09 = merged_09[merged_09.ABSTRACT.notnull()]

In [6]:
my_vectorizer = CountVectorizer(max_df=0.10,
                                  min_df=0.005,
                                  ngram_range = (0,2))

In [7]:
start_time = time.time()

# set up
df_modeling = merged_09[:500] # 1x
corpus = df_modeling.ABSTRACT

# vectorize
doc_term_matrix = my_vectorizer.fit_transform(corpus)
doc_term_features = my_vectorizer.get_feature_names()

# run LDA
LDA = LatentDirichletAllocation(n_topics=50, random_state=1)  
LDA.fit(doc_term_matrix)

# initialize list for topics
topicList = []
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
topicListDf = pd.DataFrame(topicList)

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



00:00:08


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
start_time = time.time()

# set up
df_modeling = merged_09[:1000] # 2x
corpus = df_modeling.ABSTRACT

# vectorize
doc_term_matrix = my_vectorizer.fit_transform(corpus)
doc_term_features = my_vectorizer.get_feature_names()

# run LDA
LDA = LatentDirichletAllocation(n_topics=50, random_state=1)  
LDA.fit(doc_term_matrix)

# initialize list for topics
topicList = []
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
topicListDf = pd.DataFrame(topicList)

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



00:00:15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
start_time = time.time()

# set up
df_modeling = merged_09[:5000] # 10x
corpus = df_modeling.ABSTRACT

# vectorize
doc_term_matrix = my_vectorizer.fit_transform(corpus)
doc_term_features = my_vectorizer.get_feature_names()

# run LDA
LDA = LatentDirichletAllocation(n_topics=50, random_state=1)  
LDA.fit(doc_term_matrix)

# initialize list for topics
topicList = []
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
topicListDf = pd.DataFrame(topicList)

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



00:01:19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
start_time = time.time()

# set up
df_modeling = merged_09[:50000] # 100x
corpus = df_modeling.ABSTRACT

# vectorize
doc_term_matrix = my_vectorizer.fit_transform(corpus)
doc_term_features = my_vectorizer.get_feature_names()

# run LDA
LDA = LatentDirichletAllocation(n_topics=50, random_state=1)  
LDA.fit(doc_term_matrix)

# initialize list for topics
topicList = []
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
topicListDf = pd.DataFrame(topicList)

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)  
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


00:13:19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
nb_elapsed_time = time.time() - nb_start_time
print(time.strftime("%H:%M:%S", time.gmtime(nb_elapsed_time)))

00:15:17
