In [1]:
import time
nb_start_time = time.time()

import pandas as pd
import numpy as np
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# working on Prince or locally?
%pwd

'/Users/bryant/Documents/nyuHpcTopicModeling'

In [3]:
# read in data
start_time = time.time()

file = 'mergedProjectsAbstracts.csv'
df = (pd.read_csv(file,skipinitialspace=True,encoding='utf-8',
                 dtype={'PROJECT_ID': object,
                        'PROJECT_TERMS': object,
                        'PROJECT_TITLE': object,
                        'DEPARTMENT': str,
                        'AGENCY': str,
                        'PROJECT_START_DATE': str,
                        'PROJECT_END_DATE': str,
                        'ORGANIZATION_CITY': str,
                        'CFDA_CODE': str,
                        'FY': int,
                        'FY_TOTAL_COST': float,
                        'FY_TOTAL_COST_SUB_PROJECTS': float                     
                       }))

print('\n')
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

  interactivity=interactivity, compiler=compiler, result=result)




00:00:45


In [4]:
# drop if abstract missing
df_nomiss = df[df.ABSTRACT.notnull()]
print(df.shape)
print(df_nomiss.shape)

(1040239, 26)
(1032895, 26)


In [5]:
# subset for testing, otherwise leave commented
df_modeling = df_nomiss[:5000] # 1x

In [6]:
# define our vectorizer
my_vectorizer = CountVectorizer(max_df=0.10,
                                  min_df=0.005,
                                  ngram_range = (0,2))

In [7]:
start_time = time.time()

# set up
corpus = df_modeling.ABSTRACT

# vectorize
doc_term_matrix = my_vectorizer.fit_transform(corpus)
doc_term_features = my_vectorizer.get_feature_names()

# run LDA
LDA = LatentDirichletAllocation(n_topics=5, random_state=1)  
LDA.fit(doc_term_matrix)

# initialize list for topics
topicList = []
for i,topic in enumerate(LDA.components_):
    ithTopic = [doc_term_features[i] for i in topic.argsort()[-20:]]
    topicList.append(ithTopic)
topicListDf = pd.DataFrame(topicList)

# matrix where each row is an abstract, each column a topic. Each cell is value of that topic for that abstract.
topic_values = LDA.transform(doc_term_matrix)

# save each project's most relevant topic in a new variable, and its valence in a further variable
df_modeling['primeTopicId'] = topic_values.argmax(axis=1)
df_modeling['primeTopicValence'] = topic_values.max(axis=1)

elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



00:00:32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
df_modeling.head(40)

Unnamed: 0.1,Unnamed: 0,PROJECT_ID,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ABSTRACT,primeTopicId,primeTopicValence
0,0,1098983,Deposition; design; experimental study; Hydroc...,COMBUSTION CHAMBER DEPOSIT EFFECTS ON ENGINE H...,EPA,EPA,,R824970C002,1/1/2000,6/30/1997,...,,,,,2000,496012.0,,(1) To design a carefully-controlled experimen...,0,0.553078
1,1,1098991,Aerosols; Air Pollution; Atmosphere; Chemicals...,EXPERIMENTAL INVESTIGATION OF THE EVOLUTION OF...,EPA,EPA,,R824970C010,1/1/2000,5/31/1998,...,,,,,2000,75000.0,,The purpose of this research project is to con...,0,0.876265
2,2,1098989,Alcohols; Chemistry; Ethers; Experimental Mode...,FUNDAMENTAL STUDY ON HIGH TEMPERATURE CHEMISTR...,EPA,EPA,,R824970C008,1/1/2000,12/31/1995,...,,,,,2000,265605.0,,Experimental and modeling studies are performe...,4,0.707042
3,3,1098997,Air; Biological; Carbon; Chemicals; Gases; Liq...,INTEGRATING MODELS FOR PREDICTING POLLUTION TR...,EPA,EPA,,R825370C005,1/1/2000,1/1/2000,...,,,,,2000,,,The objective of this project is to develop a ...,4,0.76346
4,4,1099005,Area; base; Emerging Technologies; Formulation...,ENVIRONMENTAL ASPECTS OF POLYMER FORMULATIONS,EPA,EPA,,R825370C013,1/1/2000,1/1/2000,...,,,,,2000,,,An objective of this project is a survey and d...,4,0.983181
5,5,1099013,Copper; Copper Sulfate; Electroplating; Indust...,ASSESSMENT OF AN IN-LINE COPPER RECOVERY TECHN...,EPA,EPA,,R825370C022,1/1/2000,1/1/2000,...,,,,,2000,,,To investigate the ability of Continuous De-Io...,4,0.952124
6,6,1099021,Area; base; brass; cold temperature; Copper; E...,RECYCLE OF LEAD AND BASE METALS FROM METAL WAS...,EPA,EPA,,R825370C030,1/1/2000,1/1/2000,...,,,,,2000,,,The primary objective is to develop and optimi...,4,0.915071
7,7,1098999,Development; Molds; Names; Nature; Process; Pr...,CLEAN MANUFACTURING IN FOUNDRY MOLD AND CORE P...,EPA,EPA,,R825370C007,1/1/2000,1/1/2000,...,,,,,2000,,,The object of this project is to examine the c...,3,0.490099
8,8,1099007,Behavior; Fiber; Goals; Membrane; Names; Process,MASS TRANSFER BEHAVIOR OF UNCONFINED MEMBRANES,EPA,EPA,,R825370C015,1/1/2000,1/1/2000,...,,,,,2000,,,The goal of this project is to develop mass tr...,4,0.959441
9,9,1099015,Acetic Acids; design; Liquid substance; Membra...,MEMBRANE MODULE DESIGN FOR THE PERVAPORATION O...,EPA,EPA,,R825370C024,1/1/2000,1/1/2000,...,,,,,2000,,,The aim of this project is to construct the be...,4,0.817854


In [10]:
df_modeling.primeTopicId.value_counts()

3    1603
0    1196
4    1014
1     837
2     350
Name: primeTopicId, dtype: int64

In [14]:
# what does the top topic consist of?
topicList[4]

['technologies',
 'fuel',
 'oil',
 'contaminated',
 'gas',
 'metals',
 'was to',
 'compounds',
 'properties',
 'products',
 'phase',
 'metal',
 'low',
 'materials',
 'cost',
 'surface',
 'organic',
 'waste',
 'treatment',
 'energy']

In [21]:
df_modeling[df_modeling['PROJECT_ID'] == '1099085'].ABSTRACT

23    The linear, reversible models are unable to pr...
Name: ABSTRACT, dtype: object

In [26]:
df.AGENCY.value_counts()

NIH        816529
NSF        136603
NIFA        25783
NASA        16238
VA          10759
CDMRP        7462
ALLCDC       6645
AHRQ         5019
EPA          4170
FDA          3735
ARS          2339
IES          1850
NIDILRR      1137
FS           1039
DVBIC         516
ACF           261
CNRM          131
CCCRP          23
Name: AGENCY, dtype: int64

In [32]:
test = df.PROJECT_ID.duplicated().reset_index()

In [35]:
test[test['PROJECT_ID'] == True]

Unnamed: 0,index,PROJECT_ID


In [38]:
test = df.PROJECT_NUMBER.duplicated().reset_index()
test[test['PROJECT_NUMBER'] == True]

Unnamed: 0,index,PROJECT_NUMBER
1182,1182,True
2246,2246,True
2247,2247,True
2249,2249,True
2620,2620,True
2662,2662,True
2712,2712,True
2863,2863,True
2959,2959,True
3316,3316,True


In [41]:
df.iloc[1182]

Unnamed: 0                                                                 1182
PROJECT_ID                                                              1099773
PROJECT_TERMS                 Affect; Amphibia; anthropogenesis; Biological;...
PROJECT_TITLE                 DEVELOPMENTAL STABILITY IN AMPHIBIANS AS A BIO...
DEPARTMENT                                                                  EPA
AGENCY                                                                      EPA
IC_CENTER                                                                   NaN
PROJECT_NUMBER                                                       R829419E03
PROJECT_START_DATE                                                    10/1/2002
PROJECT_END_DATE                                                      9/30/2004
CONTACT_PI_PROJECT_LEADER                                LOGANATHAN, BOMMANNA G
OTHER_PIS                                                    WHITEMAN, HOWARD H
CONGRESSIONAL_DISTRICT                  

In [43]:
df[df.PROJECT_NUMBER == 'R829419E03']

Unnamed: 0.1,Unnamed: 0,PROJECT_ID,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ABSTRACT
1129,1129,1099774,Affect; Amphibia; anthropogenesis; Biological;...,DEVELOPMENTAL STABILITY IN AMPHIBIANS AS A BIO...,EPA,EPA,,R829419E03,10/1/2001,9/30/2004,...,,,,,,,2002,475136.0,,The objective of this research project was to ...
1182,1182,1099773,Affect; Amphibia; anthropogenesis; Biological;...,DEVELOPMENTAL STABILITY IN AMPHIBIANS AS A BIO...,EPA,EPA,,R829419E03,10/1/2002,9/30/2004,...,,,,,,,2003,165775.0,,The objective of this research project was to ...
