# Download and concatenate project and abstract data

In [1]:
import time
nb_start_time = time.time()

import pandas as pd
import numpy as np
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

In [2]:
# working on Prince or locally?
%pwd

'/Users/bryant/Documents/nyuHpcTopicModeling'

In [3]:
# bring in project data, starting with 2000
start_time = time.time()

z = urlopen('https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2000.zip')
zipProjects = ZipFile(BytesIO(z.read())).extract('FedRePORTER_PRJ_C_FY2000.csv')
projects = pd.read_csv(zipProjects, skipinitialspace=True, encoding='utf-8')

# loop through rest of years, adding to original df

# setup
fiscal_years = ['2001','2002','2003','2004','2005','2006','2007','2008','2009',
    '2010','2011','2012','2013','2014','2015','2016','2017','2018']
prefix = 'FedRePORTER_PRJ_C_FY'

# concat function
for year in fiscal_years:
    file = prefix + year
    zipUrl = 'https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=' + file + '.zip'
    csvFile = prefix + year + '.csv'
    
    print('\n')
    print('Downloading ' + zipUrl)
    z = urlopen(zipUrl)
    
    print('Extracting ' + csvFile)
    zipProjects = ZipFile(BytesIO(z.read())).extract(file + '.csv')
    
    print('Appending ' + file)
    projects = projects.append(pd.read_csv(csvFile, skipinitialspace=True, encoding='utf-8'), ignore_index=True)
    
    print('\n')
    print(projects.shape)

elapsed_time = time.time() - start_time
print('\n')
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2001.zip
Extracting FedRePORTER_PRJ_C_FY2001.csv
Appending FedRePORTER_PRJ_C_FY2001


(1008, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2002.zip
Extracting FedRePORTER_PRJ_C_FY2002.csv
Appending FedRePORTER_PRJ_C_FY2002


(1359, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2003.zip
Extracting FedRePORTER_PRJ_C_FY2003.csv
Appending FedRePORTER_PRJ_C_FY2003


(1669, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2004.zip
Extracting FedRePORTER_PRJ_C_FY2004.csv
Appending FedRePORTER_PRJ_C_FY2004


(2310, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2005.zip
Extracting FedRePORTER_PRJ_C_FY2005.csv
Appending FedRePORTER_PRJ_C_

  interactivity=interactivity, compiler=compiler, result=result)




(108242, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2009.zip
Extracting FedRePORTER_PRJ_C_FY2009.csv
Appending FedRePORTER_PRJ_C_FY2009


(226730, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2010.zip
Extracting FedRePORTER_PRJ_C_FY2010.csv
Appending FedRePORTER_PRJ_C_FY2010


(338130, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2011.zip
Extracting FedRePORTER_PRJ_C_FY2011.csv
Appending FedRePORTER_PRJ_C_FY2011


  interactivity=interactivity, compiler=compiler, result=result)




(436226, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2012.zip
Extracting FedRePORTER_PRJ_C_FY2012.csv
Appending FedRePORTER_PRJ_C_FY2012


(529629, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2013.zip
Extracting FedRePORTER_PRJ_C_FY2013.csv
Appending FedRePORTER_PRJ_C_FY2013


  interactivity=interactivity, compiler=compiler, result=result)




(621318, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2014.zip
Extracting FedRePORTER_PRJ_C_FY2014.csv
Appending FedRePORTER_PRJ_C_FY2014


(712307, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2015.zip
Extracting FedRePORTER_PRJ_C_FY2015.csv
Appending FedRePORTER_PRJ_C_FY2015


(804620, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2016.zip
Extracting FedRePORTER_PRJ_C_FY2016.csv
Appending FedRePORTER_PRJ_C_FY2016


(895761, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2017.zip
Extracting FedRePORTER_PRJ_C_FY2017.csv
Appending FedRePORTER_PRJ_C_FY2017


(983017, 24)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJ_C_FY2018.zip
Extracting FedRePORTER_PRJ_C_FY2018.csv
Appen

In [4]:
# bring in abstract data, starting with 2000
start_time = time.time()

z = urlopen('https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2000.zip')
zipAbstracts = ZipFile(BytesIO(z.read())).extract('FedRePORTER_PRJABS_C_FY2000.csv')
abstracts = pd.read_csv(zipAbstracts, skipinitialspace=True, encoding='utf-8')

# loop through rest of years, adding to original df

# setup
fiscal_years = ['2001','2002','2003','2004','2005','2006','2007','2008','2009',
    '2010','2011','2012','2013','2014','2015','2016','2017','2018']
prefix = 'FedRePORTER_PRJABS_C_FY'

# concat function
for year in fiscal_years:
    file = prefix + year
    zipUrl = 'https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=' + file + '.zip'
    csvFile = prefix + year + '.csv'
    
    print('\n')
    print('Downloading ' + zipUrl)
    z = urlopen(zipUrl)
    
    print('Extracting ' + csvFile)
    zipAbstracts = ZipFile(BytesIO(z.read())).extract(file + '.csv')
    
    print('Appending ' + file)
    abstracts = abstracts.append(pd.read_csv(csvFile, skipinitialspace=True, encoding='utf-8'), ignore_index=True)
    print('\n')
    print(abstracts.shape)

elapsed_time = time.time() - start_time

print('\n')
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2001.zip
Extracting FedRePORTER_PRJABS_C_FY2001.csv
Appending FedRePORTER_PRJABS_C_FY2001


(876, 2)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2002.zip
Extracting FedRePORTER_PRJABS_C_FY2002.csv
Appending FedRePORTER_PRJABS_C_FY2002


(1158, 2)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2003.zip
Extracting FedRePORTER_PRJABS_C_FY2003.csv
Appending FedRePORTER_PRJABS_C_FY2003


(1400, 2)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2004.zip
Extracting FedRePORTER_PRJABS_C_FY2004.csv
Appending FedRePORTER_PRJABS_C_FY2004


(1973, 2)


Downloading https://federalreporter.nih.gov/FileDownload/DownloadFile?fileToDownload=FedRePORTER_PRJABS_C_FY2005.zip
Extracting FedRePORTER_PRJABS_C_FY

In [5]:
start_time = time.time()

# merge projects and abstracts by PROJECT_ID
merged = pd.merge(projects, abstracts, on='PROJECT_ID')

print(merged.shape)
print('\n')
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

(1040239, 25)


00:00:58


# clean up before saving

In [6]:
merged.shape

(1040239, 25)

In [7]:
# note that 2370 projects span multiple years
repeats = merged.groupby('PROJECT_NUMBER').size().reset_index(name='count')
print(repeats[repeats['count'] == 1].shape)
print(repeats[repeats['count'] == 2].shape)
print(repeats[repeats['count'] == 3].shape)
print(repeats[repeats['count'] == 4].shape)
print(repeats[repeats['count'] == 5].shape)
print(repeats[repeats['count'] == 6].shape)
print(repeats[repeats['count'] == 7].shape)
print(repeats[repeats['count'] == 8].shape)
print(repeats[repeats['count'] == 9].shape)
print(repeats[repeats['count'] == 10].shape)
print(repeats[repeats['count'] == 11].shape)
print(repeats[repeats['count'] == 12].shape)
print(repeats[repeats['count'] == 13].shape)
print(repeats[repeats['count'] == 14].shape)


(2370, 2)

In [8]:
# only keep the first instance, in cases of same project spanning multiple years
deduped = merged.groupby('PROJECT_NUMBER').first().reset_index()
deduped.shape

(1036237, 25)

In [39]:
# only keep major projects (over $100k)
deduped[deduped.FY_TOTAL_COST < 100000].shape

(148769, 25)

In [40]:
deduped_majors = deduped[deduped.FY_TOTAL_COST > 100000]
deduped_majors.shape


(694788, 25)

In [None]:
# export csv with progress so far
start_time = time.time()

deduped_majors.to_csv('mergedProjectsAbstracts.csv',encoding='utf-8-sig')

print('\n')
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [None]:
print('Time to run whole notebook: ')
elapsed_time = time.time() - nb_start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))