-
Notifications
You must be signed in to change notification settings - Fork 6
/
bulk_data_api_demo.py
93 lines (73 loc) · 4.02 KB
/
bulk_data_api_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import requests
import json
import sys
from datetime import datetime
from datetime import date
import os
# The bulk data API serves prepackaged data sets through two endpoints:
#   * https://api.epa.gov/easey/camd-services/bulk-files returns metadata
#     about the available files, including each file's s3Path.
#   * https://api.epa.gov/easey/bulk-files/<s3Path> returns the file itself.

# Set your API key here
API_KEY = 'YOUR_API_KEY'

# Download URL prefix: BUCKET_URL_BASE + s3Path is the full path to a file
BUCKET_URL_BASE = 'https://api.epa.gov/easey/bulk-files/'

# Query-string parameters sent along with the metadata request
parameters = {'api_key': API_KEY}

# Cutoff for the download step: only files whose lastUpdated timestamp is
# later than this moment are downloaded. By default it is midnight UTC on the
# first day of the previous month; assign another timezone-aware datetime to
# timeOfLastDownload to start from a different date.
dateToday = date.today()
if dateToday.month == 1:
    # January rolls back to December of the previous year
    month, year = 12, dateToday.year - 1
else:
    month, year = dateToday.month - 1, dateToday.year
prevMonth = date(year, month, 1)
timeOfLastDownload = datetime.fromisoformat(f'{prevMonth.isoformat()}T00:00:00.000+00:00')
# Request the metadata listing of all bulk files
response = requests.get(
    "https://api.epa.gov/easey/camd-services/bulk-files",
    params=parameters,
    timeout=60,  # fail instead of hanging forever on a stalled connection
)

# Report the status code and stop with an error message if the request failed
print("Status code: "+str(response.status_code))
if response.status_code >= 400:
    # The error body is expected to be JSON shaped like
    # {"error": {"message": ...}}; if it is not (e.g. an HTML error page from
    # a gateway), fall back to the raw body so the real failure stays visible.
    try:
        errorMessage = response.json()['error']['message']
    except (ValueError, KeyError, TypeError):
        errorMessage = response.text
    sys.exit("Error message: "+str(errorMessage))

# Parse the JSON body into Python objects (the code below iterates over it as
# a sequence of file entries). The body is already valid JSON, so it is parsed
# as-is: rewriting apostrophes to double quotes would corrupt any string value
# that contains an apostrophe and make json.loads fail.
resjson = response.content.decode('utf8')
bulkFiles = json.loads(resjson)
####### Meta Data #######

# Show every distinct dataType value found in the listing
print('Unique data types in the bulk data files:')
print({entry['metadata']['dataType'] for entry in bulkFiles})

# Keep only the Mercury and Air Toxics Emissions (MATS) entries
matsFiles = list(filter(
    lambda entry: entry['metadata']['dataType'] == "Mercury and Air Toxics Emissions (MATS)",
    bulkFiles,
))

# Show every distinct dataSubType among the MATS entries
print('Unique data sub types in the bulk data files:')
print({entry['metadata']['dataSubType'] for entry in matsFiles})

# Show which state codes occur; entries without a stateCode are skipped
print('State groupings in the bulk data files:')
print({entry['metadata']['stateCode'] for entry in matsFiles if 'stateCode' in entry['metadata']})

# Show which quarters occur; entries without a quarter are skipped
print('Quarterly groupings in the bulk data files:')
print({entry['metadata']['quarter'] for entry in matsFiles if 'quarter' in entry['metadata']})
####### Hourly Emissions Files #######

# Narrow the listing step by step: Emissions -> Hourly -> Virginia (VA)
emissionsFiles = list(filter(
    lambda entry: entry['metadata']['dataType'] == "Emissions",
    bulkFiles,
))
hourlyEmissionsFiles = list(filter(
    lambda entry: entry['metadata']['dataSubType'] == "Hourly",
    emissionsFiles,
))
# Entries without a stateCode never match 'VA', so they are dropped here
virginiaHourlyEmissionsFiles = [
    entry for entry in hourlyEmissionsFiles
    if entry['metadata'].get('stateCode') == 'VA'
]

# Keep only the files updated after timeOfLastDownload. The last character of
# lastUpdated is replaced by an explicit '+00:00' offset before parsing
# (presumably it is a trailing 'Z' -- confirm against the API response).
filesToDownload = [
    entry for entry in virginiaHourlyEmissionsFiles
    if timeOfLastDownload < datetime.fromisoformat(entry['lastUpdated'][:-1] + '+00:00')
]
print(f'Number of files to download: {len(filesToDownload)}')

# Report the combined size of the selected files
sizesMB = [int(entry['megaBytes']) for entry in filesToDownload]
downloadMB = sum(sizesMB)
print(f'Total size of files to download: {downloadMB} MB')
# Make sure the data folder exists (no error if it is already there)
os.makedirs('data', exist_ok=True)

if filesToDownload:
    # Download every selected file into the data folder
    for fileObj in filesToDownload:
        url = BUCKET_URL_BASE+fileObj['s3Path']
        print('Full path to file on S3: '+url)
        # basename() keeps a server-supplied filename from escaping the data folder
        targetPath = os.path.join('data', os.path.basename(fileObj['filename']))
        partialPath = targetPath + '.part'
        # Stream the body so a large file is never held in memory all at once;
        # the timeout makes a stalled connection fail instead of hanging forever
        with requests.get(url, stream=True, timeout=60) as response:
            # Never save an error page to disk as if it were a data file
            if response.status_code >= 400:
                print('Download failed with status code '+str(response.status_code)+': '+url)
                continue
            # Write to a temporary name first so an interrupted transfer does
            # not leave a truncated file under the final name
            with open(partialPath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partialPath, targetPath)
else:
    print('No files to download')