# GNIP Historical PowerTrack 2.0 API

In [248]:
# Directory holding everything that belongs to this job (config, rules, results).
JOB_DIR = './gnip-geo-query'
# YAML job description; lives directly inside JOB_DIR.
JOB_CONFIG = JOB_DIR + '/geo_query_job.yml'

In [127]:
import json, yaml, requests, sys, os, pprint
from requests.auth import HTTPBasicAuth

In [128]:
# Common root of the Historical PowerTrack job URLs for this account.
_JOBS_ROOT = 'https://gnip-api.gnip.com/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs'
# Collection endpoint: used below for GET (testAPI) and POST (submit).
ENDPOINT = _JOBS_ROOT + '.json'
# Prefix under which the individual job documents live.
BASE_URI = _JOBS_ROOT + '/'

class HistoricalPowerTrackJob():
    """Thin client for one GNIP Historical PowerTrack 2.0 job.

    API reference:
    http://support.gnip.com/apis/historical_api2.0/api_reference.html#Create

    Typical use: construct with a YAML job config, call ``submit()``, then
    ``status()`` and ``accept()`` / ``reject()``. The last three need the job
    URL in ``self.url``; ``submit()`` stores it when the response contains a
    ``jobURL``, and it can also be assigned by hand for a job created earlier.

    Attributes:
        config: dict parsed from the YAML job config; this class reads the keys
            ``title``, ``from_date``, ``to_date`` and ``rules_file``.
        auth: ``HTTPBasicAuth`` built from the credentials file.
        job_dir: directory containing the rules file, or ``None`` to use the
            module-level ``JOB_DIR``.
        url: URL of the job document, or ``None`` until known.
        payload: request body built by ``make_payload()``.
    """

    # Default YAML file providing the ``username`` and ``password`` keys.
    DEFAULT_CREDENTIALS_FILE = '/home/anderstj/credentials/gnip.yaml'

    def __init__(self, config, credentials_file=None, job_dir=None):
        """Load the job config and the API credentials.

        Args:
            config: path to the YAML job configuration file.
            credentials_file: path to a YAML file with ``username`` and
                ``password``; defaults to ``DEFAULT_CREDENTIALS_FILE``.
            job_dir: directory that contains the rules file named in the
                config; defaults to the module-level ``JOB_DIR``.
        """
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        with open(config, 'r') as config_fh:
            self.config = yaml.safe_load(config_fh)

        self.job_dir = job_dir
        self.url = None

        # Create authentication
        if credentials_file is None:
            credentials_file = self.DEFAULT_CREDENTIALS_FILE
        with open(credentials_file) as auth_fh:
            auth_file = yaml.safe_load(auth_fh)
        self.auth = HTTPBasicAuth(auth_file['username'], auth_file['password'])

    def _job_url(self):
        """Return the job URL, raising ``AttributeError`` if it is not set yet."""
        url = getattr(self, 'url', None)
        if not url:
            raise AttributeError(
                "job URL is not set; call submit() or assign .url first")
        return url

    def load_rules(self):
        """Read and return the JSON rules named by ``config['rules_file']``.

        The file is looked up in ``self.job_dir`` when set, otherwise in the
        module-level ``JOB_DIR``. On a missing key, unreadable file or invalid
        JSON an error is printed and ``SystemExit(1)`` is raised.
        """
        try:
            job_dir = getattr(self, 'job_dir', None)
            if job_dir is None:
                job_dir = JOB_DIR
            with open(job_dir + "/" + self.config['rules_file'], 'r') as rules_fh:
                rules = json.load(rules_fh)
        except (OSError, KeyError, TypeError, ValueError) as err:
            # json.JSONDecodeError is a ValueError subclass.
            print("ERROR parsing rules")
            print("  cause: %r" % (err,))
            sys.exit(1)
        return rules

    def make_payload(self):
        """Build the job-creation request body and store it in ``self.payload``."""
        self.payload = {
            "publisher": "twitter",
            "streamType": "track_v2",
            "dataFormat": "activity-streams",

            "title": self.config['title'],
            "fromDate": self.config['from_date'],
            "toDate": self.config['to_date'],

            "rules": self.load_rules()
        }

    def testAPI(self):
        """GET the jobs endpoint and return the decoded JSON response."""
        self.test_response = requests.get(ENDPOINT, auth=self.auth)
        return self.test_response.json()

    def status(self):
        """GET the job document at ``self.url`` and return the decoded JSON."""
        url = self._job_url()
        self.status_response = requests.get(url, auth=self.auth)
        return self.status_response.json()

    def submit(self):
        """Build the payload, POST it to the jobs endpoint and return the JSON reply.

        When the reply contains a ``jobURL`` it is stored in ``self.url`` so
        that ``status()``, ``accept()`` and ``reject()`` work right away.
        """
        self.make_payload()
        print("Submitting Job with payload:\n", self.payload)
        self.submit_response = requests.post(ENDPOINT,
                                             auth=self.auth,
                                             json=self.payload)
        result = self.submit_response.json()
        if isinstance(result, dict) and result.get('jobURL'):
            self.url = result['jobURL']
        return result

    def reject(self):
        """PUT ``{"status": "reject"}`` to the job URL and return the JSON reply."""
        url = self._job_url()
        self.reject_response = requests.put(url, auth=self.auth, json={"status": "reject"})
        return self.reject_response.json()

    def accept(self):
        """PUT ``{"status": "accept"}`` to the job URL and return the JSON reply."""
        url = self._job_url()
        self.accept_response = requests.put(url, auth=self.auth, json={"status": "accept"})
        return self.accept_response.json()

In [129]:
# Create the job object from the YAML config (this also loads the API credentials).
job7 = HistoricalPowerTrackJob(JOB_CONFIG)
# Build the request body without sending it, so it can be inspected first.
job7.make_payload()
pprint.pprint(job7.payload)

{'dataFormat': 'activity-streams',
 'fromDate': '201608010000',
 'publisher': 'twitter',
 'rules': [{'tag': 'id:1501,radii:undefined,center:[-72.4063411406391,11.605307266170179]',
            'value': 'bounding_box:[-72.56354631536 11.4513160976523 '
                     '-72.2491359659182 11.7592984346881]'},
           {'tag': 'id:1502,radii:undefined,center:[-72.09193079119726,11.605307266170179]',
            'value': 'bounding_box:[-72.2491359659182 11.4513160976523 '
                     '-71.9347256164763 11.7592984346881]'},
           {'tag': 'id:1503,radii:undefined,center:[-71.77752044175543,11.605307266170179]',
            'value': 'bounding_box:[-71.9347256164763 11.4513160976523 '
                     '-71.6203152670345 11.7592984346881]'},
           {'tag': 'id:1504,radii:undefined,center:[-71.4631100923136,11.605307266170179]',
            'value': 'bounding_box:[-71.6203152670345 11.4513160976523 '
                     '-71.3059049175927 11.7592984346881]'},
       

# Historical Powertrack Request

## Part 1: Submit the job

In [130]:
# POST the payload to the jobs endpoint; the cell output is the decoded JSON reply.
job7.submit()

Submitting Job with payload:
 {'title': 'HurricaneMatthewGeoQuery_NonWindSwath_1839', 'streamType': 'track_v2', 'publisher': 'twitter', 'rules': [{'tag': 'id:1501,radii:undefined,center:[-72.4063411406391,11.605307266170179]', 'value': 'bounding_box:[-72.56354631536 11.4513160976523 -72.2491359659182 11.7592984346881]'}, {'tag': 'id:1502,radii:undefined,center:[-72.09193079119726,11.605307266170179]', 'value': 'bounding_box:[-72.2491359659182 11.4513160976523 -71.9347256164763 11.7592984346881]'}, {'tag': 'id:1503,radii:undefined,center:[-71.77752044175543,11.605307266170179]', 'value': 'bounding_box:[-71.9347256164763 11.4513160976523 -71.6203152670345 11.7592984346881]'}, {'tag': 'id:1504,radii:undefined,center:[-71.4631100923136,11.605307266170179]', 'value': 'bounding_box:[-71.6203152670345 11.4513160976523 -71.3059049175927 11.7592984346881]'}, {'tag': 'id:1505,radii:undefined,center:[-71.14869974287176,11.605307266170179]', 'value': 'bounding_box:[-71.3059049175927 11.45131609765

{'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201608010000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6y1vwhh99c.json',
 'publisher': 'twitter',
 'requestedAt': '2016-10-24T18:25:27Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'rules': {'detail': [{'created': True,
    'rule': {'id': 3502338907936341693,
     'tag': 'id:1501,radii:undefined,center:[-72.4063411406391,11.605307266170179]',
     'value': 'bounding_box:[-72.56354631536 11.4513160976523 -72.2491359659182 11.7592984346881]'}},
   {'created': True,
    'rule': {'id': 8077805486665431729,
     'tag': 'id:1502,radii:undefined,center:[-72.09193079119726,11.605307266170179]',
     'value': 'bounding_box:[-72.2491359659182 11.4513160976523 -71.9347256164763 11.7592984346881]'}},
   {'created': True,
    'rule': {'id': 1579196786601163309,
     'tag': 'id:1503,radii:undefined,center:[-71.77752044175543,11.605307266170179]',
     'va

## Part 1b: Get Job URL

In [131]:
# status(), accept() and reject() all read job.url, so take it from the submit reply.
job7.url = job7.submit_response.json()['jobURL']

In [132]:
# job.url = 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json'
# job.submit_response
# job.set_url = job.status['jobURL']
# The assignment from submit_response below should work, but it did not here (reason unknown).
# Workaround: the UUIDs of all jobs (latest at the bottom) are listed by the testAPI() call.
# job.url = job.submit_response.json()['jobURL']
# job1.url = 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/f31wsc2nvq.json'

In [133]:
# Display the URLs of all seven jobs.
# NOTE(review): job1..job6 are not created in any cell shown here — presumably
# they come from earlier kernel state; confirm before a Restart & Run All.
[job1.url,job2.url,job3.url,job4.url,job5.url,job6.url,job7.url]

['https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/f31wsc2nvq.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/j6add94ne7.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/tk7vx3t9s2.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/cvqwjryar1.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/dg7y41zawr.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/cwa4td0455.json',
 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6y1vwhh99c.json']

In [245]:
# GET the job document at job7.url; the output is the decoded JSON status.
job7.status()

{'acceptedAt': '2016-10-24T19:18:10Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201608010000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6y1vwhh99c.json',
 'percentComplete': 19,
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 1511000,
  'estimatedDurationHours': '18.0',
  'estimatedFileSizeMb': '651.91',
  'expiresAt': '2016-10-31T19:14:45Z'},
 'requestedAt': '2016-10-24T18:25:27Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'status': 'running',
 'statusMessage': 'Job queued and being processed.',
 'streamType': 'track_v2',
 'title': 'HurricaneMatthewGeoQuery_NonWindSwath_1839',
 'toDate': '201610220000'}

## Part 2: Accept or Reject the Job

In [196]:
# Alternative: decline the job instead of accepting it.
#job.reject()
# PUT {"status": "accept"} to the job URL.
job7.accept()

{'acceptedAt': '2016-10-24T19:18:10Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201608010000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6y1vwhh99c.json',
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 1511000,
  'estimatedDurationHours': '18.0',
  'estimatedFileSizeMb': '651.91',
  'expiresAt': '2016-10-31T19:14:45Z'},
 'requestedAt': '2016-10-24T18:25:27Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'status': 'accepted',
 'statusMessage': 'Job accepted and ready to be queued.',
 'streamType': 'track_v2',
 'title': 'HurricaneMatthewGeoQuery_NonWindSwath_1839',
 'toDate': '201610220000'}

In [211]:
# Add up the estimated activity counts quoted for the in-windswath jobs.
windswath_jobs = [job1, job2, job3]
total = 0
for j in windswath_jobs:
    quote = j.status()['quote']
    total = total + quote['estimatedActivityCount']
print("In Windswath total", total)

In Windswath total 3627000


In [212]:
# Add up the estimated activity counts quoted for the out-of-windswath jobs.
non_windswath_jobs = [job4, job5, job6, job7]
total = 0
for j in non_windswath_jobs:
    quote = j.status()['quote']
    total = total + quote['estimatedActivityCount']
print("Out of Windswath total", total)

Out of Windswath total 6962000


## Part 3: Check the status again and see how it's doing.....

In [251]:
# In Windswath Jobs:
for j in [job1, job2, job3]:
    stat = j.status()
    print("%s... %s percent complete"%(stat['statusMessage'], stat['percentComplete']))

Job delivered and available for download.... 100 percent complete
Job delivered and available for download.... 100 percent complete
Job delivered and available for download.... 100 percent complete


In [252]:
# Outside of Windswath Jobs:
for j in [job4, job5, job6, job7]:
    stat = j.status()
    print("%s... %s percent complete"%(stat['statusMessage'], stat['percentComplete']))

Job delivered and available for download.... 100 percent complete
Job delivered and available for download.... 100 percent complete
Job delivered and available for download.... 100 percent complete
Job delivered and available for download.... 100 percent complete


## Final Part: Retrieve the Results... Once job is complete!
Check the status of the job until the results.json file is ready and then save it.

In [56]:
# NOTE(review): `job` is not defined in any cell shown in this notebook —
# presumably left over from an earlier kernel session; confirm which job is meant.
status = job.status()
# Reads status['results']; the 'running'/'accepted' status outputs shown earlier
# have no such key — presumably it only appears once the job is delivered (verify).
print("Found %d tweets"%(status['results']['activityCount']))

Found 13830252 tweets


In [67]:
# Fetch the result-file listing from the dataURL given in the job status read above.
data_url = status['results']['dataURL']
listing_response = requests.get(data_url, auth=job.auth)
result_files = listing_response.json()

# 1048576 bytes = 1 MB as reported here; %d truncates the quotient.
size_mb = result_files['totalFileSizeBytes'] / 1048576
print("Files: %d\nSize: %d MB\nExpires: %s" % (result_files['urlCount'], size_mb, result_files['expiresAt']))

Files: 3888
Size: 7956 MB
Expires: 2016-11-08T00:29:08Z


In [64]:
# Persist the result-file listing as JSON inside the job directory.
with open(JOB_DIR+"/results.json",'w') as resultsFile: 
    json.dump(result_files,resultsFile)

## Handling Multiple Jobs

In [257]:
# Pair each job object with a short title ('ws*' / 'nws*'); the titles are used
# later to name the per-job results files.
job_titles = ['ws1', 'ws2', 'ws3', 'nws1', 'nws2', 'nws3', 'nws4']
job_handles = [job1, job2, job3, job4, job5, job6, job7]
jobs = [
    {'title': title, 'job': handle}
    for title, handle in zip(job_titles, job_handles)
]

In [260]:
# Save the result-file listing (download URLs plus metadata) of every job.
results_dir = JOB_DIR + "/results"
# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(results_dir, exist_ok=True)
for j in jobs:
    print(j['title'])
    status = j['job'].status()
    print("Found %d tweets"%(status['results']['activityCount']))
    # Authenticate with the job being processed; the previous code relied on an
    # unrelated global `job` that may not exist in a fresh kernel.
    result_files = requests.get(status['results']['dataURL'], auth=j['job'].auth).json()
    print("Files: %d\nSize: %d MB\nExpires: %s"%(result_files['urlCount'],result_files['totalFileSizeBytes']/1048576,result_files['expiresAt']))
    with open(results_dir+"/results_"+j['title']+".json",'w') as resultsFile:
        json.dump(result_files,resultsFile)

ws1
Found 1791601 tweets
Files: 11808
Size: 703 MB
Expires: 2016-11-09T10:24:12Z
ws2
Found 1602960 tweets
Files: 11808
Size: 558 MB
Expires: 2016-11-09T12:28:14Z
ws3
Found 39922 tweets
Files: 9870
Size: 23 MB
Expires: 2016-11-09T11:36:13Z
nws1
Found 2698300 tweets
Files: 11808
Size: 1040 MB
Expires: 2016-11-09T11:51:16Z
nws2
Found 1453229 tweets
Files: 11808
Size: 543 MB
Expires: 2016-11-09T12:04:14Z
nws3
Found 991992 tweets
Files: 11807
Size: 347 MB
Expires: 2016-11-09T11:57:15Z
nws4
Found 1420024 tweets
Files: 11808
Size: 513 MB
Expires: 2016-11-09T13:39:12Z


## Test Credentials, GET Request
A GET request to this endpoint with the proper authentication returns a list of our current jobs.

In [62]:
# testAPI() GETs the jobs endpoint and returns the decoded JSON; pretty-print it.
# NOTE(review): `job` is not defined in any cell shown here — confirm which object is meant.
pprint.pprint(job.testAPI())

{'delivered': {'activityCount': 14147121,
               'jobCount': 13,
               'jobDaysRun': 513,
               'period': 'trial',
               'since': '2016-08-31T15:49:34Z'},
 'jobs': [{'expiresAt': '2016-10-31T22:46:07Z',
           'fromDate': '201608010000',
           'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/vttpj4nx3y.json',
           'percentComplete': 100,
           'publisher': 'twitter',
           'status': 'delivered',
           'streamType': 'track_v2',
           'title': 'ruby_job_test',
           'toDate': '201608012000',
           'uuid': 'vttpj4nx3y'},
          {'expiresAt': '2016-10-24T19:29:21Z',
           'fromDate': '201109060000',
           'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/td1ycaqex3.json',
           'percentComplete': 0,
           'publisher': 'twitter',
           'status': 'rejected',
           'st

In [231]:
# Keep the decoded jobs listing so the next cell can read res['jobs'].
res = job.testAPI()

In [232]:
# Print title, status and URL for the last seven entries of the jobs list,
# walking the list back to front.
jobs_back_to_front = res['jobs'][::-1]
for j in jobs_back_to_front[:7]:
    for field in ('title', 'status', 'jobURL'):
        print(j[field])
    print("\n")

HurricaneMatthewGeoQuery_NonWindSwath_1839
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6y1vwhh99c.json


HurricaneMatthewGeoQuery_NonWindSwath_1500
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/cwa4td0455.json


HurricaneMatthewGeoQuery_NonWindSwath_1000
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/dg7y41zawr.json


HurricaneMatthewGeoQuery_NonWindSwath_500
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/cvqwjryar1.json


HurricaneMatthewGeoQuery_WindSwath_1286
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/tk7vx3t9s2.json


HurricaneMatthewGeoQuery_WindSwath_1000
running
https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/j6add94ne7.json


HurricaneMatthewG