# GNIP Historical PowerTrack 2.0 API

In [8]:
# Working directory for this query; the rules file and results manifest live here.
JOB_DIR = './gnip-keyword-query'
# YAML job description, kept inside the job directory.
JOB_CONFIG = JOB_DIR + '/keyword_query_job.yml'

In [9]:
import json, yaml, requests, sys, os, pprint
from requests.auth import HTTPBasicAuth

In [None]:
# Shared root of the Historical PowerTrack "jobs" resource for this account.
_JOBS_ROOT = 'https://gnip-api.gnip.com/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs'

# Jobs collection URL: POSTed to when submitting a job, GET to list jobs.
ENDPOINT = _JOBS_ROOT + '.json'
# Prefix under the jobs resource (presumably for per-job URLs; not used in the visible cells).
BASE_URI = _JOBS_ROOT + '/'

class HistoricalPowerTrackJob:
    """Client for a single GNIP Historical PowerTrack 2.0 job.

    API reference:
    http://support.gnip.com/apis/historical_api2.0/api_reference.html#Create

    Every request method stores the raw ``requests`` response on the instance
    (``test_response``, ``status_response``, ``submit_response``,
    ``reject_response``, ``accept_response``) and returns its decoded JSON body.
    ``status()``, ``accept()`` and ``reject()`` need the job URL, which must be
    provided through ``set_url()`` (or by assigning ``job.url``) beforehand.
    """

    # Credentials file used when the caller does not pass one explicitly.
    DEFAULT_CREDENTIALS = '/home/anderstj/credentials/gnip.yaml'

    def __init__(self, config, credentials=None):
        """Load the job configuration and build HTTP basic-auth credentials.

        Args:
            config: Path to the YAML job file. This class reads the keys
                ``title``, ``from_date``, ``to_date`` and ``rules_file``.
            credentials: Optional path to a YAML file holding ``username`` and
                ``password``. Defaults to ``DEFAULT_CREDENTIALS``.
        """
        # safe_load only builds plain Python objects; yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML releases.
        with open(config, 'r') as config_file:
            self.config = yaml.safe_load(config_file)

        # Create authentication
        with open(credentials or self.DEFAULT_CREDENTIALS, 'r') as auth_file:
            auth = yaml.safe_load(auth_file)
        self.auth = HTTPBasicAuth(auth['username'], auth['password'])

        # Unknown until the job has been submitted; see set_url().
        self.url = None

    def load_rules(self):
        """Return the rules parsed from ``JOB_DIR/<config['rules_file']>`` (JSON).

        If the file cannot be located, read, or parsed, an error is printed and
        the interpreter exits with status 1 (``SystemExit``).
        """
        try:
            rules_path = JOB_DIR + "/" + self.config['rules_file']
            with open(rules_path, 'r') as rules_file:
                return json.load(rules_file)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still interrupts; the
            # cause is reported so a bad path and bad JSON can be told apart.
            print("ERROR parsing rules")
            print("  cause: %r" % (err,), file=sys.stderr)
            sys.exit(1)

    def make_payload(self):
        """Build the job-creation request body and store it in ``self.payload``."""
        self.payload = {
            "publisher": "twitter",
            "streamType": "track_v2",
            "dataFormat": "activity-streams",

            "title": self.config['title'],
            "fromDate": self.config['from_date'],
            "toDate": self.config['to_date'],

            "rules": self.load_rules()
        }

    def _job_url(self):
        """Return the job URL, failing with a clear message if it was never set."""
        url = getattr(self, 'url', None)
        if not url:
            raise AttributeError(
                "job URL is not set; call set_url(<jobURL>) before this request")
        return url

    def testAPI(self):
        """GET the jobs collection endpoint and return the decoded JSON."""
        self.test_response = requests.get(ENDPOINT, auth=self.auth)
        return self.test_response.json()

    def status(self):
        """GET the job URL and return the decoded JSON status document.

        Raises:
            AttributeError: if no job URL has been set.
        """
        url = self._job_url()
        self.status_response = requests.get(url, auth=self.auth)
        return self.status_response.json()

    def set_url(self, url):
        """Remember the job URL used by ``status()``, ``accept()`` and ``reject()``."""
        self.url = url

    def submit(self):
        """Build the payload, POST it to the jobs endpoint and return the JSON reply."""
        self.make_payload()
        print("Submitting Job with payload:\n", self.payload)
        self.submit_response = requests.post(ENDPOINT,
                                             auth=self.auth,
                                             json=self.payload)
        return self.submit_response.json()

    def reject(self):
        """PUT ``{"status": "reject"}`` to the job URL and return the JSON reply.

        Raises:
            AttributeError: if no job URL has been set.
        """
        url = self._job_url()
        self.reject_response = requests.put(url, auth=self.auth, json={"status": "reject"})
        return self.reject_response.json()

    def accept(self):
        """PUT ``{"status": "accept"}`` to the job URL and return the JSON reply.

        Raises:
            AttributeError: if no job URL has been set.
        """
        url = self._job_url()
        self.accept_response = requests.put(url, auth=self.auth, json={"status": "accept"})
        return self.accept_response.json()

In [32]:
# Build the job from its YAML config (this also reads the credentials file),
# then assemble the request payload without submitting it. make_payload()
# reads the rules file and exits if that file cannot be parsed.
job = HistoricalPowerTrackJob(JOB_CONFIG)
job.make_payload()
# Pretty-print the payload for review before submitting.
pprint.pprint(job.payload)

{'dataFormat': 'activity-streams',
 'fromDate': '201609250000',
 'publisher': 'twitter',
 'rules': [{'tag': 'hurricane', 'value': 'hurricane OR ouragan'},
           {'tag': 'siklon', 'value': 'siklón'},
           {'tag': 'haiti', 'value': 'haiti'},
           {'tag': 'matthew', 'value': 'hurricanematthew OR matthew'},
           {'tag': 'mathieu', 'value': 'ouraganmathieu OR mathieu'},
           {'tag': 'matthewhaiti', 'value': 'matthewhaiti'},
           {'tag': 'matthew', 'value': '(matye ayiti) OR matyeayiti'},
           {'tag': 'salut', 'value': 'port salut'},
           {'tag': 'cayes', 'value': 'les cayes'},
           {'tag': 'jeremie', 'value': 'jeremie OR jérémie'},
           {'tag': 'prayforhaiti',
            'value': '(pray for haiti) OR prayforhaiti OR pray4haiti'},
           {'tag': 'jeremie', 'value': 'jeremie OR jérémie'},
           {'tag': 'tiburonpenninsula',
            'value': '(tiburon penninsula) OR tiburonpenninsula OR '
                     '(península d

# Historical PowerTrack Request

## Part 1: Submit the job

In [25]:
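# Left disabled on purpose: re-submitting with an existing title is rejected
# by the API with "has already been taken". Uncomment to submit a new job.
#job.submit()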

Submitting Job with payload:
 {'toDate': '201610220000', 'dataFormat': 'activity-streams', 'title': 'HurricaneMatthewKeywordQuery_v1', 'publisher': 'twitter', 'rules': [{'value': 'hurricane OR ouragan', 'tag': 'hurricane'}, {'value': 'siklón', 'tag': 'siklon'}, {'value': 'haiti', 'tag': 'haiti'}, {'value': 'hurricanematthew OR matthew', 'tag': 'matthew'}, {'value': 'ouraganmathieu OR mathieu', 'tag': 'mathieu'}, {'value': 'matthewhaiti', 'tag': 'matthewhaiti'}, {'value': '(matye ayiti) OR matyeayiti', 'tag': 'matthew'}, {'value': 'port salut', 'tag': 'salut'}, {'value': 'les cayes', 'tag': 'cayes'}, {'value': 'jeremie OR jérémie', 'tag': 'jeremie'}, {'value': '(pray for haiti) OR prayforhaiti OR pray4haiti', 'tag': 'prayforhaiti'}, {'value': 'jeremie OR jérémie', 'tag': 'jeremie'}, {'value': '(tiburon penninsula) OR tiburonpenninsula OR (península de tiburón)', 'tag': 'tiburonpenninsula'}, {'value': 'bahamas OR jamaica OR cuba', 'tag': 'bahamas_jamaica_cuba'}, {'value': '(pray for flor

{'reason': {'title': ['has already been taken']}, 'status': 'error'}

In [52]:
# Show the JSON body of the last accept() call. `accept_response` is only set
# by job.accept(), so this cell fails unless accept() has run in this session.
job.accept_response.json()

{'acceptedAt': '2016-10-23T22:23:41Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201609250000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json',
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 14988000,
  'estimatedDurationHours': '10.0',
  'estimatedFileSizeMb': '8335.06',
  'expiresAt': '2016-10-30T22:23:12Z'},
 'requestedAt': '2016-10-23T22:17:07Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'status': 'accepted',
 'statusMessage': 'Job accepted and ready to be queued.',
 'streamType': 'track_v2',
 'title': 'HurricaneMatthewKeywordQuery_v1',
 'toDate': '201610220000'}

## Part 2: Get Job URL

In [34]:
# Point the job at its GNIP job URL. The UUID of every job (latest at the
# bottom) can be read from the 'jobs' list returned by job.testAPI().
job.set_url('https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json')

# Confirm the URL against the API: status() is a method and must be *called*
# before indexing, and set_url() must be called rather than assigned over.
job.set_url(job.status()['jobURL'])

# After a successful job.submit() the URL can be taken from the response instead:
# job.set_url(job.submit_response.json()['jobURL'])
# This raises KeyError when the submission failed: the error response
# ({'reason': ..., 'status': 'error'}) carries no 'jobURL' key.

## Part 3: Check Job Status (Wait for Quote to Finish)

In [37]:
# GET the job URL and display the decoded JSON status document (requires job.url to be set).
job.status()

{'acceptedAt': '2016-10-23T22:23:41Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201609250000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json',
 'percentComplete': 0,
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 14988000,
  'estimatedDurationHours': '10.0',
  'estimatedFileSizeMb': '8335.06',
  'expiresAt': '2016-10-30T22:23:12Z'},
 'requestedAt': '2016-10-23T22:17:07Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'status': 'running',
 'statusMessage': 'Job queued and being processed.',
 'streamType': 'track_v2',
 'title': 'HurricaneMatthewKeywordQuery_v1',
 'toDate': '201610220000'}

## Part 4: Accept or Reject the Job

In [36]:
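# Uncomment exactly one of these after reviewing the 'quote' in the status above.
# Both send a PUT to the job URL.
#job.reject()
#job.accept()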

{'acceptedAt': '2016-10-23T22:23:41Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201609250000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json',
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 14988000,
  'estimatedDurationHours': '10.0',
  'estimatedFileSizeMb': '8335.06',
  'expiresAt': '2016-10-30T22:23:12Z'},
 'requestedAt': '2016-10-23T22:17:07Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'status': 'accepted',
 'statusMessage': 'Job accepted and ready to be queued.',
 'streamType': 'track_v2',
 'title': 'HurricaneMatthewKeywordQuery_v1',
 'toDate': '201610220000'}

## Part 5: Check the status again and see how it's doing..

In [55]:
# Re-poll the job; re-run this cell until 'percentComplete' reaches 100 and a 'results' entry appears.
job.status()

{'acceptedAt': '2016-10-23T22:23:41Z',
 'acceptedBy': 'jennings.anderson@colorado.edu',
 'account': 'CUResearch',
 'format': 'activity_streams',
 'fromDate': '201609250000',
 'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28.json',
 'percentComplete': 100,
 'publisher': 'twitter',
 'quote': {'estimatedActivityCount': 14988000,
  'estimatedDurationHours': '10.0',
  'estimatedFileSizeMb': '8335.06',
  'expiresAt': '2016-10-30T22:23:12Z'},
 'requestedAt': '2016-10-23T22:17:07Z',
 'requestedBy': 'jennings.anderson@colorado.edu',
 'results': {'activityCount': 13830252,
  'completedAt': '2016-10-24T00:30:05Z',
  'dataURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/6f8p6x5g28/results.json',
  'expiresAt': '2016-11-08T00:29:08Z',
  'fileCount': 3888,
  'fileSizeMb': '7956.12'},
 'status': 'delivered',
 'statusMessage': 'Job delivered and available for download.',
 'streamT

## Final Part: Retrieve the Results... Once job is complete!
Check the status of the job until the results.json file is ready and then save it.

In [56]:
# Fetch the latest status document; the 'results' entry is read below and by the next cell.
status = job.status()
activity_count = status['results']['activityCount']
print("Found %d tweets"%(activity_count))

Found 13830252 tweets


In [67]:
# Download the results manifest (JSON) from the job's dataURL using the job's credentials.
manifest_response = requests.get(status['results']['dataURL'], auth=job.auth)
result_files = manifest_response.json()

# Summarise the manifest: file count, total size (bytes -> MB, 1 MB = 1048576 bytes), expiry.
size_mb = result_files['totalFileSizeBytes']/1048576
summary_fields = (result_files['urlCount'], size_mb, result_files['expiresAt'])
print("Files: %d\nSize: %d MB\nExpires: %s"%summary_fields)

Files: 3888
Size: 7956 MB
Expires: 2016-11-08T00:29:08Z


dict_keys(['expiresAt', 'totalFileSizeBytes', 'urlCount', 'urlList'])


In [64]:
# Save the results manifest inside the job directory as results.json.
results_path = "/".join([JOB_DIR, "results.json"])
with open(results_path, 'w') as manifest_out:
    manifest_out.write(json.dumps(result_files))

## Test Credentials, GET Request
A GET request to this endpoint with the proper authentication returns a summary of delivered activity and a list of our current jobs.

In [41]:
# GET the jobs collection endpoint with the job's credentials and pretty-print the decoded JSON.
pprint.pprint(job.testAPI())

{'delivered': {'activityCount': 316869,
               'jobCount': 12,
               'jobDaysRun': 486,
               'period': 'trial',
               'since': '2016-08-31T15:49:34Z'},
 'jobs': [{'expiresAt': '2016-10-31T22:46:07Z',
           'fromDate': '201608010000',
           'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/vttpj4nx3y.json',
           'percentComplete': 100,
           'publisher': 'twitter',
           'status': 'delivered',
           'streamType': 'track_v2',
           'title': 'ruby_job_test',
           'toDate': '201608012000',
           'uuid': 'vttpj4nx3y'},
          {'expiresAt': '2016-10-24T19:29:21Z',
           'fromDate': '201109060000',
           'jobURL': 'https://gnip-api.gnip.com:443/historical/powertrack/accounts/CUResearch/publishers/twitter/jobs/td1ycaqex3.json',
           'percentComplete': 0,
           'publisher': 'twitter',
           'status': 'rejected',
           'stre