In [None]:
#Import modules
import requests
from datetime import datetime
import json
from termcolor import colored
import os


# Fetch the profiles of 50,000 dogs from PetFinder 

## Setting up

Before you start, decide where you want to keep the data collected from PetFinder (and the data generated from all of the analyses)

In [None]:
#folder (a path relative to where the notebook is run) that the raw and filtered PetFinder json files are written to
pf_data_folder = "petfinder_data"

Sign up for API access through PetFinder, and they will provide an "API Key" *(client_id)* and "Secret" *(client_secret)*. Use these to request an access token, then save the value as "token"

In [None]:
#credentials are read from the environment when the variables are set, so they never have to be saved in the notebook
##if the variables are not set, replace the placeholder strings below with your own values
data = {
    'grant_type': 'client_credentials',
    'client_id': os.environ.get('PETFINDER_API_KEY', '<YOUR API KEY>'),
    'client_secret': os.environ.get('PETFINDER_SECRET', '<YOUR SECRET HERE>'),
}

#the timeout keeps this cell from hanging forever if the server never answers
token_response = requests.post('https://api.petfinder.com/v2/oauth2/token', data=data, timeout=30)

#stop right here with the HTTP error if the request was rejected (for example, because of bad credentials)
token_response.raise_for_status()

#parse the json body of the response, giving us a dictionary
token_resp_parsed = token_response.json()

#use .get() so that a response without a token gives None instead of a KeyError
##note: this is what lets the check below actually run
token = token_resp_parsed.get('access_token')

#check that you actually retrieved a token. if you dont, force exit with an error message
if not token:
    raise Exception(colored('no token found', 'red'))


## Using the generated token, fetch the profiles of 50,000 dogs
**Note:** The absolute max you can retrieve in a day with basic API access is 100,000


Max profiles per page is 100, pages are requested one at a time, and basic access only allows for 1000 requests/day

### Collect 50,000 profiles by retrieving the most recent 500 pages of results

In [None]:
#create a list to append each page of data to
merged_results = []

#you can only retrieve 100 dogs per page, so 500 pages are needed to collect 50,000 profiles
pages_to_fetch = 500
profiles_per_page = 100  #the petfinder limit for dogs per page is 100
expected_profiles = pages_to_fetch * profiles_per_page

#the authorization header is identical for every request, so build it once before the loop
headers = {
    'Authorization': f'Bearer {token}',
}

#because this process takes so long, print a message for every 100 pages retrieved
print('PROGRESS...')

for num in range(1, pages_to_fetch + 1):
    #only select dogs, and select the page number corresponding to the loop num
    param = {
        'type': 'Dog',
        'limit': profiles_per_page,
        'page': num,
    }

    #the timeout keeps a single stalled request from hanging the whole loop
    response = requests.get('https://api.petfinder.com/v2/animals', headers=headers, params=param, timeout=60)

    #fail immediately with the HTTP error if the request was rejected,
    ##rather than with a confusing KeyError on 'animals' below
    response.raise_for_status()

    #parse the json body of the response; results['animals'] is a list of dictionaries (one dictionary per dog)
    results = response.json()
    merged_results.extend(results['animals'])

    if num % 100 == 0:
        print(f'   {num} pages retrieved')

#record the date (in your time zone) that you collected the data
run_date = datetime.today().strftime('%Y-%m-%d')

#double check that you have all of the expected results before saving this raw data as a json
if len(merged_results) < expected_profiles:
    print(colored(f'WARNING! Only {len(merged_results)} profiles recovered', 'red'))

else:
    print(colored(f'Successfully recovered {expected_profiles:,} profiles', 'green'))
    #make sure the output folder exists, otherwise open() raises FileNotFoundError
    os.makedirs(pf_data_folder, exist_ok=True)
    with open(f'{pf_data_folder}/{run_date}.json', 'w') as f:
        json.dump(merged_results, f, indent=4)


### Make sure all of the expected data is present and remove any duplicate profiles
**Note:** Wait until the previous chunk is complete (and the green text is printed) before continuing. It will take a while.

In [None]:
#create a new list in which there are no duplicate profiles, using "id" to determine if a profile is unique
unique_ids = []
animals = []

#holds the same ids as unique_ids, but as a set so that each "have we seen this id?" check is fast
seen_ids = set()

for dog in merged_results:
    #note: any(unique_ids) == dog['id'] would compare True/False against the id and never catch a real duplicate,
    ##so test for membership directly
    if dog['id'] in seen_ids:
        print(f'Dog number {dog["id"]} is a duplicate')
    else:
        seen_ids.add(dog['id'])
        unique_ids.append(dog['id'])
        animals.append(dog)

#sanity check: every profile that was kept has a distinct id
assert len(unique_ids) == len(set(unique_ids)), 'duplicate ids remain after filtering'

#check the number of dog profiles
print('number of dogs:', len(animals))


## Create a json file containing only specified data from each dog

Since there is a lot of extra information in the original json (and the data is in a nested format), create a new json where each dog has a single dictionary of only relevant information.

This flattens the data and makes it easier to work with.

### Create a list of dictionaries, with each dictionary containing only the relevant data for a single dog

In [None]:
#create empty list to put the dictionary for each dog into
dog_data = []

for dog in animals:
    #combine both breed fields into one list instead of having two separate keys (primary_breed and secondary_breed)
    ##if there isn't a secondary breed listed, only the primary breed is included (no "None" entry)
    breeds = dog['breeds']
    breed_list = [breeds['primary']]
    if breeds['secondary'] is not None:
        breed_list.append(breeds['secondary'])

    address = dog['contact']['address']

    #extract the data you care about, storing it in a new dictionary for each dog
    ##a separate name is used so the original profile in "dog" is not overwritten while it is still being read
    dog_record = {
        'id': dog['id'],
        'name': dog['name'],
        'gender': dog['gender'],
        'age': dog['age'],
        'breeds': breed_list,
        'spayed_neutered': dog['attributes']['spayed_neutered'],
        'date': dog['published_at'],
        'org_id': dog['organization_id'],
        'state': address['state'],
        'country': address['country'],
        #note: date_fetched is recorded in the local time zone of the machine that ran the fetch.
        ##if the publish date is in UTC (TODO confirm), a profile can appear to be published after it was fetched
        'date_fetched': run_date,
    }

    #now add this dog's dictionary to the list
    dog_data.append(dog_record)

### Save the filtered results as a json

In [None]:
#write the output to a new file inside the data folder
filt_reslt_filename = 'filtered_results.json'

#make sure the data folder exists, otherwise open() raises FileNotFoundError
os.makedirs(pf_data_folder, exist_ok=True)

##if a file of that name already exists in the data folder, add the date to avoid overwriting it
##note: the check has to look inside pf_data_folder, because that is where the file is written
if os.path.exists(os.path.join(pf_data_folder, filt_reslt_filename)):
    filt_reslt_filename = f'{run_date}_{filt_reslt_filename}'

with open(os.path.join(pf_data_folder, filt_reslt_filename), 'w') as f:
    json.dump(dog_data, f, indent=2)