# Efficient Yelp API Calls

## Adding Safeguards to our Data Extraction Workflow

## Import Libraries

In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

## Custom Function for processing

In [11]:
def create_json_file(JSON_FILE, delete_if_exists=False):
    """Make sure JSON_FILE exists, starting it as an empty JSON list when needed.

    Parameters
    ----------
    JSON_FILE : str
        Path of the json file; it may include folders, which are created
        when missing.
    delete_if_exists : bool, default False
        When equal to True, an existing file is removed and replaced by a
        fresh file holding an empty list. Otherwise an existing file is
        left untouched.

    Returns
    -------
    None
        The function only prints status messages and touches the file system.
    """
    ## Same strict comparison as before: only values equal to True reset the file
    wants_reset = delete_if_exists == True

    if os.path.isfile(JSON_FILE):
        ## Keep the existing file unless a reset was requested
        if not wants_reset:
            print(f"[i] {JSON_FILE} already exists.")
            return

        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)
        ## fall through: the file is gone, so it is re-created below

    ## From here on the file does not exist
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## Create the parent folder(s), but only if the path names one
    parent_dir = os.path.dirname(JSON_FILE)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    ## An empty list is the starting point for the results file
    with open(JSON_FILE, "w") as f:
        json.dump([], f)


## Credentials and Accessing the API

In [2]:
# Load API Credentials
# The credentials path can be supplied through the YELP_API_CREDENTIALS
# environment variable so the notebook runs on other machines without editing
# this cell; when the variable is not set, the original location is used.
CREDENTIALS_FILE = os.environ.get('YELP_API_CREDENTIALS',
                                  r'/Users/OM22285/.secret/yelp_api.json')   #use your path here!
with open(CREDENTIALS_FILE) as f:
    # the json file is expected to hold the key under 'api-key'
    login = json.load(f)
# Instantiate YelpAPI Variable
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

**Define Search**

To allow us to easily perform different searches in the future, we will define variables for LOCATION and TERM set for our particular search conditions. Then, when we want to use a different location or term, we can just redefine these variables. This streamlines the code and makes it more readable and reproducible.

In [3]:
# Set our API call parameters.
# LOCATION and TERM are passed to yelp_api.search_query as `location` and `term`;
# redefine them here to run a different search.
LOCATION = 'IL,IL'
TERM = 'Burger'

In [4]:
# Specifying JSON_FILE filename (can include a folder)
# include the search terms in the filename
# This file stores the results gathered so far; later cells reload it
# and use its length as the offset for the next API call.
JSON_FILE = "Coding Dojo/Week 18/Data/results_in_progress_IL_Burger.json"
# display the path
JSON_FILE

'Coding Dojo/Week 18/Data/results_in_progress_IL_Burger.json'

In [5]:
## Make sure the results-in-progress file is in place before any API call
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    # Keep whatever an earlier run saved, and just inform the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## The path may include a folder that has not been created yet
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Inform the user, then start the file off as an empty list
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)


[i] Coding Dojo/Week 18/Data/results_in_progress_IL_Burger.json not found. Saving empty list to file.


In [6]:
## Reload everything that has been saved so far
with open(JSON_FILE) as f:
    previous_results = json.load(f)

## The number of saved results doubles as the offset of the next API call
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


**Figure out how many pages of results we will need**


In [7]:
# Gather the search arguments, then make the API call through yelp_api.search_query.
# The offset skips the results that are already saved in JSON_FILE.
search_kwargs = dict(location=LOCATION, term=TERM, offset=n_results)
results = yelp_api.search_query(**search_kwargs)
# show which keys the response contains
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
## How many results total?
# 'total' is one of the keys of the search response (next to 'businesses' and 'region')
total_results = results['total']
total_results

5500

In [9]:
## How many did we get the details for?
# The length of the 'businesses' list is the number of results in this single response (one "page")
results_per_page = len(results['businesses'])
results_per_page

20

There are 5500 businesses to retrieve from our API, and we can get 20 results at a time (per "page").

In [10]:
# Use math.ceil to round up for the total number of pages of results.
# Only the results that are not saved yet still need to be requested.
remaining_results = results['total'] - n_results
# An empty 'businesses' list gives results_per_page == 0 (for example when
# resuming after everything was already saved); there is then nothing left to
# request, so use 0 pages instead of dividing by zero.
n_pages = math.ceil(remaining_results / results_per_page) if results_per_page else 0
n_pages

275

When this example was written, there were 5500 results and 20 results per page

5500 / 20 = 275 pages.

Now that we have our new function, we can use it with `delete_if_exists=True` to delete our previous results and start over. We will also need to repeat the steps that create the `n_results`, `total_results`, `results_per_page`, and `n_pages` variables, which we defined before our first attempted loop.

In [12]:
## Create a new empty json file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)

## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)

## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

# use our yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)

## How many results total?
total_results = results['total']
## How many did we get the details for?
results_per_page = len(results['businesses'])

# Use math.ceil to round up for the total number of pages of results.
# If the response holds no businesses, results_per_page is 0: there is nothing
# to page through, so use 0 pages instead of raising ZeroDivisionError.
if results_per_page:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

[!] Coding Dojo/Week 18/Data/results_in_progress_IL_Burger.json already exists. Deleting previous file...
[i] Coding Dojo/Week 18/Data/results_in_progress_IL_Burger.json not found. Saving empty list to new file.
- 0 previous results found.


275

In [13]:
## Request one page per iteration. The results file is re-read and re-written
## on every pass, so an interrupted run can be resumed from what is on disk.
for i in tqdm_notebook( range(1,n_pages+1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results to use as offset
    n_results = len(previous_results)

    ## Stop before requesting results past the 1000th
    ## (at 20 results per page, 1000 results are the 50 API calls named in the message).
    ## NOTE(review): presumably this mirrors the search endpoint's result cap - confirm in the Yelp docs.
    if (n_results + results_per_page) > 1000:
        print('Exceeded 50 api calls. Stopping loop.')
        break

    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    ## An empty page means the API has nothing more to give for this search.
    ## Without this check the loop would keep repeating the same call (same
    ## offset) until n_pages is used up.
    new_businesses = results['businesses']
    if len(new_businesses) == 0:
        print('No new results returned. Stopping loop.')
        break

    ## append new results and save to file
    previous_results.extend(new_businesses)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)

    ## short pause between consecutive API calls
    time.sleep(.2)

  0%|          | 0/275 [00:00<?, ?it/s]

Exceeded 50 api calls. Stopping loop.


## After the Loop has finished

**Convert .json to dataframe**
Load the "results in progress" JSON file into a DataFrame.

In [14]:
# load final results
# pd.read_json turns the saved list of business records into a DataFrame
final_df = pd.read_json(JSON_FILE)
# show the first and the last rows
display(final_df.head(), final_df.tail())

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,xoi7Cw7FoknAx5p880RtWQ,au-cheval-chicago,Au Cheval,https://s3-media3.fl.yelpcdn.com/bphoto/td7RDA...,False,https://www.yelp.com/biz/au-cheval-chicago?adj...,8639,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,"{'latitude': 41.88466, 'longitude': -87.647668}","[delivery, pickup]",$$,"{'address1': '800 W Randolph St', 'address2': ...",13129294580,(312) 929-4580,3376.598427
1,k7Izs1AKrIUIYmN4oSCijQ,bianca-s-burgers-chicago,Bianca’s Burgers,https://s3-media3.fl.yelpcdn.com/bphoto/Hsm8nA...,False,https://www.yelp.com/biz/bianca-s-burgers-chic...,17,"[{'alias': 'burgers', 'title': 'Burgers'}]",5.0,"{'latitude': 41.902726, 'longitude': -87.690483}","[delivery, pickup]",,"{'address1': '2525 Division St', 'address2': '...",17737706251,(773) 770-6251,1080.104817
2,gzhkdb6YoiFm5s3vriG1AA,gretel-chicago,Gretel,https://s3-media1.fl.yelpcdn.com/bphoto/TcPX2g...,False,https://www.yelp.com/biz/gretel-chicago?adjust...,240,"[{'alias': 'beer_and_wine', 'title': 'Beer, Wi...",4.5,"{'latitude': 41.917275, 'longitude': -87.698577}","[delivery, pickup]",$$,"{'address1': '2833 W Armitage Ave', 'address2'...",17737703427,(773) 770-3427,2186.522108
3,7Es7EbTy_s1btAdowLeiPQ,small-cheval-wicker-park-chicago,Small Cheval- Wicker Park,https://s3-media3.fl.yelpcdn.com/bphoto/oyVPVN...,False,https://www.yelp.com/biz/small-cheval-wicker-p...,1399,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",4.5,"{'latitude': 41.9128227233887, 'longitude': -8...","[delivery, pickup]",$$,"{'address1': '1732 N Milwaukee Ave', 'address2...",13128373859,(312) 837-3859,910.982141
4,Ydf5dgFsGhMSP61Ht7TekA,butcher-and-the-burger-chicago,Butcher & The Burger,https://s3-media4.fl.yelpcdn.com/bphoto/miImbr...,False,https://www.yelp.com/biz/butcher-and-the-burge...,1131,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",4.0,"{'latitude': 41.91787, 'longitude': -87.65423}","[delivery, pickup]",$$,"{'address1': '1021 W Armitage Ave', 'address2'...",17736973735,(773) 697-3735,2406.33877


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
995,wjACjySYGlQ8_mx8it3FQw,mr-broast-des-plaines,Mr Broast,https://s3-media2.fl.yelpcdn.com/bphoto/TRHurv...,False,https://www.yelp.com/biz/mr-broast-des-plaines...,92,"[{'alias': 'burgers', 'title': 'Burgers'}]",3.5,"{'latitude': 42.0092501, 'longitude': -87.8858...","[delivery, pickup]",$$,"{'address1': '7104 Mannheim Rd', 'address2': '...",18478135797,(847) 813-5797,20760.011523
996,J_6biyUoo3wYneF3j4LKWw,osteria-via-stato-chicago,Osteria Via Stato,https://s3-media3.fl.yelpcdn.com/bphoto/h2EPXE...,False,https://www.yelp.com/biz/osteria-via-stato-chi...,1015,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 41.893095, 'longitude': -87.6282449}","[delivery, pickup]",$$,"{'address1': '620 N State St', 'address2': '',...",13126428450,(312) 642-8450,4310.411906
997,z8G5AfXYbJD2jQpSEF633g,michaels-original-pizzeria-and-tavern-chicago,Michael's Original Pizzeria & Tavern,https://s3-media4.fl.yelpcdn.com/bphoto/fzoCYS...,False,https://www.yelp.com/biz/michaels-original-piz...,806,"[{'alias': 'pubs', 'title': 'Pubs'}, {'alias':...",4.5,"{'latitude': 41.95681, 'longitude': -87.65173}","[delivery, pickup]",$$,"{'address1': '4091 N Broadway St', 'address2':...",17739294149,(773) 929-4149,6139.305618
998,C9g-N9uXfbCzz7Ru6y3hsA,firehouse-grill-chicago,Firehouse Grill,https://s3-media1.fl.yelpcdn.com/bphoto/kyPOOh...,False,https://www.yelp.com/biz/firehouse-grill-chica...,36,"[{'alias': 'hotdog', 'title': 'Hot Dogs'}, {'a...",3.5,"{'latitude': 41.866155, 'longitude': -87.765773}","[pickup, delivery]",$,"{'address1': '5646 W Roosevelt Rd', 'address2'...",17736260600,(773) 626-0600,8478.720509
999,qN-JQU24tZJniMm34slBFg,torali-chicago-2,Torali,https://s3-media1.fl.yelpcdn.com/bphoto/ZlePXo...,False,https://www.yelp.com/biz/torali-chicago-2?adju...,50,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.0,"{'latitude': 41.89777438629329, 'longitude': -...",[delivery],$$$,"{'address1': '160 E Pearson St', 'address2': '...",13125735160,(312) 573-5160,4655.257231


**Check for duplicates** 
Check for and remove any duplicate results.

In [15]:
# check for duplicate ID's
# duplicated(subset='id') marks rows whose 'id' value already appeared in an earlier row;
# .sum() counts the marked rows
final_df.duplicated(subset='id').sum()

0

In [16]:
## Drop duplicate ids and confirm there are no more duplicates
# final_df is rebound to the de-duplicated frame; re-running the cell gives the same result
final_df = final_df.drop_duplicates(subset='id')
# should be 0 after the drop
final_df.duplicated(subset='id').sum()

0

**Save the final DataFrame to a .csv (or a csv.gz if it's too big for the GitHub file size limit)**

In [17]:
# save the final results to a compressed csv
# index=False leaves the row index out of the file.
# NOTE(review): this file name says "Burgers" while JSON_FILE uses "Burger", and it
# still carries the "results_in_progress" prefix - confirm the intended name.
final_df.to_csv('Coding Dojo/Week 18/Data/results_in_progress_IL_Burgers.csv.gz', compression='gzip',index=False)