In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
# os - for saving and loading files
# json - to work with json files
# math - to round up results
# time - to add a short pause to not overwhelm the server
import os, json, math, time
from yelpapi import YelpAPI   # to make yelp api calls
from tqdm.notebook import tqdm_notebook   # to make a progress bar from tqdm_notebook

In [2]:
# CREDENTIALS AND ACCESSING THE API
with open('/Users/Rashad/.secret/yelp_api_rashadc.json') as rc:
    login = json.load(rc)

In [3]:
#quick display of the variable 'login' dictionary keys
login.keys()

dict_keys(['client-id', 'api-key'])

In [4]:
# Instantiate YelpAPI Variable
yelp_api = YelpAPI(login['api-key'], timeout_s=5.0)

In [5]:
### DEFINE SEARCH ###
# set your API call parameters 
LOCATION = 'Chicago'
TERM = 'deer'

In [6]:
# Specifying JSON_FILE filename (can include a folder)
# include the search terms in the filename
JSON_FILE = "Data/results_in_progress_Chicago_pizza.json"

In [7]:
#Function to create .json file and check it one already exists.

def create_json_file(JSON_FILE,  delete_if_exists=False):
        
    ## Check if JSON_FILE exists
    file_exists = os.path.isfile(JSON_FILE)
    
    ## If it DOES exist:
    if file_exists == True:
        
        ## Check if user wants to delete if exists
        if delete_if_exists==True:
            
            print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
            ## delete file and confirm it no longer exits.
            os.remove(JSON_FILE)
            ## Recursive call to function after old file deleted
            create_json_file(JSON_FILE,delete_if_exists=False)
        else:
            print(f"[i] {JSON_FILE} already exists.")            
            
            
    ## If it does NOT exist:
    else:
        
        ## INFORM USER AND SAVE EMPTY LIST
        print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")
        
        ## CREATE ANY NEEDED FOLDERS
        # Get the Folder Name only
        folder = os.path.dirname(JSON_FILE)
        
        ## If JSON_FILE included a folder:
        if len(folder)>0:
            # create the folder
            os.makedirs(folder,exist_ok=True)
        ## Save empty list to start the json file
        with open(JSON_FILE,'w') as f:
            json.dump([],f)

# Creating JSON file and process search pages needed.

In [8]:
## Create a new empty json file (exist the previous if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)

[!] Data/results_in_progress_Chicago_pizza.json already exists. Deleting previous file...
[i] Data/results_in_progress_Chicago_pizza.json not found. Saving empty list to new file.


In [9]:
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
## set offset based on previous results
n_results = len(previous_results)

In [10]:
#confirming empty json file was created.
print(f'- {n_results} previous results found.')

- 0 previous results found.


# API call to retrieve data.

In [11]:
# yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)

In [12]:
#quick view of data type
type(results)

dict

In [13]:
#quick view of dictionary keys
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [14]:
## How many did we get the details for?  This is the pagination size.
results_per_page = len(results['businesses'])

# Use math.ceil to round up for the total number of pages of results.
n_pages = math.ceil((results['total']-n_results) / results_per_page)

print(f'Total number of pages is {n_pages}')

Total number of pages is 41


In [15]:
print(f'n_results: {len(previous_results)} pluS {results_per_page} = {(len(previous_results) + results_per_page)}')

n_results: 0 pluS 20 = 20


# Exploring variable dictionary `results` to find target data.

In [16]:
results['businesses']

[{'id': 'PZe0q_153VHUnaR-8dOTJg',
  'alias': 'the-dearborn-chicago-2',
  'name': 'The Dearborn',
  'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/eSXeGiq2bRUjy7KOER4jeg/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/the-dearborn-chicago-2?adjust_creative=Rb-Ho11X0wH2IQkZpl-sCA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=Rb-Ho11X0wH2IQkZpl-sCA',
  'review_count': 2074,
  'categories': [{'alias': 'tradamerican', 'title': 'American (Traditional)'},
   {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'},
   {'alias': 'beer_and_wine', 'title': 'Beer, Wine & Spirits'}],
  'rating': 4.5,
  'coordinates': {'latitude': 41.8842528, 'longitude': -87.6293151},
  'transactions': ['delivery', 'pickup'],
  'price': '$$',
  'location': {'address1': '145 N Dearborn St',
   'address2': '',
   'address3': None,
   'city': 'Chicago',
   'zip_code': '60602',
   'country': 'US',
   'state': 'IL',
   'display_address': ['145 N Dearborn St', 'Chicago

# Loop with TQDM progress bar to load remaining pages into JSON file.

In [17]:
for i in tqdm_notebook( range(1,n_pages+1)):
    
    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results for to use as offset
    n_results = len(previous_results)
    
    if (n_results + results_per_page) > 1000:
        print('Exceeded 500 api calls. Stopping loop.')
        break
    
    ## use n_results as the OFFSET 
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM, 
                                    offset=n_results)
    
    ## append new results and save to file
    previous_results.extend(results['businesses'])
    
    # display(previous_results)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    
    time.sleep(2.2)

  0%|          | 0/41 [00:00<?, ?it/s]

In [19]:
#Convert final JSON file to dataframe.
df = pd.read_json(JSON_FILE)
df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,PZe0q_153VHUnaR-8dOTJg,the-dearborn-chicago-2,The Dearborn,https://s3-media2.fl.yelpcdn.com/bphoto/eSXeGi...,False,https://www.yelp.com/biz/the-dearborn-chicago-...,2074,"[{'alias': 'tradamerican', 'title': 'American ...",4.5,"{'latitude': 41.8842528, 'longitude': -87.6293...","[delivery, pickup]",$$,"{'address1': '145 N Dearborn St', 'address2': ...",13123841242,(312) 384-1242,4633.348427
1,VFv_EfqRQN2RTh9Nmj3tNA,arctic-circle-taxidermy-chicago,Arctic Circle Taxidermy,https://s3-media3.fl.yelpcdn.com/bphoto/EE06-p...,False,https://www.yelp.com/biz/arctic-circle-taxider...,6,"[{'alias': 'taxidermy', 'title': 'Taxidermy'}]",5.0,"{'latitude': 41.9528579711914, 'longitude': -8...",[],,"{'address1': '5637 W Irving Park Rd', 'address...",17732868000,(773) 286-8000,9201.472588
2,WqKKvM3sUszaIqneaa8EKA,waterfall-glen-forest-preserve-darien-2,Waterfall Glen Forest Preserve,https://s3-media2.fl.yelpcdn.com/bphoto/HmSfAd...,False,https://www.yelp.com/biz/waterfall-glen-forest...,141,"[{'alias': 'parks', 'title': 'Parks'}]",4.5,"{'latitude': 41.723035754423414, 'longitude': ...",[],,{'address1': 'Intersection Of Cass And Northga...,16309337200,(630) 933-7200,31736.718353
3,Sb7Jk4c4jv7HLblNGRxqKQ,sideshow-gallery-chicago,Sideshow Gallery,https://s3-media4.fl.yelpcdn.com/bphoto/PBLs0J...,False,https://www.yelp.com/biz/sideshow-gallery-chic...,24,"[{'alias': 'galleries', 'title': 'Art Gallerie...",5.0,"{'latitude': 41.922305053523, 'longitude': -87...",[],$$,"{'address1': '2219 N Western Ave', 'address2':...",17732761300,(773) 276-1300,2063.047707
4,fLwNhXatcSoQEI8XeFbrtg,bangers-and-lace-wicker-park-chicago-2,Bangers & Lace Wicker Park,https://s3-media1.fl.yelpcdn.com/bphoto/fHPirO...,False,https://www.yelp.com/biz/bangers-and-lace-wick...,609,"[{'alias': 'pubs', 'title': 'Pubs'}, {'alias':...",4.0,"{'latitude': 41.90357, 'longitude': -87.67023}","[delivery, pickup]",$$,"{'address1': '1670 W Division St', 'address2':...",17732526499,(773) 252-6499,645.478132


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779 entries, 0 to 778
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             779 non-null    object 
 1   alias          779 non-null    object 
 2   name           779 non-null    object 
 3   image_url      779 non-null    object 
 4   is_closed      779 non-null    bool   
 5   url            779 non-null    object 
 6   review_count   779 non-null    int64  
 7   categories     779 non-null    object 
 8   rating         779 non-null    float64
 9   coordinates    779 non-null    object 
 10  transactions   779 non-null    object 
 11  price          493 non-null    object 
 12  location       779 non-null    object 
 13  phone          779 non-null    object 
 14  display_phone  779 non-null    object 
 15  distance       779 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 92.2+ KB


# To convert the .json file to a compressed .csv.gz

In [None]:
# Convert the filename to a .csv.gz
#    csv_file = JSON_FILE.replace('.json', '.csv.gz')

# Save .csv file as a compressed csv
#    df.to_csv(csv_file, compression = 'gzip', index = False)