# Importing Libraries

In [1]:
import pandas as pd
from yelpapi import YelpAPI
import os, time, math, json
from tqdm.notebook import tqdm_notebook

This notebook is going to extract data from the Yelp API. Specifically the data for businesses in Miami FL that serve Cuban food. I will start by bringing in my credentials and instantiating the API

# Api

In [2]:
# Opening my API creds
with open('/Users/Ray/.secret/yelp_api.json', 'r') as f:
    login = json.load(f)
    
login.keys()

dict_keys(['client-id', 'api-key'])

In [3]:
# Instantiating the API
yelpcall = YelpAPI(login['api-key'], timeout_s = 5.0)
yelpcall

<yelpapi.yelpapi.YelpAPI at 0x21cac47b400>

In [4]:
# Setting the keywords for the API calls
location = 'Miami, FL'
term = 'Cuban'

# File 
Now to create a folder and a file to store the results of my api calls

In [5]:
# File name to use later on
json_file = 'Data/previous_results.json'

# IF statement to create the file
file_exists = os.path.isfile(json_file)
if file_exists == False:
    
    # this will create a folder for the json file IF the json file name had a folder
    folder = os.path.dirname(json_file)
    
    if len(folder)>0:
        os.makedirs(folder, exist_ok = True)
        
    # Print a status to the notebook    
    print(f'[info] {json_file} not found. Saving empty list to file')
    
    # Saving an empty list to json_file
    with open(json_file, "w") as f:
        json.dump(obj = [], fp = f)

else:
    print(f'[info] {json_file} already exists')


[info] Data/previous_results.json already exists


# Prepping to loop
Next step is going to be to calculate the pagination batch sizes and loop through the available data

In [6]:
with open(json_file, 'r') as f:
    previous_results = json.load(f)
    
n_results = len(previous_results)
print(f'{n_results} previous results were found in JSON file')

962 previous results were found in JSON file


In [7]:
# running the first call to get the total results and the total per page
results = yelpcall.search_query(location = location, term = term)
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
# Total
total_results = results['total']

# Total per page
total_per_page = len(results['businesses'])

# Number of loops needed
n_loops = math.ceil((total_results - n_results)/total_per_page)

print(f'{total_results} total results')
print(f'{total_per_page} total results per page')
print(f'{n_loops} loops needed')

962 total results
20 total results per page
0 loops needed


# Constructing the loop
Now to create the loop, but first lets add the first results to the json file

In [9]:
# Extending previous results
previous_results.extend(results['businesses'])

# Saving
with open(json_file, 'w') as f:
    json.dump(previous_results, f)

In [10]:
# Using tqdm to create a progress bar
for i in tqdm_notebook(range(1, n_loops + 1)):
    # Retrieving the previous results
    with open(json_file, 'r') as f:
        previous_results = json.load(f)
        
    # Calculating the length of previous results to use as an offset
    n_results = len(previous_results)
    
    # API call, retrieving a JSON string
    results = yelpcall.search_query(location = location, term = term, offset = n_results)
    
    # Extending previous results
    previous_results.extend(results['businesses'])
    
    # Saving
    with open(json_file, 'w') as f:
        json.dump(obj = previous_results, fp = f)
        
    # 200 ms Pause
    time.sleep(.2)
    

0it [00:00, ?it/s]

# Converting to dataframe

In [11]:
df = pd.read_json(json_file)
df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,UXHxLN3DcDGI57uDIfCuJA,olds-havana-cuban-bar-and-cocina-miami,Old's Havana Cuban Bar & Cocina,https://s3-media4.fl.yelpcdn.com/bphoto/OyMD-x...,False,https://www.yelp.com/biz/olds-havana-cuban-bar...,2161,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,"{'latitude': 25.7655942148975, 'longitude': -8...","[pickup, delivery]",$$,"{'address1': '1442 SW 8th St', 'address2': '',...",17865182196,(786) 518-2196,2093.858301
1,hZm7TunlrksQbgS0ssXbUg,versailles-miami-4,Versailles,https://s3-media1.fl.yelpcdn.com/bphoto/VaSpiR...,False,https://www.yelp.com/biz/versailles-miami-4?ad...,6151,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.0,"{'latitude': 25.765039080853928, 'longitude': ...",[],$$,"{'address1': '3555 SW 8th St', 'address2': '',...",13054440240,(305) 444-0240,3127.739196
2,zGf5kkYEU01NQ04DRYVPNQ,doce-provisions-miami,Doce Provisions,https://s3-media3.fl.yelpcdn.com/bphoto/spQ-Zr...,False,https://www.yelp.com/biz/doce-provisions-miami...,929,"[{'alias': 'gastropubs', 'title': 'Gastropubs'...",4.5,"{'latitude': 25.767862, 'longitude': -80.214191}","[pickup, delivery]",$$,"{'address1': '541 SW 12th Ave', 'address2': No...",17864520161,(786) 452-0161,2162.106175
3,nTsEr_CQqsA8zwLV4kT5nA,cafe-la-trova-miami,Cafe La Trova,https://s3-media3.fl.yelpcdn.com/bphoto/CJenu6...,False,https://www.yelp.com/biz/cafe-la-trova-miami?a...,891,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.5,"{'latitude': 25.7662659, 'longitude': -80.21063}","[pickup, delivery]",$$,"{'address1': '971 SW 8th St', 'address2': '', ...",17866154379,(786) 615-4379,2534.517384
4,gmDna9f57vWTlSrdrEuVEg,sanguich-de-miami-miami-2,Sanguich De Miami,https://s3-media2.fl.yelpcdn.com/bphoto/PUj9xl...,False,https://www.yelp.com/biz/sanguich-de-miami-mia...,1034,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,"{'latitude': 25.7656049, 'longitude': -80.2285...",[delivery],$$,"{'address1': '2057 SW 8th St', 'address2': '',...",13055390969,(305) 539-0969,1859.388202


In [12]:
df.tail()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
977,J2vJdu0YSTIhS3Bet39lOw,ball-and-chain-miami-2,Ball & Chain,https://s3-media3.fl.yelpcdn.com/bphoto/ljhxvf...,False,https://www.yelp.com/biz/ball-and-chain-miami-...,1211,"[{'alias': 'musicvenues', 'title': 'Music Venu...",4.0,"{'latitude': 25.765829939936697, 'longitude': ...","[delivery, pickup]",$$,"{'address1': '1513 SW 8th St', 'address2': Non...",13056437820,(305) 643-7820,2047.016043
978,sxEALZeo0_Y4qogJ8FeD2Q,soriano-brothers-cuban-cuisine-miami-4,Soriano Brothers Cuban Cuisine,https://s3-media1.fl.yelpcdn.com/bphoto/9CCo5c...,False,https://www.yelp.com/biz/soriano-brothers-cuba...,11,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.5,"{'latitude': 25.602153896868664, 'longitude': ...",[],,"{'address1': '18005 S Dixie Hwy', 'address2': ...",13054899987,(305) 489-9987,23542.960597
979,YpS6LPBzkk0jIFKauzc0AQ,el-palacio-de-los-jugos-miami-6,El Palacio De Los Jugos,https://s3-media2.fl.yelpcdn.com/bphoto/7AYijA...,False,https://www.yelp.com/biz/el-palacio-de-los-jug...,161,"[{'alias': 'latin', 'title': 'Latin American'}...",4.0,"{'latitude': 25.7950191497803, 'longitude': -8...",[delivery],$$,"{'address1': '2038 NW 27th Ave', 'address2': '...",13056360832,(305) 636-0832,1816.324371
980,83bT4JrOTR6gU8693DXwaQ,marabu-restaurant-miami,Marabu Restaurant,https://s3-media3.fl.yelpcdn.com/bphoto/rfE4a5...,False,https://www.yelp.com/biz/marabu-restaurant-mia...,268,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.0,"{'latitude': 25.76731, 'longitude': -80.1932}","[delivery, pickup]",$$$,"{'address1': '701 S Miami Ave', 'address2': 'F...",17865988012,(786) 598-8012,3900.838115
981,Zu-DqCBsPeGDKNFYEZ8NAg,la-cañita-miami,La Cañita,https://s3-media2.fl.yelpcdn.com/bphoto/Hg82tM...,False,https://www.yelp.com/biz/la-ca%C3%B1ita-miami?...,173,"[{'alias': 'cuban', 'title': 'Cuban'}, {'alias...",4.0,"{'latitude': 25.778931, 'longitude': -80.18819...",[],$$,"{'address1': '401 Biscayne Blvd', 'address2': ...",17864838861,(786) 483-8861,4040.781307


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982 entries, 0 to 981
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             982 non-null    object 
 1   alias          982 non-null    object 
 2   name           982 non-null    object 
 3   image_url      982 non-null    object 
 4   is_closed      982 non-null    bool   
 5   url            982 non-null    object 
 6   review_count   982 non-null    int64  
 7   categories     982 non-null    object 
 8   rating         982 non-null    float64
 9   coordinates    982 non-null    object 
 10  transactions   982 non-null    object 
 11  price          728 non-null    object 
 12  location       982 non-null    object 
 13  phone          982 non-null    object 
 14  display_phone  982 non-null    object 
 15  distance       982 non-null    float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 116.2+ KB
