In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import math
import time
from yelpapi import YelpAPI as YALP
from tqdm.notebook import tqdm_notebook as tqn

In [2]:
# Check if required packages are installed, if not, install them
!pip install yelpapi
!pip install tqdm

print("Imports done")

Imports done


In [3]:
# Load API Credentials
with open("/Users/casta/.secret/yelp_api.json","r") as f:
    login = json.load(f)
print("API credentials loaded")

API credentials loaded


In [4]:
# Instantiate Yelp API object
yelp = YALP(login['api-key'], timeout_s=5.0)
print("YelpAPI object created")

# Define search terms and file paths
location = "San Francisco, CA"
term = "sushi"
loc = location.split(",")[0]
print("Location:", loc)

YelpAPI object created
Location: San Francisco


In [5]:
# Specify folder for saving data
folder = "Data/"
os.makedirs(folder, exist_ok=True)
JSON_FILE = folder + f"{loc}-{term}.json"
print("Directory file created:", JSON_FILE)

# Check if JSON file exists, if not, create an empty file
if not os.path.isfile(JSON_FILE):
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    with open(JSON_FILE, "w") as f:
        json.dump([], f)
else:
    print(f"[i] {JSON_FILE} already exists.")

Directory file created: Data/San Francisco-sushi.json
[i] Data/San Francisco-sushi.json already exists.


In [6]:
# Load JSON file and account for previous results
with open(JSON_FILE, "r") as f:
    data = json.load(f)
    results_per_page = len(data)
    n_pages = math.ceil(results_per_page / 50) # Assuming 50
    print(f"Loaded {results_per_page} results from {n_pages}")

# API call to get the first page data
results = yelp.search_query(term=term, location=location)
total_results = results["total"]
results_per_page = len(results["businesses"])
n_pages = math.ceil(total_results / results_per_page)

Loaded 0 results from 0


In [9]:
# API call to get the first page data
results = yelp.search_query(term=term, location=location)
total_results = results["total"]
results_per_page = 10
n_pages = math.ceil(total_results / results_per_page)
print(n_pages)

# Loop through remaining pages to extract all results
for page in range(1, 6):
    time.sleep(1)  # Pause to avoid overwhelming the server
    results = yelp.search_query(term=term, location=location, offset=page * results_per_page)
    data.extend(results["businesses"])
    print(f"Page {page + 1} - Extracted {len(results['businesses'])} results")

# Save the updated data to the JSON file
with open(JSON_FILE, "w") as f:
    json.dump(data, f)
print(f"Total of {len(data)} results saved to {JSON_FILE}")

# Compile the results into a dataframe
df = pd.DataFrame(data)
print("Dataframe created:")
print(df.head())

140
Page 2 - Extracted 20 results
Page 3 - Extracted 20 results
Page 4 - Extracted 20 results
Page 5 - Extracted 20 results
Page 6 - Extracted 20 results
Total of 3040 results saved to Data/San Francisco-sushi.json
Dataframe created:
                       id                                   alias  \
0  ba82pLmI2zFk0ayiCQoPJQ                 sushi-uma-san-francisco   
1  YZrl7W2dewBb-T7e-BxAkQ                    nara-san-francisco-5   
2  l69Z6OMNKxrwA69IubLS5g  okaeri-japanese-bistro-san-francisco-3   
3  kjckqShnIlDeraGJM3Nw4w              o-toro-sushi-san-francisco   
4  4GAsfSjaB1XewzS3TKOtTg                       koo-san-francisco   

                     name                                          image_url  \
0               Sushi Uma  https://s3-media2.fl.yelpcdn.com/bphoto/7HcvVu...   
1                    Nara  https://s3-media1.fl.yelpcdn.com/bphoto/bY-Sms...   
2  Okaeri Japanese Bistro  https://s3-media3.fl.yelpcdn.com/bphoto/ma2NK0...   
3            O-Toro Sushi  http