In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import math
import time
from yelpapi import YelpAPI as YALP
from tqdm.notebook import tqdm_notebook as tqn

In [2]:
# Check if required packages are installed, if not, install them
!pip install yelpapi
!pip install tqdm

print("Imports done")

Imports done


In [3]:
# Load API credentials from a JSON file kept outside the repository.
# The location can be overridden with the YELP_API_CREDENTIALS environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original path is used, so existing setups keep working.
credentials_path = os.environ.get(
    "YELP_API_CREDENTIALS", "/Users/casta/.secret/yelp_api.json"
)
with open(credentials_path, "r") as f:
    login = json.load(f)
# Only a confirmation is printed; the credential values are never echoed.
print("API credentials loaded")

API credentials loaded


In [4]:
# Build the Yelp client from the stored key; each request times out after 5 s
yelp = YALP(
    login['api-key'],
    timeout_s=5.0,
)
print("YelpAPI object created")

# Search parameters; `loc` is the text before the first comma of `location`
# and is used later when naming the output file
term = "pizza"
location = "Seattle, WA 98122"
loc = location.partition(",")[0]
print("Location:", loc)

YelpAPI object created
Location: Seattle


In [5]:
# Output directory for the collected results (created on first use)
folder = "Data/"
os.makedirs(folder, exist_ok=True)

# One JSON file per location/term pair
JSON_FILE = f"{folder}{loc}-{term}.json"
print("Directory file created:", JSON_FILE)

# Make sure the file exists: keep it if it is already there, otherwise
# seed it with an empty JSON list so it can be loaded unconditionally later
if os.path.isfile(JSON_FILE):
    print(f"[i] {JSON_FILE} already exists.")
else:
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    with open(JSON_FILE, "w") as out_file:
        json.dump([], out_file)

Directory file created: Data/Seattle-pizza.json
[i] Data/Seattle-pizza.json already exists.


In [6]:
# Load whatever an earlier run already saved so new results are added to it.
with open(JSON_FILE, "r") as f:
    data = json.load(f)
n_previous = len(data)
# The page count is not reported here: the page size is only known once the
# API has answered (it is measured from the first response below).
print(f"Loaded {n_previous} previously saved results from {JSON_FILE}")

# First API call: gives the total number of matches and the first page of results.
results = yelp.search_query(term=term, location=location)
total_results = results["total"]
first_page = results["businesses"]
results_per_page = len(first_page)
# Guard against an empty first page, which would otherwise divide by zero.
n_pages = math.ceil(total_results / results_per_page) if results_per_page else 0

# Keep the first page. The paging loop in the next cell starts at offset
# 1 * results_per_page, so without this step these businesses would never be saved.
# Businesses already present in the saved data (matched on their "id") are skipped
# so re-running this cell does not duplicate them.
# Assumes each saved/returned business is a dict carrying an "id" key, as in the
# Yelp search response; entries without an id are always kept.
known_ids = {biz["id"] for biz in data if "id" in biz}
for biz in first_page:
    biz_id = biz.get("id")
    if biz_id is None or biz_id not in known_ids:
        data.append(biz)
        known_ids.add(biz_id)

Loaded 0 results from 0 pages


In [7]:
# Fetch the remaining pages (page 0 was requested in the previous cell) and add
# their businesses to `data`.
# Businesses whose "id" is already in `data` are skipped, so re-running the
# notebook against an existing JSON file does not append the same results twice.
# Assumes each business is a dict with an "id" key; entries without one are kept.
known_ids = {biz["id"] for biz in data if "id" in biz}
try:
    for page in range(1, n_pages):
        time.sleep(1)  # Pause to avoid overwhelming the server
        results = yelp.search_query(
            term=term, location=location, offset=page * results_per_page
        )
        for biz in results["businesses"]:
            biz_id = biz.get("id")
            if biz_id is None or biz_id not in known_ids:
                data.append(biz)
                known_ids.add(biz_id)
        print(f"Page {page + 1} - Extracted {len(results['businesses'])} results")
finally:
    # Save in `finally` so pages collected before a failed request (timeout,
    # API error) are still written to disk; the exception itself still propagates.
    with open(JSON_FILE, "w") as f:
        json.dump(data, f)
    print(f"Total of {len(data)} results saved to {JSON_FILE}")

# Compile the results into a dataframe
df = pd.DataFrame(data)
print("Dataframe created:")
# Last expression of the cell: rendered with the notebook's rich table display
df.head()

Page 2 - Extracted 20 results
Page 3 - Extracted 20 results
Page 4 - Extracted 20 results
Page 5 - Extracted 20 results
Page 6 - Extracted 20 results
Page 7 - Extracted 20 results
Page 8 - Extracted 20 results
Page 9 - Extracted 20 results
Page 10 - Extracted 20 results
Page 11 - Extracted 20 results
Page 12 - Extracted 20 results
Page 13 - Extracted 20 results
Page 14 - Extracted 20 results
Page 15 - Extracted 20 results
Page 16 - Extracted 20 results
Page 17 - Extracted 20 results
Page 18 - Extracted 20 results
Page 19 - Extracted 20 results
Page 20 - Extracted 20 results
Page 21 - Extracted 20 results
Page 22 - Extracted 20 results
Page 23 - Extracted 20 results
Page 24 - Extracted 20 results
Page 25 - Extracted 20 results
Page 26 - Extracted 20 results
Page 27 - Extracted 20 results
Page 28 - Extracted 20 results
Page 29 - Extracted 20 results
Page 30 - Extracted 20 results
Page 31 - Extracted 20 results
Page 32 - Extracted 20 results
Page 33 - Extracted 20 results
Page 34 - Extrac