# Scraping using Yelp API

- Import the required libraries

In [2]:
import concurrent.futures
import datetime
import json
import os
from urllib.parse import urlencode

import pandas as pd
import requests

- Load the list of Yelp categories stored in `categories.json` (the category list published by Yelp)

In [3]:
# Load the Yelp category list from the local JSON file.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so that
# decoding does not depend on the platform's default locale encoding.
with open("categories.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

In [4]:
# List of available cuisines on Yelp: the title of every category entry
# that names 'restaurants' among its parents.
cat_list = [
    entry['title']
    for entry in categories
    if 'restaurants' in entry['parents']
]

In [5]:
# Show the restaurant cuisine titles gathered in cat_list.
print(cat_list)

['Afghan', 'African', 'Andalusian', 'Arabic', 'Argentine', 'Armenian', 'Asian Fusion', 'Asturian', 'Australian', 'Austrian', 'Baguettes', 'Bangladeshi', 'Basque', 'Bavarian', 'Barbeque', 'Beer Garden', 'Beer Hall', 'Beisl', 'Belgian', 'Bistros', 'Black Sea', 'Brasseries', 'Brazilian', 'Breakfast & Brunch', 'British', 'Buffets', 'Bulgarian', 'Burgers', 'Burmese', 'Cafes', 'Cafeteria', 'Cajun/Creole', 'Cambodian', 'Canteen', 'Caribbean', 'Catalan', 'Cheesesteaks', 'Chicken Wings', 'Chicken Shop', 'Chilean', 'Chinese', 'Comfort Food', 'Corsican', 'Creperies', 'Cuban', 'Curry Sausage', 'Cypriot', 'Czech', 'Czech/Slovakian', 'Danish', 'Delis', 'Diners', 'Dinner Theater', 'Dumplings', 'Eastern European', 'Parent Cafes', 'Eritrean', 'Ethiopian', 'Filipino', 'Fischbroetchen', 'Fish & Chips', 'Flatbread', 'Fondue', 'Food Court', 'Food Stands', 'Freiduria', 'French', 'Galician', 'Game Meat', 'Gastropubs', 'Georgian', 'German', 'Giblets', 'Gluten-Free', 'Greek', 'Guamanian', 'Halal', 'Hawaiian', 

In [202]:
# Build one search URL per (cuisine, page offset) pair.
#
# The query string is produced with urlencode() rather than by interpolating
# the cuisine title directly: titles such as "Breakfast & Brunch" or
# "Cajun/Creole" contain characters ('&', '/', spaces) that would otherwise
# corrupt the query (a raw '&' ends the `term` parameter early).
SEARCH_ENDPOINT = "https://api.yelp.com/v3/businesses/search"
PAGE_SIZE = 50       # results requested per call (the `limit` parameter)
MAX_RESULTS = 1000   # offsets run 0, 50, ..., 950 for each cuisine

url_list = [] # create urls to scrape
for cuisine in cat_list:
    for offset in range(0, MAX_RESULTS, PAGE_SIZE):
        query = urlencode({
            "location": "manhattan",
            "term": f"{cuisine} Restaurant",
            "limit": PAGE_SIZE,
            "offset": offset,
        })
        url_list.append(f"{SEARCH_ENDPOINT}?{query}")

- Helper function that fetches restaurant data from a Yelp search URL

In [214]:
def get_content(url):
    """Fetch one Yelp search URL and collect the businesses it returns.

    Every business in the response is appended to the module-level ``data``
    list (which must exist before this function is called); the same
    businesses are also returned so callers may use the result directly.

    The API key is read from the ``YELP_API_KEY`` environment variable
    instead of being embedded in the notebook.
    NOTE(review): the token that was previously committed here should be
    treated as compromised and rotated.

    Parameters
    ----------
    url : str
        Fully formed Yelp search URL.

    Returns
    -------
    list
        The entries under the response's "businesses" key, or an empty list
        when the request fails or does not return HTTP 200.

    Raises
    ------
    RuntimeError
        If ``YELP_API_KEY`` is not set.
    """
    api_key = os.environ.get("YELP_API_KEY")
    if not api_key:
        raise RuntimeError(
            "YELP_API_KEY environment variable is not set; "
            "export your Yelp API key before scraping."
        )
    header = {'Authorization': f'Bearer {api_key}'}

    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, headers=header, timeout=30)
    except requests.RequestException as exc:
        # Best-effort scrape: report the failed page and move on.
        print(f"get_content: request failed for {url}: {exc}")
        return []

    if response.status_code != 200:
        # Make dropped pages visible instead of silently losing them.
        print(f"get_content: HTTP {response.status_code} for {url}")
        return []

    businesses = response.json().get("businesses", [])
    data.extend(businesses)
    return businesses

In [217]:
data = [] # data stores scraped data
# Fetch all URLs concurrently with a pool of worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map() returns a lazy iterator; an exception raised inside a
    # worker is only re-raised when its result is retrieved. Draining the
    # iterator therefore surfaces worker failures instead of discarding them.
    for _ in executor.map(get_content, url_list):
        pass

- Finally, store the scraped data in JSON format

In [218]:
# Write the collected records to disk as a single JSON document.
with open('test_scrape.json', 'w') as out_file:
    json.dump(data, fp=out_file)

## Data Cleaning

In [254]:
# Read the scraped records back into a DataFrame and report its dimensions.
df = pd.read_json('test_scrape.json')
raw_shape = df.shape
print("Size of dataset with duplicates:", raw_shape)

Size of dataset with duplicates: (48379, 16)


In [256]:
# Keep only the first row seen for each business id, then report the new dimensions.
df = df.drop_duplicates(subset='id', keep='first')
deduped_shape = df.shape
print("Size of dataset after dropping duplicates:", deduped_shape)

Size of dataset after dropping duplicates: (17343, 16)


- Helper Function to collect all cuisine tags associated with a restaurant

In [274]:
def get_cuisinetags(categories:list) -> list:
    """Return the 'title' of each category entry, in the order given.

    Parameters
    ----------
    categories : list
        Iterable of mappings, each providing a 'title' key.

    Returns
    -------
    list
        One title per entry; an empty list when ``categories`` is empty.
    """
    return [entry['title'] for entry in categories]

In [275]:
# Derive a list of cuisine titles for each restaurant from its 'categories' entry.
category_column = df['categories']
df['cuisine_tags'] = category_column.apply(get_cuisinetags)

In [280]:
# Stamp every row with the same capture time (naive local time from datetime.now()).
inserted_at = datetime.datetime.now()
df['insertedAtTimestamp'] = inserted_at

In [281]:
# Restrict the frame to the columns that are exported, in this order.
kept_columns = [
    'id',
    'cuisine_tags',
    'name',
    'location',
    'coordinates',
    'review_count',
    'rating',
    'insertedAtTimestamp',
]
df = df[kept_columns]

In [285]:
# Export the cleaned frame as a JSON array with one object per restaurant.
df.to_json(path_or_buf='Clean_Data.json', orient='records')