# Yelp API Query

Input requirements: a Yelp API developer key and a list of zip codes to query.

Output files: CSV query results (create a folder dedicated to storing the .csv files).

In [1]:
import requests
import json
import numpy as np
import pandas as pd

_YELP_BUSINESSES_URL = 'https://api.yelp.com/v3/businesses'
_PAGE_SIZE = 50          # value sent as the search `limit`; also the page stride
_REQUEST_TIMEOUT = 30    # seconds; keeps one stalled connection from hanging a whole run
_NO_REVIEW = 'No Review'


def _resolve_api_key(api_key):
    """Return the API key from the argument, a global ``api_key``, or $YELP_API_KEY."""
    import os

    key = api_key or globals().get('api_key') or os.environ.get('YELP_API_KEY')
    if not key:
        raise ValueError(
            "No Yelp API key: pass api_key=..., define a global 'api_key', "
            "or set the YELP_API_KEY environment variable")
    return key


def _review_texts(payload, count=3):
    """Return exactly ``count`` review texts from a reviews payload, padded with 'No Review'."""
    reviews = payload.get('reviews') if isinstance(payload, dict) else None
    texts = []
    for position in range(count):
        try:
            text = reviews[position]['text']
        except (TypeError, IndexError, KeyError):
            # Missing payload, fewer than `count` reviews, or a review without text.
            text = None
        # None and '' count as "no review" so every business gets `count` entries.
        texts.append(text if text else _NO_REVIEW)
    return texts


def yelp_api_request(term, location, query_number, api_key=None, output_dir=None):
    """Query one page of Yelp search results and save it, with hours and reviews, as CSV.

    The search page is fetched first; then, for every business on it, the
    business-details endpoint (for ``hours``) and the reviews endpoint (for
    up to three review texts) are queried by business ID. The combined table
    is written to ``<term>_<location>_<query_number>_yelp.csv``.

    Args:
        term: search term, e.g. 'nightlife' (string).
        location: 'zipcode' or 'neighborhood' (string).
        query_number: round of query (1, 2, 3, ...); determines the offset
            as ``(query_number - 1) * 50`` (integer).
        api_key: Yelp API key. When omitted, a module-level ``api_key``
            variable is used, then the ``YELP_API_KEY`` environment variable.
        output_dir: folder that receives the .csv file. When omitted, the
            legacy path is kept: the file name prefixed with a single space
            (a leftover placeholder for the folder), relative to the working
            directory.

    Returns:
        None. The result is the CSV file written to disk.

    Raises:
        ValueError: if ``query_number`` is smaller than 1 or no API key is found.
        KeyError: if the search response holds no businesses (an error
            response or an exhausted result list); callers use this to detect
            that a location has no further pages.
    """
    import os

    if query_number < 1:
        raise ValueError('query_number must be >= 1, got {!r}'.format(query_number))

    headers = {'Authorization': 'Bearer %s' % _resolve_api_key(api_key)}

    # `offset` is the number of results to skip, so page 1 starts at 0; the
    # earlier `+ 1` silently dropped the top-ranked business.
    # NOTE(review): Yelp appears to cap offset + limit, which would explain why
    # page 20 failed with the old offset (951 + 50) - confirm against the API docs.
    offset = (query_number - 1) * _PAGE_SIZE
    params = {'term': term, 'location': location, 'limit': _PAGE_SIZE, 'offset': offset}
    req = requests.get(_YELP_BUSINESSES_URL + '/search', params=params,
                       headers=headers, timeout=_REQUEST_TIMEOUT)
    json_data = req.json()

    businesses = json_data.get('businesses')
    if not businesses:
        raise KeyError('no businesses returned for {!r}, page {} (API error: {!r})'.format(
            location, query_number, json_data.get('error')))

    df = pd.DataFrame(businesses)
    business_id = list(df['id'])

    # Both business hours and reviews are queried by business ID. Rows are
    # collected first and turned into a frame once, which avoids chained
    # `.loc[...][...] = value` writes that may land on a temporary copy.
    rows = []
    for id_ in business_id:
        # business hours
        url_hours = '{}/{}'.format(_YELP_BUSINESSES_URL, id_)
        req_hours = requests.get(url_hours, headers=headers, timeout=_REQUEST_TIMEOUT)
        hours = req_hours.json().get('hours')

        # reviews: parse the body once; an unparsable body means "no reviews"
        url_reviews = '{}/{}/reviews'.format(_YELP_BUSINESSES_URL, id_)
        req_reviews = requests.get(url_reviews, headers=headers, timeout=_REQUEST_TIMEOUT)
        try:
            reviews_payload = req_reviews.json()
        except ValueError:
            reviews_payload = {}
        rev1, rev2, rev3 = _review_texts(reviews_payload)

        rows.append({'rev1': rev1, 'rev2': rev2, 'rev3': rev3, 'hours': hours})

    new = pd.DataFrame(rows, index=business_id, columns=['rev1', 'rev2', 'rev3', 'hours'])

    # merge business info and reviews dataframe on business ID
    df_merged = df.merge(new, left_on='id', right_index=True)

    file_name = '{}_{}_{}_yelp.csv'.format(term, location, query_number)
    if output_dir is None:
        csv_path = ' ' + file_name  # legacy default, kept so existing paths still match
    else:
        csv_path = os.path.join(output_dir, file_name)

    df_merged.to_csv(csv_path)

### Notes - Query Procedure:

Each Yelp API key retrieves a maximum of 1,000 entries per day. Thus, multiple API keys must be used and/or querying must be spaced out across several days. 

Each Yelp API search request returns at most 50 results (the `limit` used above), so each zip code requires several requests ("pages").

To reduce the number of queries performed in one run, copy a smaller subset of 'all_zipcodes' into 'zipcode_list' and run the cell once per subset.

An error message giving the zip code and query 'page' number is printed once there are no remaining venues to query for a specific zip code. After that, querying moves on to the next zip code listed in 'zipcode_list'.


In [7]:
# Every zip code of interest. Copy a manageable subset into `zipcode_list`
# before running the loop below.
all_zipcodes = [
    '11101', '11102', '11103', '11105', '11106', '11104', '11109', '11120', '11206', '11211',
    '11249', '11207', '11221', '11237', '10026', '10027', '10037', '10030', '10039', '10001', '10011',
    '10018', '10019', '10020', '10036', '10002', '10034', '10040', '10454', '10455', '10459', '10474', '11205',
    '11216', '11233', '11238', '11385', '11386', '10109', '11373', '11379', '11372', '11354', '11355', '11358', '11222',
]

zipcode_list = []  # subset of zip codes above

for zc in zipcode_list:
    # Pages 1..20 of 50 results each, i.e. up to 1,000 venues per zip code.
    for i in range(1, 21):
        try:
            yelp_api_request('nightlife', zc, i)
        except Exception as exc:
            # `Exception` rather than a bare `except:` so that interrupting a
            # long run (KeyboardInterrupt) still stops it.
            print("error - zip: {}, page number: {}".format(zc, i))
            print("    reason: {!r}".format(exc))
            # A failing page means this zip code has no further venues to
            # fetch (or the request cannot succeed), so stop paging it and
            # move on to the next zip code instead of spending more requests.
            break

error - zip: 11222, page number: 20


In [3]:
# To query one page of one individual zip code (instead of several), run this cell instead.
# The call below is a template, not runnable as written: replace `ZIPCODE (string)`
# with a quoted zip code (e.g. '11101') and `PAGENUMBER (integer)` with the round
# of query (1, 2, 3, ...) before running the cell.

yelp_api_request('nightlife', ZIPCODE (string), PAGENUMBER (integer))