## Stage two: Gathering Additional Data from Yelp API

In [1]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
import time

In [2]:
#read in our five sets of sample data

#ADAM LUKE:  strive for DRY coding - "don't repeat yourself" :)
#this is what I would do here: 

#num_files = 5 
#file_list = [] 

#for i in range(num_files): 
    #f_name = '250_sample_' + str(i + 1) + '.csv'
    #file_list.append(pd.read_csv(f_name))
    
#file_list[num_files-1].head()
sample1 = pd.read_csv('250_sample_1.csv')
sample2 = pd.read_csv('250_sample_2.csv')
sample3 = pd.read_csv('250_sample_3.csv')
sample4 = pd.read_csv('250_sample_4.csv')
sample5 = pd.read_csv('250_sample_5.csv')
sample5.head()

Unnamed: 0,zip_code,total,non_hispanic_tot,Not Hispanic or Latino householder: - Householder who is White alone,Not Hispanic or Latino householder: - Householder who is Black or African American alone,Not Hispanic or Latino householder: - Householder who is American Indian and Alaska Native alone,Not Hispanic or Latino householder: - Householder who is Asian alone,Not Hispanic or Latino householder: - Householder who is Native Hawaiian and Other Pacific Islander alone,Not Hispanic or Latino householder: - Householder who is Some other race alone,Not Hispanic or Latino householder: - Householder who is Two or more races,...,Id_zip,per_capita_income_2017_estimate,per_capita_income_2017_margin,white_tot,black_tot,indian_tot,asian_tot,native_tot,other_tot,mixed_tot
0,21075,7299,7173,5974,718,26,353,2,7,93,...,8600000US21075,41396.0,1900.0,7299,6041,722,26,356,2,45
1,39341,2880,2859,963,1876,7,2,0,2,9,...,8600000US39341,14771.0,1914.0,2880,970,1885,7,2,0,7
2,55092,3163,3150,3111,7,10,7,0,0,15,...,8600000US55092,34118.0,2206.0,3163,3119,7,11,7,0,3
3,32960,9312,8927,8339,395,17,85,2,4,85,...,8600000US32960,26184.0,3657.0,9312,8605,407,17,86,2,96
4,71129,5057,4963,3140,1729,26,26,1,3,38,...,8600000US71129,24079.0,3216.0,5057,3198,1733,27,27,1,24


In [3]:
#define a list containing all samples so that we can iterate through each file
#this was probably more complicated than it needed to be, and we could have just used one larger sample file

#ADAM LUKE: see block 2, can create this list when reading the data in
allsamples = [sample1, sample2, sample3, sample4, sample5]

In [4]:
#add back the trailing 0's again 
# when Pandas reads in the csv file, it reads the zip as type INT and removes the leading zeroes
# if we do not have the leading zeroes, the yelp search will be incorrect
for sample_data in allsamples:
    sample_data['zip_code'] = sample_data['zip_code'].astype('str')
    fun = lambda x : '0'+ str(x) if len(x) == 4 else ('00'+ str(x) if len(x) == 3 else str(x))
    sample_data['zip_code'] = sample_data['zip_code'].apply(fun)

In [5]:
#Add new Columns to the dataframe. These will be populated with data from each query
# set initial value to zero for all columns

for sample_data in allsamples:
    sample_data['chinese_count'] = 0
    sample_data['ch_tot_review_ct'] = 0
    sample_data['ch_avg_review_ct'] = 0
    sample_data['ch_avg_rating'] = 0
    sample_data['mexican_count'] = 0
    sample_data['mx_tot_review_ct'] = 0
    sample_data['mx_avg_review_ct'] = 0
    sample_data['mx_avg_rating'] = 0    
    sample_data['fastfood_count'] = 0
    sample_data['ff_tot_review_ct'] = 0
    sample_data['ff_avg_review_ct'] = 0
    sample_data['ff_avg_rating'] = 0
    
#ADAM LUKE: Could do this with a nested loop: 

#new_column_list = ['chinese_count','ch_tot_review_ct','ch_avg_review_ct','ch_avg_rating',
                   #'mexican_count','mx_tot_review_ct', 'mx_avg_review_ct', 'mx_avg_rating',
                   #'fastfood_count','ff_tot_review_ct','ff_avg_review_ct', 'ff_avg_rating']

#for sample_data in allsamples: 
#    for column in new_column_list: 
#        sample_data[column] = 0


In [6]:
#Define parameters for API call
# we decided on a radius of 3,000 which is just under two miles
# to get this distance we did some tests to see how many results we would get
from config import api_key
base_url = 'https://api.yelp.com/v3/businesses/search?'
head = {'Authorization': 'Bearer '+ api_key}
params = {
    'categories': '',
    'limit': 50, #max limit is 50
    'radius': 3000
       }

## THE MEGA LOOP
- We created a set of nested 'for' loops to return all of the data we queried

In [8]:
#manually change this parameter for each of the three queries
# we could have made that another loop but we didn't want to blow up our API limits if something went wrong
params['categories'] = 'mexican'


#Use this line if you just want to do for one of the samples. comment out if you want to use the loop
#sample_data = sample1

#using the time library to track how long the query takes
t0 = time.time()

#now begins the mega loop
for sample_data in allsamples:
    #define 'i' variable to count iterations
    i = 1
    #using pandas function 'iterrows', we iterate through each row, passing the index and the row values
    for index,row in sample_data.iterrows():
        #since we know our sample size is 250, we added this line to avoid an endless loop going over our API limit
        
#ADAM LUKE:  hmm. the while loop here is weird. if you want to make sure you are not going over 250 calls per 
# sample, why not just: 
        
        #if i <= 250: 
            #make api calls and store data 
            #i += 1
        #else: 
            #break
            
#this logic limits the number of api calls to 250 per sample data.  It will move on to the next sample in allsamples 
#if for some reason one of them contains more than 250 rows.  Putting API calls inside a while loop can be risky

            
        while i <= 250:
            i +=1
            #change the location parameter to the zip code of the current row
            params['location'] = row['zip_code']
            #get the JSON output of the API call
            
#ADAM LUKE:  API requests should always be in try/except loops in order to except situations when their server goes 
             #down for maitenance or whatever
    
            output = requests.get(base_url, params, headers=head).json()
            #reset variables to zero
            tot_review_count = 0
            tot_rating = 0
            tot_count = 0
            #total up the results of review count and rating from the first page of results
            for restaurant in output['businesses']:
                #Yelp includes a handy 'total' value in the results
                #this indicates the total number of restaurants that match the parameters
                tot_count = output['total']
                tot_review_count += restaurant['review_count']
                tot_rating += restaurant['rating']
            #if the results span more than one page, we need to iterate through each page to tabulate the results
            if tot_count <= params['limit']:
                    #if the total results is less than the page limit, we simply pass this part of the loop
                    pass
            else:
                #reset the 'num' variable to the number of the results per page
                num = params['limit']
                while num < tot_count:
                    #the 'offset' parameter allows us to show, e.g. 'next 50'
                    params['offest'] = num
                    #perform another API call for the next 50 results
                    output = requests.get(base_url, params, headers=head).json()
                    for restaurant in output['businesses']:
                        tot_review_count += restaurant['review_count']
                        tot_rating += restaurant['rating']
                    num += params['limit']
            #using a try/except block because some of the values we are calculating an average review
            # this will sometimes pass a divide by zero error 
            try:
                sample_data.loc[index,'mexican_count'] = output['total']
                sample_data.loc[index,'mx_tot_review_ct'] = tot_review_count
                sample_data.loc[index,'mx_avg_review_ct'] = round(tot_review_count/tot_count,2)
                sample_data.loc[index,'mx_avg_rating'] = round(tot_rating/tot_count,2)
            except ZeroDivisionError:
                #print a line any time there is a divbyzero just to keep track of progress
                print(f'iteration {i} division by zero')
            #break applies to the while i <= 250 above    
            break

#print the total elapsed time for each 250 sample
print(f'run time = {time.time() - t0}')

iteration 2 division by zero
run time = 3.3680191040039062


### Repeat the analysis for the two other restaurant types

In [56]:
#Repeating yourself here makes sense if you want to make sure everything is working before moving on 
# to the next category.  Another thing you could do is save you dataframe after each category in case you get an 
#error.  That way you wouldn't have to start over if something goes wrong


params['categories'] = 'chinese'
#Use this line if you just want to do for one of the samples. comment out if you want to use the loop
#sample_data = sample1


for sample_data in allsamples:
    i = 1
    for index,row in sample_data.iterrows():
        while i <= 250:
            i +=1
            params['location'] = row['zip_code']
            output = requests.get(base_url, params, headers=head).json()
            tot_review_count = 0
            tot_rating = 0
            tot_count = 0
            for restaurant in output['businesses']:
                tot_count = output['total']
                tot_review_count += restaurant['review_count']
                tot_rating += restaurant['rating']
            if tot_count <= params['limit']:
                    pass
            else:
                num = params['limit']
                while num < tot_count:
                    params['offest'] = num
                    output = requests.get(base_url, params, headers=head).json()
                    for restaurant in output['businesses']:
                        tot_review_count += restaurant['review_count']
                        tot_rating += restaurant['rating']
                    num += params['limit']
            try:
                sample_data.loc[index,'chinese_count'] = output['total']
                sample_data.loc[index,'ch_tot_review_ct'] = tot_review_count
                sample_data.loc[index,'ch_avg_review_ct'] = round(tot_review_count/tot_count,2)
                sample_data.loc[index,'ch_avg_rating'] = round(tot_rating/tot_count,2)
            except ZeroDivisionError:
                print('division by zero')
            break

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

In [73]:
#'hotdogs' is the yelp category name for Fast Food
params['categories'] = 'hotdogs'
#Use this line if you just want to do for one of the samples. comment out if you want to use the loop
#sample_data = sample1

t0 = time.time()

for sample_data in allsamples:
    i = 1
    for index,row in sample_data.iterrows():
        while i <= 250:
            i +=1
            params['location'] = row['zip_code']
            output = requests.get(base_url, params, headers=head).json()
            tot_review_count = 0
            tot_rating = 0
            tot_count = 0
            for restaurant in output['businesses']:
                tot_count = output['total']
                tot_review_count += restaurant['review_count']
                tot_rating += restaurant['rating']
            if tot_count <= params['limit']:
                    pass
            else:
                num = params['limit']
                while num < tot_count:
                    params['offest'] = num
                    output = requests.get(base_url, params, headers=head).json()
                    for restaurant in output['businesses']:
                        tot_review_count += restaurant['review_count']
                        tot_rating += restaurant['rating']
                    num += params['limit']
            try:
                sample_data.loc[index,'fastfood_count'] = output['total']
                sample_data.loc[index,'ff_tot_review_ct'] = tot_review_count
                sample_data.loc[index,'ff_avg_review_ct'] = round(tot_review_count/tot_count,2)
                sample_data.loc[index,'ff_avg_rating'] = round(tot_rating/tot_count,2)
            except ZeroDivisionError:
                pass
            break
    print(f'time elapsed = {time.time() - t0}')
            

print(f'Total run time = {time.time() - t0}')

time elapsed = 140.72525906562805
time elapsed = 283.5190510749817
time elapsed = 421.7944321632385
time elapsed = 558.2060701847076
time elapsed = 692.1128251552582
Total run time = 692.1129972934723


In [75]:
sample4.head(10)

Unnamed: 0,zip_code,total,non_hispanic_tot,Not Hispanic or Latino householder: - Householder who is White alone,Not Hispanic or Latino householder: - Householder who is Black or African American alone,Not Hispanic or Latino householder: - Householder who is American Indian and Alaska Native alone,Not Hispanic or Latino householder: - Householder who is Asian alone,Not Hispanic or Latino householder: - Householder who is Native Hawaiian and Other Pacific Islander alone,Not Hispanic or Latino householder: - Householder who is Some other race alone,Not Hispanic or Latino householder: - Householder who is Two or more races,...,ch_avg_review_ct,ch_avg_rating,mexican_count,mx_tot_review_ct,mx_avg_review_ct,mx_avg_rating,fastfood_count,ff_tot_review_ct,ff_avg_review_ct,ff_avg_rating
0,58078,5813,5754,5650,16,45,10,2,3,28,...,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0
1,78602,6279,5572,4805,633,36,36,1,3,58,...,54.0,2.5,14,365,26.07,3.21,12,195,16.25,2.46
2,29180,5521,5470,2509,2929,2,9,0,2,19,...,4.0,3.5,1,2,2.0,2.5,1,1,1.0,1.0
3,70062,6604,6171,3307,2706,28,57,1,6,66,...,34.75,3.12,15,678,45.2,3.03,25,345,13.8,2.26
4,62208,5939,5856,4824,896,9,71,0,2,54,...,28.2,3.4,5,209,41.8,3.6,17,300,17.65,3.0
5,11238,21104,19154,3813,13814,47,714,17,112,637,...,109.86,4.38,96,24798,258.31,4.06,74,3152,42.59,3.93
6,60645,16136,14350,10066,1638,27,1974,8,77,560,...,87.07,3.1,44,2869,65.2,3.69,40,1234,30.85,2.75
7,29902,9098,8847,6733,1935,21,70,10,3,75,...,0.0,0.0,1,36,36.0,4.0,0,0,0.0,0.0
8,78758,18024,13844,10136,1992,38,1338,8,33,299,...,271.71,3.61,63,13992,222.1,6.13,29,1236,42.62,2.79
9,44136,16209,16075,15329,207,6,429,1,1,102,...,32.67,3.0,7,179,25.57,3.14,15,234,15.6,3.07


In [74]:
#save the files with the appended columns back to .csv
sample1.to_csv('250_sample_1_Results.csv', index=False)
sample2.to_csv('250_sample_2_Results.csv', index=False)
sample3.to_csv('250_sample_3_Results.csv', index=False)
sample4.to_csv('250_sample_4_Results.csv', index=False)
sample5.to_csv('250_sample_5_Results.csv', index=False)