In [68]:
# imports
import pandas as pd
import os # use this to access your environment variables
import requests # this will be used to call the APIs
import json

In [5]:
# Load the station table exported by the city bikes workbook.
# The location can be overridden with the CITY_BIKES_CSV environment variable so the
# notebook also runs on other machines; the original absolute path stays the default.
path = os.environ.get(
    'CITY_BIKES_CSV',
    '/Users/parkerharalds/Documents/Lighthouse/Statistical_Modelling_Project/data/city_bikes.csv',
)
df = pd.read_csv(path)

In [6]:
# Dimensions of the station table as (rows, columns)
df.shape

(258, 7)

In [7]:
# Two-row slice of the station table, used to try out API calls cheaply
df2 = df.iloc[:2]
df2

Unnamed: 0,city,station_id,station_name,latitude,longitude,free_bikes,empty_slots
0,Vancouver,00fa94ad698dc4a9e4d708d6fd32f294,Chilco & Barclay,49.291909,-123.140713,11,7
1,Vancouver,012d3e06901cc222b1c2cf0a2ace3a29,St George & Broadway,49.262321,-123.09306,1,13


# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

**Function to make API calls based on latitude and longitude from city_bikes.csv:**

In [9]:
# Foursquare endpoint and request headers shared by every call below.
# The key is read from the environment so it never appears in the notebook.
url = 'https://api.foursquare.com/v3/places/search?'
api_key = os.environ['FS_API_KEY']
headers = {
    'accept': 'application/json',
    'Authorization': api_key,
}

def get_nearby_places(lat, lon, radius=1000, categories=13032, limit=50, timeout=10):
    """Search Foursquare for places around one coordinate.

    Uses the module-level ``url`` and ``headers`` defined alongside this function.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the search centre.
    radius : int, default 1000
        Search radius in metres.
    categories : int or str, default 13032
        Foursquare category code (13032: coffee shops and cafes).
    limit : int, default 50
        Maximum number of places requested per call.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        The flattened ``results`` list of the response, or ``None`` when the
        request fails or the status code is not 200.
    """
    filters = {
        'll': f'{lat},{lon}',        # lat and lon joined into one string
        'radius': radius,
        'categories': categories,
        'limit': limit,
    }
    try:
        # Without a timeout a stalled connection would block the station loop forever
        response = requests.get(url, headers=headers, params=filters, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status so one station cannot abort the run
        print(f'Error: request failed for ({lat}, {lon}): {exc}')
        return None

    if response.status_code != 200:
        print(f'Error: status {response.status_code} for ({lat}, {lon})')
        return None

    # A 200 response without a 'results' key is treated as "no places found"
    results = response.json().get('results', [])
    return pd.json_normalize(results)

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

Put your parsed results into a DataFrame

In [10]:
# Query Foursquare once per bike station and stack the results.
# Frames are collected in a list and concatenated once: calling pd.concat inside
# the loop re-copies everything gathered so far on every iteration.
station_frames = []
for index, row in df.iterrows():
    lat, lon = row['latitude'], row['longitude']
    df_this_station = get_nearby_places(lat, lon)

    if df_this_station is not None:
        # Tag every place with the station it was found near
        df_this_station['station'] = row['station_id']
        station_frames.append(df_this_station)

# pd.concat raises on an empty list, so fall back to an empty frame if every call failed.
# ignore_index gives each row a unique label instead of repeating 0..n-1 per station.
df_all_stations = (
    pd.concat(station_frames, axis=0, ignore_index=True)
    if station_frames
    else pd.DataFrame()
)

In [11]:
# Bind the combined Foursquare results to a shorter name (same object, not a copy)
fsq_df = df_all_stations
fsq_df.shape

In [14]:
# Persist the Foursquare results (comma-separated, no index column) for the rest of the project
fsq_df.to_csv('fsq_df.csv', index=False)

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

Test api call:

In [22]:
# One-off request to inspect the shape of a Yelp response before writing the helper
yelp_api_key = os.environ['yelp_api']
y_headers = {"accept": "application/json",
             'Authorization': f'Bearer {yelp_api_key}'
}
# Longitudes west of Greenwich are negative: -123.4 is in the same region as the
# bike stations loaded above (longitude ~ -123.1), whereas +123.4 is on the
# opposite side of the globe.
filters = {'latitude': 48.4,
           'longitude': -123.4,
           'radius': 1000
           }
y_url = 'https://api.yelp.com/v3/businesses/search?'
# timeout keeps a stalled connection from hanging the kernel
y_response = requests.get(y_url, headers=y_headers, params=filters, timeout=10)

In [156]:
# Decode the JSON body of the test response
testing = y_response.json()
testing['businesses'] # this list is what the helper function passes to json_normalize

Yelp API call (limit 300 calls per day):

The entire city bikes DataFrame can be run in one pass: it has only 258 rows, which stays under the 300-calls-per-day limit.

In [23]:
# Yelp endpoint and bearer-token headers shared by every call below.
# The key is read from the environment so it never appears in the notebook.
y_url = 'https://api.yelp.com/v3/businesses/search?'
yelp_api_key = os.environ['yelp_api']
y_headers = {
    'accept': 'application/json',
    'Authorization': f'Bearer {yelp_api_key}',
}

def get_yelp_places(lat, lon, radius=1000, categories='cafes', limit=None, timeout=10):
    """Search Yelp for businesses around one coordinate.

    Uses the module-level ``y_url`` and ``y_headers`` defined alongside this function.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the search centre.
    radius : int, default 1000
        Search radius in metres.
    categories : str, default 'cafes'
        Yelp category alias to filter on.
    limit : int or None, default None
        Maximum number of businesses to request. When ``None`` the parameter is
        not sent, so the API applies its own default.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        The flattened ``businesses`` list of the response, or ``None`` when the
        request fails or the status code is not 200.
    """
    filters = {
        'latitude': lat,
        'longitude': lon,
        'radius': radius,
        'categories': categories,
    }
    if limit is not None:
        filters['limit'] = limit

    try:
        # Without a timeout a stalled connection would block the station loop forever
        y_response = requests.get(y_url, headers=y_headers, params=filters, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status so one station cannot abort the run
        print(f'Error: request failed for ({lat}, {lon}): {exc}')
        return None

    if y_response.status_code != 200:
        print(f'Error: status {y_response.status_code} for ({lat}, {lon})')
        return None

    # A 200 response without a 'businesses' key is treated as "nothing found"
    results = y_response.json().get('businesses', [])
    return pd.json_normalize(results)


Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

Put your parsed results into a DataFrame

In [24]:
# Query Yelp once per bike station and stack the results.
# Frames are collected in a list and concatenated once: calling pd.concat inside
# the loop re-copies everything gathered so far on every iteration.
yelp_frames = []
for index, row in df.iterrows():
    lat = row['latitude']
    lon = row['longitude']
    df_this_yelp = get_yelp_places(lat, lon)

    if df_this_yelp is not None:
        # Tag every business with the station it was found near
        df_this_yelp['station'] = row['station_id']
        yelp_frames.append(df_this_yelp)

# pd.concat raises on an empty list, so fall back to an empty frame if every call failed.
# ignore_index=True yields the same 0..n-1 index the former reset_index(drop=True) produced.
df_yelp_stations = (
    pd.concat(yelp_frames, axis=0, ignore_index=True)
    if yelp_frames
    else pd.DataFrame()
)

In [25]:
# Bind the combined Yelp results to a shorter name (same object, not a copy)
yelp_df = df_yelp_stations

yelp_df.shape

(3864, 30)

In [26]:
# Persist the Yelp results (comma-separated, no index column) for later use
yelp_df.to_csv('yelp_df.csv', index=False)

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

**Yelp:**

In [None]:
# Per-column flag: True when the column holds at least one missing value
yelp_nulls = yelp_df.isnull().any(axis=0)
yelp_nulls

In [42]:
# Summarise the Yelp frame: dimensions first, then the column index
print(
    'The shape is:\n', yelp_df.shape,
    '\nThe columns are:\n', yelp_df.columns,
)

The shape is:
 (3864, 30) 
The columns are:
 Index(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'transactions', 'phone', 'display_phone',
       'distance', 'business_hours', 'coordinates.latitude',
       'coordinates.longitude', 'location.address1', 'location.address2',
       'location.address3', 'location.city', 'location.zip_code',
       'location.country', 'location.state', 'location.display_address',
       'attributes.business_temp_closed', 'attributes.menu_url',
       'attributes.open24_hours', 'attributes.waitlist_reservation', 'price',
       'station'],
      dtype='object')


In [None]:
# yelp_df['review_count']

**FourSquare:**

In [None]:
# Display the combined Foursquare results
fsq_df

In [None]:
# Per-column flag: True when the column holds at least one missing value
fsq_nulls = fsq_df.isnull().any(axis=0)
fsq_nulls



In [56]:
# Summarise the Foursquare frame: dimensions first, then the column index
print(
    'The shape is:\n', fsq_df.shape,
    '\nThe columns are:\n', fsq_df.columns,
)

The shape is:
 (10660, 30) 
The columns are:
 Index(['fsq_id', 'categories', 'chains', 'closed_bucket', 'distance', 'link',
       'name', 'timezone', 'geocodes.drop_off.latitude',
       'geocodes.drop_off.longitude', 'geocodes.main.latitude',
       'geocodes.main.longitude', 'geocodes.roof.latitude',
       'geocodes.roof.longitude', 'location.address', 'location.country',
       'location.cross_street', 'location.formatted_address',
       'location.locality', 'location.postcode', 'location.region',
       'location.address_extended', 'related_places.children', 'station',
       'related_places.parent.fsq_id', 'related_places.parent.categories',
       'related_places.parent.name', 'geocodes.front_door.latitude',
       'geocodes.front_door.longitude', 'location.po_box'],
      dtype='object')


Both APIs return columns with null values; however, in both cases these are columns that I do not plan on using in this project. Most are secondary pieces of information that may be nice to have in the future but are not relevant now.

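The Foursquare DataFrame contains many more rows (10,660 vs. 3,864), in part because the Foursquare request asked for up to 50 results per station while the Yelp request did not set a limit. The data in the Yelp DataFrame is nevertheless more in line with how I plan to use the data moving forward: it contains the review count and rating, which I plan to use to build a model later on.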

Get the top 10 restaurants according to their rating

In [64]:
# yelp_df holds one row per (station, business) pair, so a business within range of
# several stations can appear more than once. Collapse to one row per Yelp business
# id before ranking; otherwise the same place can fill several of the ten slots.
top_ten = (
    yelp_df
    .drop_duplicates(subset='id')
    .sort_values('rating', ascending=False)
    .head(10)
)

In [None]:
# Display the ten highest-rated rows
top_ten

These restaurants all have a very small number of reviews. Repeating the question for restaurants with at least 10 reviews:

In [66]:
# Same ranking, restricted to businesses with at least 10 reviews.
# yelp_df holds one row per (station, business) pair, so duplicates are dropped on
# the Yelp business id first to keep the same place from filling several slots.
true_top_ten = (
    yelp_df[yelp_df['review_count'] >= 10]
    .drop_duplicates(subset='id')
    .sort_values('rating', ascending=False)
    .head(10)
)

In [None]:
# Display the ten highest-rated rows among businesses with at least 10 reviews
true_top_ten

The true top ten has many restaurants tied at 4.8. This could be further explored if given more time. In the future, weighting review_count and rating together could give a better overall 'rating' score.