# Feeding city_bikes.ipynd Latitude and Longitude Values Into YELP API

In [34]:
import os
import pandas as pd
import requests
from dotenv import load_dotenv
import json

In [35]:
# loading the .env file
load_dotenv()

True

In [36]:
# assigning stored api_key in .env file to vairables
api_key = os.environ.get('YELP4')
print(api_key)

ywTEe6T5IJYriLN2Wv00bx8T012Xk9JGSA4xXxQaS6cyyU1Gk7KmArPH3P4ObwHLU4i5A6qt6FFJHwVc9NaY3zSiQkWSm-M0LkhwwpQ2A5HId_TfAtAjGAG_fT00ZXYx


In [37]:
# Setting up base_url for YELP
url='https://api.yelp.com/v3/businesses/search'

# Headers for authorization (api key)
headers = {
    'Authorization': 'Bearer %s' % api_key
}

In [38]:
# reading in the CityBikes - Toronto Bike Share csv from city_bikes.ipynb
cityBikes = pd.read_csv('BikeShareToronto.csv')
cityBikes

# Pulling all bike stations in toronto, we get 717 stations in total, with their respective lat/lon's and the availibility of free bikes.

Unnamed: 0.1,Unnamed: 0,latitude,longitude,free_bikes
0,0,43.665269,-79.319796,18
1,1,43.671420,-79.445947,2
2,2,43.666224,-79.317693,10
3,3,43.653236,-79.376716,4
4,4,43.663722,-79.380288,23
...,...,...,...,...
712,712,43.601915,-79.499419,1
713,713,43.642852,-79.531995,1
714,714,43.650945,-79.379498,0
715,715,43.707356,-79.437231,0


### Instead of calculating Euclidean distance, comparing each latitude/longitude of bars/resturants/libraries to each latitude/longitude of each bike station, instead we're going to feed the latitudes/longitudes of each bike stop into the YELP API call, adjusting the radius to 1000m (1km) and setting the limit to the maximum of 50 - this will give us the total number of POI's (depending on what our search category is) within a 1k radius of each bike station.

In [39]:
# setting up lat and lon values for cityBike stations
lat = cityBikes['latitude'].values # will give a numpy array of each bike station latitude
lon = cityBikes['longitude'].values
len_cityBikes = len(cityBikes) # will give the number of bike stations - were going to use this length/value below when we iterate using a for-loop.

In [40]:
# intitialising lists for number of bars and libs and the minimum distance to the closest bar and lib within a 1000m (1km) radius of each bike stand. 
# These will ulitmately be added to to cityBikes dataFrame and used for modelling and EDA. The goal here is to rid away with joinig tables with pandas or SQL and essentially,
# perform everything uptream. This method ensure data validity and integrity - ensuring no duplicates and issues that come about with joining on keys etc..

num_of_bars = [] # number of bars within 1km of bike station
avg_bar_rating = [] # average bar rating for number of bars in near given ike station
num_of_libs = [] # same as above but for libraries
avg_lib_rating = [] # average lib rating for number of libs in near given bike station
min_bars_dist = [] # distance of closest bar to bike station in meters
min_libs_dist = [] # || for libraries


# Unlike with the Foursquare API (where the rate limit is much higher than 500), were only grabing first 250 lat/long values from BikeShareToronto.csv. Given that we're querying the API twice, once for bars and once
# for library's, we make two calls per bike station, and with a rate limit of 500, we can only query 250 bike stations. This will prove to be a bottleneck and will give favour to the foursquare API.
for k in range(0, 250): 
    ll = f'{lat[k]},{lon[k]}' # this is concatinating the lat and lon of each bikeStation which will be fed into the Fouraquare API.

    # finding bars
    params = {
        'term':'bars',
        'location': ll,
        'radius': 1000, # bars withing 1000m or (1km) of each bike station
        'limit': 50 # max limit of results
    }
    
    response = requests.get(url, params=params, headers=headers)

    data = response.json()

    num_of_bars.append(len(data['businesses'])) # appending number of bars (within a 1km distance) for every bike station to the above created list

    # Appending the bars' distance, more specifically the one with the closest or minimum distance for every bike station.
    # error handling incase ['distance'] may not exist.
    try:
        min_bars_dist.append(round(pd.json_normalize(data['businesses'])['distance'].min(), 2)) # appending bar with minimum distance of bike station
    except KeyError:
        min_bars_dist.append(None)

    # Extract ratings from businesses - unlike in the foursquare API, the YELP API provides users with ratings for bars. So along with the number of bars for every bike station, the average rating of those bars
    # will be calcualted and be present as a column in the final DataFrame.
    ratings = [business.get('rating', None) for business in data['businesses']]
    # Filter out None values
    valid_ratings = [rating for rating in ratings if rating is not None]

    if valid_ratings:  # Check if there are valid ratings
        avg_rating = sum(valid_ratings) / len(valid_ratings)
    else:
        avg_rating = None # will give 0 incase no ratings exist
    
    avg_bar_rating.append(avg_rating) # appending the average rating to the 'avg_bar_rating' list.
    
    # Like for bars, we're searching for librarys now.
    params = {
        'term':'library',
        'location': ll, # will be feeding in bike station lat/longs from BikeShareToronto.csv
        'radius': 1000, # library's withing 1000m or (1km) of each bike station
        'limit': 50 
    }

    response = requests.get(url, params=params, headers=headers)

    data = response.json()

    
    num_of_libs.append(len(data['businesses']))  # appending number of library's for every bike station (within a 1km distance) to the above created list

    # Error handling incase ['distance'] may not exist. extracting the bar with the shortest distance for every bike station.
    try:
        min_libs_dist.append(round(pd.json_normalize(data['businesses'])['distance'].min(), 2))
    except KeyError:
        min_libs_dist.append(None)

     # Extract ratings from businesses
    ratings = [business.get('rating', None) for business in data['businesses']]
    # Filter out None values
    valid_ratings = [rating for rating in ratings if rating is not None]

    if valid_ratings:  # Check if there are valid ratings
        avg_rating = sum(valid_ratings) / len(valid_ratings)
    else:
        avg_rating = None
    
    avg_lib_rating.append(avg_rating)


In [41]:
pd.json_normalize(data['businesses'])

In [48]:
# Using the above created, here we're building out the DataFrame - 'bike_station#', 'latitude', 'longitude' and 'free_bikes' come from the BikeShareToronto.csv but 'num_of_bars'
# 'min_bar_dist', 'num_of_libs' and 'min_libs_dist' were created from the foursquare API.

df_bikes_bars_libs = cityBikes.iloc[:250,:]

# creating new columns for number of bar and libraries within 1km radius and the bar and lib with minimum distance to bike station as well as average rating of bars and libraries.
df_bikes_bars_libs['num_of_bars'] = num_of_bars
df_bikes_bars_libs['avg_bar_rating'] = avg_bar_rating
df_bikes_bars_libs['min_bar_dist'] = min_bars_dist
df_bikes_bars_libs['num_of_libs'] = num_of_libs
df_bikes_bars_libs['avg_lig_rating'] = avg_lib_rating
df_bikes_bars_libs['min_lib_dist'] = min_libs_dist


# renaming column' 'unnamed: 0' to 'bike_station#'
df_bikes_bars_libs = df_bikes_bars_libs.rename(columns={'Unnamed: 0': 'bike_station#'})

# Exporting csv
df_bikes_bars_libs.to_csv('yelp_BarLibs_BikeRadius1.csv')

df_bikes_bars_libs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bikes_bars_libs['num_of_bars'] = num_of_bars
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bikes_bars_libs['avg_bar_rating'] = avg_bar_rating
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bikes_bars_libs['min_bar_dist'] = min_bars_dist
A value is trying to be set on a copy of a slice fro

Unnamed: 0,bike_station#,latitude,longitude,free_bikes,num_of_bars,avg_bar_rating,min_bar_dist,num_of_libs,avg_lig_rating,min_lib_dist
0,0,43.665269,-79.319796,18,16,4.00000,216.73,1,3.500000,789.09
1,1,43.671420,-79.445947,2,10,3.05000,598.29,2,3.750000,688.66
2,2,43.666224,-79.317693,10,16,3.71875,341.48,1,3.500000,765.56
3,3,43.653236,-79.376716,4,50,3.87000,130.70,10,3.500000,561.35
4,4,43.663722,-79.380288,23,50,3.66000,62.99,9,3.944444,594.79
...,...,...,...,...,...,...,...,...,...,...
245,245,43.685569,-79.408019,0,2,3.50000,800.23,0,,
246,246,43.651678,-79.375233,1,50,3.80000,70.31,8,3.562500,435.89
247,247,43.651838,-79.378743,1,50,3.86000,89.87,8,3.812500,399.87
248,248,43.665221,-79.394009,1,50,3.73000,357.20,19,3.973684,166.38


### NOTE:

Given the nature of the YELP API and its 500 call rate-limit, we were only able to feed in 250 of the bike stations' latitudes and longitudes into the API - this ultimately makes it difficult for comparison against the Foursquare API's results, where in that one, we have 717 results which will ultimately prove more effective when modelling later on. More data the better.

That being said, because the order in which the latitude and longitudes were fed into both API's were the same, we can compare the first 250 rows from each data sets as they will show the exact same bike stations in the exact same order.

Given more time and a higher rate limit, we could have ran the the remaining bike stations into the YELP api, ultimately creating 3 datasets (250, 250 and 217 lines long) and stack them into one dataset, giving us all 717 bike stations.