##### Connect to the Foursquare API
##### Connect to the Yelp API. This API offers similar services as Foursquare.
##### For each of the bike stations in Part 1, query both APIs to retrieve information for the following in that location:
##### Restaurants or bars
##### Various POIs (points of interest) of your choice
##### Create a DataFrame for the Yelp results and Foursquare results.
##### Compare the quality of the Yelp and Foursquare API. For your location, which API gives you the most complete information/better coverage? NOTE: Your definition of 'coverage' is up to you. It could be simple 'number of POIs in the area', but it could also be something more specific like 'number of reviews per POI', or 'number of different attributes of each POI'.
## Complete the yelp_foursquare_EDA.ipynb notebook to demonstrate how you executed the tasks above.

In [1]:
import pandas as pd
import os # use this to access your environment variables
import requests # this will be used to call the APIs
import numpy as np
import json #json parsing libraries

In [2]:
# API credentials are read from environment variables so they never appear in the notebook.
# Either value is None when its variable is not set.
YELP_API_KEY = os.environ.get('YELP_API_KEY')
FOURSQUARE_KEY = os.environ.get('FOUR_SQUARE_API_KEY')

In [3]:
def get_venues_yelp(latitude, longitude, radius, api_key, categories=None, timeout=30):
    """
    Gets venues from yelp with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        radius (int): search radius around the coordinates, passed to the API unchanged
            (Yelp documents this value in meters)
        api_key (str): Yelp API key to use for query
        categories (str) : Place types as found in https://docs.developer.yelp.com/docs/resources-categories
            If not passed no type will be specified. Separate ids with commas
        timeout (float): seconds to wait for the server before giving up

    Returns:
        response: response object from the requests library.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within `timeout` seconds.
    """
    url = "https://api.yelp.com/v3/businesses/search"

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    params = {
        "latitude": float(latitude),
        "longitude": float(longitude),
        "radius": radius,
    }
    # Only send a category filter when one was actually requested.
    if categories is not None:
        params["categories"] = categories

    # Without a timeout a single stalled connection would block the whole
    # per-station loop indefinitely.
    return requests.get(url, headers=headers, params=params, timeout=timeout)

In [4]:
def get_venues_fs(latitude, longitude, radius, api_key, categories=None, timeout=30):
    """
    Gets venues from foursquare with a specified place type and coordinates.
    Args:
        latitude (float): latitude for query (must be combined with longitude)
        longitude (float): longitude for query (must be combined with latitude)
        radius (int): search radius around the coordinates, passed to the API unchanged
            (Foursquare documents this value in meters)
        api_key (str): foursquare API key to use for query
        categories (str) : Foursquare-recognized place types listed in: https://location.foursquare.com/places/docs/categories
            If not passed no place_type will be specified. Separate ids with commas
        timeout (float): seconds to wait for the server before giving up

    Returns:
        response: response object from the requests library.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within `timeout` seconds.
    """
    url = "https://api.foursquare.com/v3/places/search"

    headers = {
        "Accept": "application/json",
        "Authorization": api_key,
    }

    params = {
        "ll": f"{latitude},{longitude}",
        "radius": radius,
    }
    # Only send a category filter when one was actually requested.
    if categories is not None:
        params["categories"] = categories

    # Without a timeout a single stalled connection would block the whole
    # per-station loop indefinitely.
    return requests.get(url, headers=headers, params=params, timeout=timeout)

In [5]:
# The two functions above query the Foursquare and Yelp APIs for places
# within a given radius of a latitude/longitude pair.
# Further functions still need to be written to pass them the station data provided by the CityBikes API.

In [6]:
def station_query(row, radius=5000, categories=19046):
    """
    Queries Foursquare for places around one bike station.
    Args:
        row: one row of the bike-station table; must provide 'latitude' and 'longitude'
        radius (int): search radius passed to get_venues_fs (defaults to 5000)
        categories: Foursquare category id(s) to search for. Ids noted while exploring:
            19046 = CTA L station (default)
            19043 = Bus Stop
            19054 = all public transit

    Returns:
        The decoded JSON body of the Foursquare response.
    """
    response = get_venues_fs(
        float(row['latitude']),
        float(row['longitude']),
        radius,
        FOURSQUARE_KEY,
        categories,
    )
    return response.json()

In [14]:
def station_query_yelp(row, radius=5000, categories='bikeshop'):
    """
    Queries Yelp for businesses around one bike station.
    Args:
        row: one row of the bike-station table; must provide 'latitude' and 'longitude'
        radius (int): search radius passed to get_venues_yelp (defaults to 5000)
        categories (str): Yelp category alias(es) to search for (defaults to 'bikeshop')

    Returns:
        The decoded JSON body of the Yelp response.
    """
    response = get_venues_yelp(
        float(row['latitude']),
        float(row['longitude']),
        radius,
        YELP_API_KEY,
        categories,
    )
    return response.json()
# Information about public transit is completely missing from Yelp, so here I have elected to examine nearby bike shops instead.
# My own experience with bikeshare programs is using them to get back somewhere after dropping off a bike to be repaired,
# and I wondered whether such usage could be detected, since existing bike riders might be good customers to advertise to.

In [8]:
#the two functions above will apply the data from our previous query to the functions querying Four Square and Yelp

In [11]:
data = pd.read_json(r'unprocessed/Chi_9am_mon(raw).json')
data 
# An issue to be examined: the timestamp column appears to have lost its timezone information. My attempts to reinsert it failed, so the timestamps remain in GMT.

Unnamed: 0,comp_id,name,latitude,longitude,slots,free_bikes,renting,timestamp
0,divvy,Lake Park Ave & 56th St,41.793242,-87.587782,19,13,1,2023-11-20 15:01:15.831
1,divvy,Ada St & Washington Blvd,41.882830,-87.661206,15,10,1,2023-11-20 15:01:13.757
2,divvy,Ashland Ave & Grace St,41.950687,-87.668700,15,13,1,2023-11-20 15:01:16.001
3,divvy,Clark St & Wrightwood Ave,41.929546,-87.643118,15,7,1,2023-11-20 15:01:13.758
4,divvy,Adler Planetarium,41.866095,-87.607267,39,12,1,2023-11-20 15:01:14.360
...,...,...,...,...,...,...,...,...
1658,divvy,Exchange Ave & 79th St,41.752029,-87.552096,11,3,1,2023-11-20 15:01:13.716
1659,divvy,Rainbow - Beach,41.757871,-87.549386,15,0,1,2023-11-20 15:01:14.071
1660,divvy,Noble St & Milwaukee Ave,41.900700,-87.662607,15,7,1,2023-11-20 15:01:15.367
1661,divvy,Columbus Ave & 79th St,41.749119,-87.704263,7,3,1,2023-11-20 15:01:15.516


In [13]:
data.describe() 
# Here we can start to see some issues we might have in the future:
# the number of stations with zero or one free bikes appears very high (the median of free_bikes is 1).
# To be watched.

Unnamed: 0,latitude,longitude,slots,free_bikes,renting,timestamp
count,1663.0,1663.0,1663.0,1663.0,1663.0,1663
mean,41.850887,-87.678532,9.04991,3.60012,0.997595,2023-11-20 15:01:14.925148416
min,41.648501,-87.84396,1.0,0.0,0.0,2023-11-20 15:01:13.130000
25%,41.766388,-87.720899,2.0,0.0,1.0,2023-11-20 15:01:14.154500096
50%,41.867226,-87.67718,9.0,1.0,1.0,2023-11-20 15:01:15.017999872
75%,41.931753,-87.634429,15.0,6.0,1.0,2023-11-20 15:01:15.729499904
max,42.064854,-87.528232,55.0,34.0,1.0,2023-11-20 15:01:16.501000
std,0.097214,0.063382,8.543893,4.783966,0.048999,


In [None]:
# With axis=1, apply hands each row of `data` to station_query in turn and
# gathers the returned values (one Foursquare result per bike station) into `response`.
response = data.apply(station_query, axis=1)

In [None]:
# Same row-by-row query as for Foursquare, this time against Yelp.
response2 = data.apply(station_query_yelp, axis=1)

In [None]:
response2[1]['businesses'] #exploring the data received from Yelp, we see that we have made a poor choice.
len(response2) #while the correct length is returned

In [None]:
#Since the data from Yelp is not helpful for investigating the relationship between CTA stations and bike stations
#We will be continuing with only the Four Square data

In [None]:
#Now the question is, what information do we need to investigate our question. 
# As addressed in the readme, I have elected to look at the relationship between public transit 
#and the bikeshare program in Chicago.

##### Information to get per bike station:
#####  How close is the closest 'L' stop?
##### How many stations are within 1.6 km (1 mi)?
##### How many are within 3.2 km (2 mi)?

In [None]:
# Inspect one Foursquare response to find where the fields we need live.
print(response[0].keys())  # top-level keys of the first station's response
print(response[0]['results'][0].keys())  # fields available on the first returned place
print(response[0]['results'][0]['distance'])  # that place's distance from the queried coordinates
print(response[0]['results'][0]['location'])  # that place's location record

In [None]:
# from these little probings we can see that the fields we want are within the results. 
# each query will return a list of CTA stations and give their distance from the lat. and long. provided

In [None]:
# Module-level accumulators filled by fs_append: one entry per bike station, in input order.
distance = []
near_station = []
num_stations_1mi = []
num_stations_2mi = []


def fs_append(response, near_limit=1700, far_limit=3400):
    """
    Summarises the Foursquare results for every bike station and appends the
    summaries to the module-level lists distance, near_station,
    num_stations_1mi and num_stations_2mi.

    Args:
        response: iterable of decoded Foursquare responses, one per bike station.
            Each must contain a 'results' list whose items have 'distance' and
            'location' -> 'formatted_address'.
        near_limit: results with distance below this value are counted in
            num_stations_1mi (default 1700, i.e. roughly one mile in meters).
        far_limit: results with near_limit <= distance < far_limit are counted in
            num_stations_2mi (default 3400). The two bands do not overlap, so
            their sum is the number of results closer than far_limit.

    Returns:
        None. When a station has no results, 9999 is appended to all four lists
        as an error code.

    Raises:
        KeyError: if a response lacks one of the expected keys. Nothing is
            appended in that case, so the four lists stay aligned.
    """
    missing = 9999  # error code for stations whose query returned nothing
    summaries = []
    for payload in response:
        results = payload['results']
        if not results:
            # No results are returned when the radius is too small.
            summaries.append((missing, missing, missing, missing))
            continue

        # Pick the closest result explicitly instead of trusting results[0]:
        # the query sends no sort option, so ordering by distance is not guaranteed.
        nearest = min(results, key=lambda place: place['distance'])
        in_near_band = sum(1 for place in results if place['distance'] < near_limit)
        in_far_band = sum(
            1 for place in results if near_limit <= place['distance'] < far_limit
        )
        # NOTE(review): the counts can never exceed the number of results the API
        # returns per query; confirm the API's default result limit.
        summaries.append(
            (
                nearest['distance'],
                nearest['location']['formatted_address'],
                in_near_band,
                in_far_band,
            )
        )

    # Publish only after every response parsed cleanly, so a failure part-way
    # through cannot leave the global lists partially filled.
    for nearest_distance, address, near_count, far_count in summaries:
        distance.append(nearest_distance)
        near_station.append(address)
        num_stations_1mi.append(near_count)
        num_stations_2mi.append(far_count)

In [None]:
fs_append(response)

In [None]:
len(distance) #since this returns the expected value, we have received the number of responses we expect and can build a dataframe from the results to examine further

In [None]:
# Assemble the per-station Foursquare summary from the lists filled by fs_append.
df1 = pd.DataFrame(
    {
        'distance': distance,
        'near_station': near_station,
        'num_stations_1mi': num_stations_1mi,
        'num_stations_2mi': num_stations_2mi,
    }
)
# Total of both distance bands; not strictly needed, but kept in case a later calculation wants it.
df1['all_stations'] = df1['num_stations_1mi'] + df1['num_stations_2mi']

In [None]:
df1.loc[df1['distance']==9999] #check to see which entries have given us missing or incomplete data, which will appear as '9999'

In [None]:
# Any 9999 values should be dropped from the final table. More importantly, they indicate that those stations lie outside the area we are curious about:
# the locations most likely to be used by commuters who have also accessed the CTA.

In [None]:
df1.to_json(r'four_square_9am_mon.json') #these results will be saved as a .json to populate fields from our city bikes queries.
#since we do not expect either CTA stations or Divvy bike stations to move, this data will apply to all future sets.

In [21]:
df2 = pd.read_json('Chi_8am_mon_fs.json')

In [17]:
df3 = pd.read_json('Chi_640am_mon_fs.json')

In [22]:
df2.describe()

Unnamed: 0,latitude,longitude,slots,free_bikes,renting,timestamp,distance,num_stations_1mi,num_stations_2mi,transit_cat
count,1663.0,1663.0,1663.0,1663.0,1663.0,1663,1663.0,1663.0,1663.0,1663.0
mean,41.850887,-87.678532,9.04991,3.594107,0.998196,2023-11-20 14:05:46.840000,1577.116657,4.146723,5.364402,0.994588
min,41.648501,-87.84396,1.0,0.0,0.0,2023-11-20 14:05:46.840000,3.0,0.0,0.0,0.0
25%,41.766388,-87.720899,2.0,0.0,1.0,2023-11-20 14:05:46.840000,709.0,2.0,3.0,1.0
50%,41.867226,-87.67718,9.0,1.0,1.0,2023-11-20 14:05:46.840000,1365.0,3.0,6.0,1.0
75%,41.931753,-87.634429,15.0,6.0,1.0,2023-11-20 14:05:46.840000,2433.5,6.0,8.0,1.0
max,42.064854,-87.528232,55.0,31.0,1.0,2023-11-20 14:05:46.840000,3515.0,10.0,10.0,1.0
std,0.097214,0.063382,8.543893,4.685733,0.042448,,1034.226487,3.11166,2.9157,0.073388


In [28]:
# Keep only the Foursquare-derived columns of df2, then add the combined count of both distance bands.
df = df2[['distance', 'near_station', 'num_stations_1mi', 'num_stations_2mi']].copy()
df['all_stations'] = df['num_stations_1mi'] + df['num_stations_2mi']

In [29]:
df.to_json(r'four_square_9am_mon.json')

In [26]:
df

Unnamed: 0,distance,num_stations_1mi,num_stations_2mi,all_stations
0,653,8,2,10
1,384,9,1,10
2,432,10,0,10
3,500,10,0,10
4,1421,5,5,10
...,...,...,...,...
1658,3007,1,6,7
1659,3133,2,6,8
1660,1193,7,3,10
1661,1838,4,6,10


In [None]:
import datetime as dt
import pytz

In [None]:
# The stored timestamps are in GMT/UTC. Calling .dt.tz_convert directly fails when the
# column carries no timezone, so first make it explicitly UTC: to_datetime(utc=True)
# labels timezone-less values as UTC and converts timezone-aware ones to UTC.
# Only then convert to Chicago local time.
df1['timestamp'] = pd.to_datetime(df1['timestamp'], utc=True).dt.tz_convert('US/Central')

In [None]:
# Convert the GMT/UTC timestamps to Chicago local time.
# The earlier version localized the values as US/Central and then converted to UTC,
# which treats UTC clock readings as if they were Central time and shifts every value
# by the UTC offset. Marking the values as UTC first and then converting gives the
# intended result whether or not the column already carries a timezone.
# NOTE(review): the earlier formulas probably behaved differently from day to day
# because some files loaded with a timezone and others without - to be confirmed.
df3['timestamp'] = pd.to_datetime(df3['timestamp'], utc=True).dt.tz_convert('US/Central')

In [None]:
df1 = pd.read_json('Chi_8am_mon.json')

In [None]:
df1.head()

In [None]:
len(df1.loc[(df1['distance']==9999)])

In [None]:
df2 = pd.read_json('Chi_f30_Fri_fs.json')
df1 = pd.read_json('Chi_9am_mon_fs.json')

In [None]:
df2.loc[(df2['distance']==9999)]

In [None]:
# Copy the Foursquare-derived columns from df2 onto df1, one column at a time.
for fs_column in (
    'distance',
    'near_station',
    'num_stations_1mi',
    'num_stations_2mi',
    'all_stations',
):
    df1[fs_column] = df2[fs_column]

In [None]:
# transit_cat marks whether a bike station sits beside (or nearly beside) an L station:
#   0 -> distance of 50 or less, i.e. w/in 50m (about 150ft) of an L station
#   1 -> distance greater than 50, i.e. outside that range
# A row with a missing distance gets 0, because the comparison is False for it.
df2['transit_cat'] = (df2['distance'] > 50).astype('int64')

In [None]:
# transit_cat marks whether a bike station sits beside (or nearly beside) an L station:
#   0 -> distance of 50 or less, i.e. w/in 50m (about 150ft) of an L station
#   1 -> distance greater than 50, i.e. outside that range
# A row with a missing distance gets 0, because the comparison is False for it.
df1['transit_cat'] = (df1['distance'] > 50).astype('int64')

In [None]:
df2

In [None]:
dl = pd.read_json('Chi_9am_mon_fs.json')

In [None]:
len(dl.loc[dl['transit_cat']==0])

In [None]:
len(df1.loc[df1['transit_cat']==0])

In [None]:
data.describe()

In [None]:
data = pd.read_json(r'chi_five30_thurs.json')