**A couple of functions meant to call the google api 200 times for each bike, car and transit routes from 200 randomly picked rides from the New York Citibike data**

In [3]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
import json
api_key="that's private"

In [71]:
# call the main function
df=get_distances()

  df=pd.concat(df_list, 1, ignore_index=True).rename(columns={0:"Distance bike", 1:"Duration bike",2:"Distance transit", 3:"Duration transit", 4:"Distance car", 5:"Duration car"})


In [72]:
# save the df
df.to_pickle("./df_maps")

In [4]:
# read the df
df=pd.read_pickle("./df_maps")

In [36]:
# get the delta of the bike and transit car data

# get the numerical values
# the position depends if it's "min" or "mins"
bike_dur=df["Duration bike"].apply(lambda x:int(x[:-5]) if "mins" in x else int(x[:-4]))
car_dur=df["Duration car"].apply(lambda x:int(x[:-5]) if "mins" in x else int(x[:-4]))

# calculate the delta
dif_bike_car=bike_dur-car_dur

# print the results
print(dif_bike_car.describe())
print(dif_bike_car.median())

count    200.00000
mean      -0.02500
std        2.78625
min       -6.00000
25%       -2.00000
50%        0.00000
75%        1.00000
max       16.00000
dtype: float64
0.0


In [37]:
# get the delta of the bike and transit travel data

# get the mean of the transit duration values
transit_mean=df["Duration transit"].apply(lambda x:int(x[:-5]) if "mins" in x else 0).mean()

# turn the duration values into strings
# necessary as there are some nan values which have been turned to zeros
df["Duration transit"]=df["Duration transit"].apply(lambda x:str(x))

# get the numerical values
# the position depends if it's "min" or "mins"
bike_dur=df["Duration bike"].apply(lambda x:int(x[:-5]) if "mins" in x else int(x[:-4]))

# get the numerical values
# if it's zero, get the transit mean
trans_dur=df["Duration transit"].apply(lambda x:int(x[:-5]) if "mins" in x else transit_mean)

# calculate the delta
dif_bike_trans=bike_dur-trans_dur

# print the results
print(dif_bike_trans.describe())
dif_bike_trans.median()

count    200.000000
mean      -6.662700
std        4.940848
min      -23.000000
25%       -9.000000
50%       -6.000000
75%       -4.000000
max        7.000000
dtype: float64


-6.0

In [3]:
def get_data():
    """
    read the ny data and extract 200 random samples
    
    inputs:
      - row: a df row
    returns:
      - df_sample: a df containing 200 random datapoints
    """
    # read the ny df
    df_blank=pd.read_pickle("./df_blank.pkl")
    # exclude the blank rows
    df_blank=df_blank[~df_blank.isnull().any(axis=1)]
    # exclude quebec
    df_ny=df_blank[df_blank['start station latitude']<45]
    # get 200 random samples
    df_sample=df_ny[df_ny["usertype"]=="Subscriber"].sample(n=200)
    
    return df_sample

In [4]:
def get_coordinates(row):
    """
    extract start and end longitude and latitude from the rows
    and turns them into a string usable by the api
    
    inputs:
      - row: a df row
    returns:
      - start: the starting coordinates
      - end: the end coordinates
    """
    start=str(row["start station latitude"]) +","+ str(row["start station longitude"])
    end=str(row["end station latitude"]) +","+ str(row["end station longitude"])
    
    return start, end

In [31]:
def create_start_time(date):
    """
    create the departure time for the api
    as google wont take historical data the timestamp has to be in the future
    makes sure it's the same time (h:m:s) as well as the same weekday
    
    inputs:
      - date: a string containing the full date including the time
    returns:
      - df: a pandas dataframe containing the distance and duration for
              cars, bikes and transit and the starttime of the call
    """

    form='%Y-%m-%d %H:%M:%S'
    
    # omits the miliseconds
    date=datetime.strptime(date[:-5], form)
    
    # adds year and month from the date
    year=str(datetime.now().year)
    month=str(datetime.now().month)
    
    # the starttime needs to have the same weekday
    # makes sure to add the right amount of days to the current day
    date_delta=7+date.weekday()-datetime.now().weekday()
    day=str(datetime.now().day+date_delta)
    
    # adds hour, minute and second from the date
    hour=str(date.hour)
    minute=str(date.minute)
    second=str(date.second)
    
    # add the variables to form the date string
    date_str=year+"-"+month+"-"+day+" "+hour+":"+minute+":"+second
    
    # turns the date string into a datetime object
    # than into a timestamp, which is turned into an int and finally to a string
    # also adds six hours due to time zones
    start_time=str(int(datetime.strptime(date_str, form).timestamp()+timedelta(hours=6).seconds))

    return start_time

In [70]:
def get_distances():
    """
    gets the sample dataframe and iterates through the rows
    calling the functions for the extraction of duration and distance information
    and turning them into dataframes
    
    returns:
      - df: a pandas dataframe containing the distance and duration for
              cars, bikes and transit and the starttime of the call
    """
    
    # initialize the lists
    bike=[]
    transit=[]
    car=[]
    
    # get the sample df
    df_sample=get_data()
    
    #iterate through the rows
    for i, row in df_sample.iterrows():
        
        # extract the coordinates
        start, end=get_coordinates(row)
        
        # extract the start time
        starttime=create_start_time(row["starttime"])
        
        # create the urls
        url_bike, url_transit, url_car=create_urls(start, end, start_time)
        
        # get the responses through the api
        response_bike, response_transit, response_car=get_response(url_bike, url_transit, url_car)
        
        # append the results to a list
        bike.append(extract_distance_and_duration(response_to_dict(response_bike)))
        transit.append(extract_distance_and_duration(response_to_dict(response_transit)))
        car.append(extract_distance_and_duration(response_to_dict(response_car)))
    
    # turn the lists to dataframes
    bike_df=pd.DataFrame(bike)
    transit_df=pd.DataFrame(transit)
    car_df=pd.DataFrame(car)
    
    # concat the dataframes
    df_list=[bike_df, transit_df, car_df]
    df=pd.concat(df_list, 1, ignore_index=True).rename(columns={0:"Distance bike", 1:"Duration bike",2:"Distance transit", 3:"Duration transit", 4:"Distance car", 5:"Duration car"})
    
    # add the starttime
    df["starttime"]=df_sample["starttime"].apply(lambda x:x[11:19])
    
    return df

In [9]:
def create_urls(start, end, start_time):
    """
    creates the urls necessary for the api calls
    
    inputs:
      - start: the starting coordinates
      - end: the goal coordinates
      - start_time: the timestamp
      
    returns:
      - url_bike: the url for the bike call
      - url_transit: the url for the transit call
      - url_car: the url for the car call
    """
    
    # build the url strings
    url_bike = ("https://maps.googleapis.com/maps/api/directions/json?"
            "origin="+start+
            "&destination="+end+
            "&departure_time="+start_time+
            "&mode=bicycling"
            "&key="+api_key)

    url_transit = ("https://maps.googleapis.com/maps/api/directions/json?"
            "origin="+start+
            "&destination="+end+
            "&departure_time="+start_time+
            "&mode=transit"
            "&key="+api_key)

    url_car = ("https://maps.googleapis.com/maps/api/directions/json?"
            "origin="+start+
            "&destination="+end+
            "&departure_time="+start_time+
            "&mode=driving"
            "&key="+api_key)

    return url_bike, url_transit, url_car

In [10]:
def get_response(url_bike, url_transit, url_car):
    """
    get the response through the api
    
    inputs:
      - url_bike: the url for the bike call
      - url_transit: the url for the transit call
      - url_car: the url for the car call
      
    returns:
      - response_bike: the response of the bike call
      - response_transit: the response of the transit call
      - response_car: the response of the car call
    """
    payload={}
    headers = {}

    response_bike = requests.request("GET", url_bike, headers=headers, data=payload)
    response_transit = requests.request("GET", url_transit, headers=headers, data=payload)
    response_car = requests.request("GET", url_car, headers=headers, data=payload)
    
    return response_bike, response_transit, response_car

In [11]:
def response_to_dict(response):
    """
    turn the response into a dict
    
    inputs:
      -response: a response object
    returns:
      -obj: a dict containing the response information
    """
    
    # load the response text
    json_string = response.text
    
    # turn the text into a dict
    obj = json.loads(json_string)
    
    return obj

In [12]:
def extract_distance_and_duration(obj):
    """
    extract the relevant information
    
    inputs:
      -obj: a dict containing the response information
    returns:
      -dis: the distance
      -dur: the duration
    """
    
    # extract the distance and the duration
    try:
        dis=obj['routes'][0]["legs"][0]["distance"]["text"]
        dur=obj['routes'][0]["legs"][0]["duration"]["text"]
        
    # some calls don't have route and will return a nearly empty dict
    except:
        dis=0
        dur=0
    
    return dis, dur