In [104]:
import pandas as pd
import time
pd.set_option('display.max_rows',200)

#### Learning how the API works

In [2]:
! curl -H 'Authorization: Token yl7ekm12gBBo4bfSVcbsYRIZWOHqpmNdwabQZRyR' \
     -H 'Accept-Language: en_US' \
     -H 'Content-Type: application/json' \
     'https://api.uber.com/v1.2/estimates/price?start_latitude=37.7752315&start_longitude=-122.418075&end_latitude=37.7752415&end_longitude=-122.518075'

{"prices":[{"localized_display_name":"Black","distance":6.32,"display_name":"Black","product_id":"d4abaae7-f4d6-4152-91cc-77523e8165a4","high_estimate":59.0,"low_estimate":53.0,"duration":1500,"estimate":"$53-59","currency_code":"USD"},{"localized_display_name":"UberX","distance":6.32,"display_name":"UberX","product_id":"a1111c8c-c720-46c3-8534-2fcdd730040d","high_estimate":28.0,"low_estimate":26.0,"duration":1500,"estimate":"$26-28","currency_code":"USD"},{"localized_display_name":"UberXL","distance":6.32,"display_name":"UberXL","product_id":"821415d8-3bd5-4e27-9604-194e4359a449","high_estimate":41.0,"low_estimate":37.0,"duration":1500,"estimate":"$37-41","currency_code":"USD"},{"localized_display_name":"Select","distance":6.32,"display_name":"Select","product_id":"57c0ff4e-1493-4ef9-a4df-6b961525cf92","high_estimate":46.0,"low_estimate":42.0,"duration":1500,"estimate":"$42-46","currency_code":"USD"},{"localized_display_name":"Black SUV","distance":6.32,"display_name":"Black SUV","pro

#### Trying the Python library released by them for the same

In [3]:
from uber_rides.session import Session
from uber_rides.client import UberRidesClient
import pandas as pd
session = Session(server_token='yl7ekm12gBBo4bfSVcbsYRIZWOHqpmNdwabQZRyR')
client = UberRidesClient(session)

In [4]:
response = client.get_price_estimates(
    start_latitude=37.770,
    start_longitude=-122.411,
    end_latitude=37.791,
    end_longitude=-122.405,
    seat_count=2
)

In [5]:
response = client.get_price_estimates(
    start_latitude=40.689515,
    start_longitude=-74.176778,
    end_latitude=40.625724,
    end_longitude=-73.826141,
    seat_count=2
)

In [6]:
prices = pd.DataFrame(response.json.get('prices'))

In [7]:
prices

Unnamed: 0,currency_code,display_name,distance,duration,estimate,high_estimate,localized_display_name,low_estimate,product_id
0,USD,UberXL,38.64,5280,$151-167,167.0,UberXL,151.0,a539ddeb-a2e4-43b5-9c51-3a53e0c74c0c
1,USD,UberPOOL,38.64,5280,$117-128,129.0,UberPOOL,117.0,3145c334-25c6-462d-a2f5-70c38a165746
2,USD,Black,38.64,5280,$265-291,291.0,Black,265.0,3a50283f-e905-4290-ae35-411646da9231
3,USD,Black SUV,38.64,5280,$312-344,344.0,Black SUV,312.0,3de7e395-d50b-4577-af55-78aa047cf47f
4,USD,UberX,38.64,5280,$127-139,139.0,UberX,127.0,1b64bf82-a0ba-4b0f-be32-df8d05481d7e
5,USD,Comfort,38.64,5280,$153-168,168.0,Comfort,153.0,58aa1c9f-9b26-4e58-9aa2-72963e6ad394
6,USD,UberX,38.64,5280,$127-139,139.0,UberX,127.0,bbec56dc-1c72-44ea-ba64-fe51bf392c09


In [8]:
prices[(prices['display_name']=='UberPOOL') | (prices['display_name']=='UberPool')]

Unnamed: 0,currency_code,display_name,distance,duration,estimate,high_estimate,localized_display_name,low_estimate,product_id
1,USD,UberPOOL,38.64,5280,$117-128,129.0,UberPOOL,117.0,3145c334-25c6-462d-a2f5-70c38a165746


#### Seeing whether code A or B is faster:

In [9]:
# Code A: Slicing the dataframe twice to get each attribute and then, computing inline without storing any intermediate values.
x = time.time()
for i in range(100):
    (prices[prices['display_name']=='UberPool']['high_estimate'] + prices[prices['display_name']=='UberPool']['low_estimate'])/2
time.time() -x 

0.20588994026184082

In [10]:
# Code B: Slicing once, storing the slice and then, computing.
x = time.time()
for i in range(100):
    prices = prices[prices['display_name']=='UberPool']
    (prices.low_estimate+prices.high_estimate)/2
time.time() -x 

0.1122140884399414

#### As you can see, Code B runs faster

### Getting the prices for the taxi zones

In [11]:
zones = pd.read_csv('Data/Taxi_Zones/lat_long_for_API.csv',index_col=0)

In [12]:
zones.reset_index(drop=True,inplace=True)

In [13]:
zones.head()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff
0,1,-74.176778,40.689515,2,-73.826141,40.625724
1,1,-74.176778,40.689515,3,-73.849479,40.865871
2,1,-74.176778,40.689515,4,-73.977024,40.724151
3,1,-74.176778,40.689515,5,-74.189938,40.550339
4,1,-74.176778,40.689515,6,-74.067786,40.599053


In [14]:
zones.tail()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff
68893,263,-73.951209,40.778495,258,-73.856646,40.69014
68894,263,-73.951209,40.778495,259,-73.856399,40.89908
68895,263,-73.951209,40.778495,260,-73.903715,40.74679
68896,263,-73.951209,40.778495,261,-74.012919,40.708976
68897,263,-73.951209,40.778495,262,-73.94583,40.776536


In [15]:
def get_Uber_price(start_latitude, start_longitude, end_latitude, end_longitude, seat_count=2):
    '''Function to get the current price of Uber Pool and Uber X using the Uber API
    
    Arguments:
        start_latitude (float)
            The latitude component of a start location.
        start_longitude (float)
            The longitude component of a start location.
        end_latitude (float)
            The latitude component of a end location.
        end_longitude (float)
            The longitude component of a end location.
        seat_count (int)
            The number of seats required for uberPOOL.Default and maximum value is 2.

    Returns:
        pool_price: The current average Uber Pool Price for the given parameters.
        X_price: The current average UberX Price for the given parameters.
    '''
    # Calling the Uber API to get the estimate
    response = client.get_price_estimates(start_latitude=start_latitude, start_longitude=start_longitude, \
                                          end_latitude=end_latitude, end_longitude=end_longitude, seat_count=2)
    # Converting the Json file containing the estimates to a dataframe
    prices = pd.DataFrame(response.json.get('prices'))
    # Retaining only the price for Pool and X
    price_X = prices[(prices['display_name']=='UberX')][:1]
    price_pool = (prices[(prices['display_name']=='UberPOOL') | (prices['display_name']=='UberPool')])
    # Returning the average price shown for Pool and X
    if price_pool.empty:
        return(float('nan'),float((price_X[:1].low_estimate+price_X[:1].high_estimate)/2))
#     print(price.head())
    return(float((price_pool[:1].low_estimate+price_pool[:1].high_estimate)/2),float((price_X[:1].low_estimate+price_X[:1].high_estimate)/2))

In [16]:
len(zones)*1.0/2000

34.449

In [17]:
len(zones)

68898

In [43]:
Server_tokens = ['yl7ekm12gBBo4bfSVcbsYRIZWOHqpmNdwabQZRyR','oHhBrthXZH2ky2fJuwyrPAICJFWT4AlUzGQ8Yja8','l_seoFJittkNTgOLMa2txwGJX915nw35AItTLtjx']

### After 22530, there are NAN values for many rows as there seemed to be some issue with the API. Try again later.

In [158]:
price_pool = []
price_X = []
i= 1
while i < len(zones):
    start_time = time.time()
    for token in Server_tokens:
        session = Session(server_token = token)
        client = UberRidesClient(session)
        
        for index, row in zones[i:i+2000].iterrows():
#             print(index)
#             print(row)
            if index>len(zones): break
            pool,X = (get_Uber_price(start_latitude=row.lat_pickup, start_longitude=row.lon_pickup, \
                                                  end_latitude=row.lat_dropoff, end_longitude=row.lon_dropoff))
    #         if pool is None or X is None:
    #             time.sleep(3600.0 - ((time.time() - starttime) % 3600.0))
            price_pool.append(pool)
            price_X.append(X)
            if index%100==0: print(index)
            last = index
        i+=2000
    time.sleep(3600.0 - ((time.time() - start_time) % 3600.0))

67500
67600
67700
67800
67900
68000
68100
68200
68300
68400
68500
68600
68700
68800


In [161]:
uber_price = zones[67471:last+1]

In [159]:
last

68897

In [162]:
len(price_X)

1427

In [163]:
# price_X

In [164]:
uber_price['Pool_Price'] = price_pool
uber_price['Non_shared_Price'] = price_X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [165]:
uber_price.shape

(1427, 8)

In [166]:
uber_price.dropna().shape

(641, 8)

In [167]:
uber_price.tail()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff,Pool_Price,Non_shared_Price
68893,263,-73.951209,40.778495,258,-73.856646,40.69014,,68.0
68894,263,-73.951209,40.778495,259,-73.856399,40.89908,,46.5
68895,263,-73.951209,40.778495,260,-73.903715,40.74679,,26.0
68896,263,-73.951209,40.778495,261,-74.012919,40.708976,,36.5
68897,263,-73.951209,40.778495,262,-73.94583,40.776536,,11.5


In [168]:
uber_price.to_csv('Data/Uber/Uber_price_69897.csv')

#### Concatenating the uber data from all CSV files

In [64]:
import pandas as pd
import glob, os    
df = pd.concat(map(lambda file: pd.read_csv(file, index_col=0), glob.glob(os.path.join('Data/Uber', '*.csv'))))

In [65]:
df.shape

(68898, 8)

In [66]:
df.sort_index(inplace=True)

In [67]:
df.head()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff,Pool_Price,Non_shared_Price
0,1,-74.176778,40.689515,2,-73.826141,40.625724,126.5,137.5
1,1,-74.176778,40.689515,3,-73.849479,40.865871,89.5,92.5
2,1,-74.176778,40.689515,4,-73.977024,40.724151,68.5,74.5
3,1,-74.176778,40.689515,5,-74.189938,40.550339,55.0,55.5
4,1,-74.176778,40.689515,6,-74.067786,40.599053,54.0,57.5


In [68]:
df.tail()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff,Pool_Price,Non_shared_Price
68893,263,-73.951209,40.778495,258,-73.856646,40.69014,,68.0
68894,263,-73.951209,40.778495,259,-73.856399,40.89908,,46.5
68895,263,-73.951209,40.778495,260,-73.903715,40.74679,,26.0
68896,263,-73.951209,40.778495,261,-74.012919,40.708976,,36.5
68897,263,-73.951209,40.778495,262,-73.94583,40.776536,,11.5


In [55]:
df.drop(['lon_pickup','lat_pickup','lon_dropoff','lat_dropoff'],axis=1,inplace=True)

In [50]:
# Writing out the final dataset
df.to_csv('Data/Uber_final/uber_data.csv')

#### Trying to fix the NANs previously collected due to the API error

In [69]:
# Reading in the final dataset
uber_data = pd.read_csv('Data/Uber_final/uber_data.csv',index_col=0)

In [70]:
uber_data.head()

Unnamed: 0,LocationID_pickup,LocationID_dropoff,Pool_Price,Non_shared_Price
0,1,2,126.5,137.5
1,1,3,89.5,92.5
2,1,4,68.5,74.5
3,1,5,55.0,55.5
4,1,6,54.0,57.5


In [86]:
# Finding indexes of all rows with a NAN in them
# It will be in the pool price column ideally but still, not restricting the search to that in case Non-shared price has any NANs
uber_redo = list(uber_data[uber_data.isnull().any(axis=1)].index)

In [88]:
# Getting the lat-longs for the origin-destination combinations containing NANs to feed them into the API again
zones_remaining = zones.loc[uber_redo]

In [101]:
zones_remaining.head()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff
22530,87,-74.007812,40.706659,1,-74.176778,40.689515
22531,87,-74.007812,40.706659,2,-73.826141,40.625724
22532,87,-74.007812,40.706659,3,-73.849479,40.865871
22533,87,-74.007812,40.706659,4,-73.977024,40.724151
22534,87,-74.007812,40.706659,5,-74.189938,40.550339


In [103]:
price_pool = []
price_X = []
i= 0
while i < len(zones_remaining):
    start_time = time.time()
    for token in Server_tokens:
        session = Session(server_token = token)
        client = UberRidesClient(session)
        
        for index, row in zones_remaining[i:i+2000].iterrows():
#             print(index)
#             print(row)
            if index>len(zones): break
            pool,X = (get_Uber_price(start_latitude=row.lat_pickup, start_longitude=row.lon_pickup, \
                                                  end_latitude=row.lat_dropoff, end_longitude=row.lon_dropoff))
    #         if pool is None or X is None:
    #             time.sleep(3600.0 - ((time.time() - starttime) % 3600.0))
            price_pool.append(pool)
            price_X.append(X)
            if index%100==0: print(index)
            last = index
        i+=2000
    time.sleep(3600.0 - ((time.time() - start_time) % 3600.0))

22600
22700
22800
22900
23000
23400
23500
41200
41300
42000
42100
42200
42300
42400
42500
42600
42700
42800
42900
43300
43400
44300
44400
44500
48500
48600
48700
62100
62200
62300
62400
62500
62600
64200
64300
64400
65000
65100
65200
68200
68300
68400
68500
68600
68700
68800


In [105]:
uber_price = zones.loc[uber_redo]

In [106]:
last

68897

In [107]:
len(price_X)

4796

In [108]:
uber_price['Pool_Price'] = price_pool
uber_price['Non_shared_Price'] = price_X

In [109]:
uber_price.shape

(4796, 8)

In [110]:
uber_price.dropna().shape

(4796, 8)

#### Was able to get the price now which is surprising. API must have a few glitches I guess.

In [111]:
uber_price.tail()

Unnamed: 0,LocationID_pickup,lon_pickup,lat_pickup,LocationID_dropoff,lon_dropoff,lat_dropoff,Pool_Price,Non_shared_Price
68893,263,-73.951209,40.778495,258,-73.856646,40.69014,55.0,59.0
68894,263,-73.951209,40.778495,259,-73.856399,40.89908,44.5,48.0
68895,263,-73.951209,40.778495,260,-73.903715,40.74679,29.0,37.5
68896,263,-73.951209,40.778495,261,-74.012919,40.708976,26.5,29.5
68897,263,-73.951209,40.778495,262,-73.94583,40.776536,9.0,11.5


In [112]:
uber_data.loc[uber_redo] = uber_price

In [114]:
uber_data.tail()

Unnamed: 0,LocationID_pickup,LocationID_dropoff,Pool_Price,Non_shared_Price
68893,263,258,55.0,59.0
68894,263,259,44.5,48.0
68895,263,260,29.0,37.5
68896,263,261,26.5,29.5
68897,263,262,9.0,11.5


In [119]:
uber_data.dropna().shape

(68898, 4)

In [121]:
# Writing out the final dataset
uber_data.to_csv('Data/Uber_final/uber_data_final.csv')