In [114]:
import pandas as pd
import numpy as np

data = pd.read_csv('input/train.csv', nrows = 5_000_000, 
                  usecols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'fare_amount'])

data = data.dropna()

# Remove latitude and longtiude outliers
data = data.loc[data['pickup_latitude'].between(40, 42)]
data = data.loc[data['pickup_longitude'].between(-75, -72)]
data = data.loc[data['dropoff_latitude'].between(40, 42)]
data = data.loc[data['dropoff_longitude'].between(-75, -72)]

In [115]:
data_rounded = data.round(3).groupby(['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'])['passenger_count'].\
                            count().reset_index().\
                            drop(columns = 'passenger_count')

len(data_rounded)

2793390

In [116]:
data_rounded['pickup'] = data_rounded['pickup_latitude'].astype(str) + "," + data_rounded['pickup_longitude'].astype(str)
data_rounded['dropoff'] = data_rounded['dropoff_latitude'].astype(str) + "," + data_rounded['dropoff_longitude'].astype(str)
data_rounded['l'] = data_rounded['pickup'] + "|" + data_rounded['dropoff']
data_rounded.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup,dropoff,l
0,40.034,-74.276,40.034,-74.276,"40.034,-74.276","40.034,-74.276","40.034,-74.276|40.034,-74.276"
1,40.053,-74.968,40.041,-74.96,"40.053,-74.968","40.041,-74.96","40.053,-74.968|40.041,-74.96"
2,40.054,-73.017,40.057,-72.987,"40.054,-73.017","40.057,-72.987","40.054,-73.017|40.057,-72.987"
3,40.057,-72.963,40.087,-73.036,"40.057,-72.963","40.087,-73.036","40.057,-72.963|40.087,-73.036"
4,40.061,-73.532,40.065,-73.574,"40.061,-73.532","40.065,-73.574","40.061,-73.532|40.065,-73.574"


In [117]:
import googlemaps

with open('/home/ec2-user/keys/gmap.txt', 'r') as f:
    gmap_key = str(f.read()).strip()
    
# Authenticate with google maps
gmaps = googlemaps.Client(key=gmap_key)

In [118]:
from tqdm import tqdm_notebook

tqdm.pandas()

def row_proc(l):
    pickup, dropoff = l.split('|')
    geocode_result = gmaps.distance_matrix(pickup,dropoff)
    #print (geocode_result)
    try:
        distance = float(geocode_result['rows'][0]['elements'][0]['distance']['text'].split()[0])
        duration = geocode_result['rows'][0]['elements'][0]['duration']['text'].split()
        
        if len(duration)==4:
            mins = float(duration[0])*60 + float(duration[2])
        
        elif len(duration) > 4:
            print(geocode_result)
            
        else:
            mins = float(duration[0])
    except:
        mins = np.nan
        distance = np.nan
        
    try:
        origin = geocode_result['origin_addresses']
        dest = geocode_result['destination_addresses']
    
    except:
        origin = np.nan
        distance = np.nan
        
    return (distance, mins, origin, dest)

In [120]:
pickup, dropoff = data_rounded.loc[1100, 'l'].split('|')
geocode_result = gmaps.distance_matrix(pickup, dropoff)

geocode_result

{'destination_addresses': ['2565 Ocean Pkwy, Brooklyn, NY 11235, USA'],
 'origin_addresses': ['551 Avenue Y, Brooklyn, NY 11235, USA'],
 'rows': [{'elements': [{'distance': {'text': '0.4 km', 'value': 397},
     'duration': {'text': '2 mins', 'value': 107},
     'status': 'OK'}]}],
 'status': 'OK'}

In [121]:
pickup

'40.589,-73.965'

In [122]:
dropoff

'40.587,-73.966'

In [129]:
row_proc(data_rounded.loc[15000, 'l'])

(29.8,
 44.0,
 ['Terminal 1, 1 J F K Airport, Jamaica, NY 11430, USA'],
 ['525 E 13th St, New York, NY 10009, USA'])

In [130]:
data_rounded.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup,dropoff,l
0,40.034,-74.276,40.034,-74.276,"40.034,-74.276","40.034,-74.276","40.034,-74.276|40.034,-74.276"
1,40.053,-74.968,40.041,-74.96,"40.053,-74.968","40.041,-74.96","40.053,-74.968|40.041,-74.96"
2,40.054,-73.017,40.057,-72.987,"40.054,-73.017","40.057,-72.987","40.054,-73.017|40.057,-72.987"
3,40.057,-72.963,40.087,-73.036,"40.057,-72.963","40.087,-73.036","40.057,-72.963|40.087,-73.036"
4,40.061,-73.532,40.065,-73.574,"40.061,-73.532","40.065,-73.574","40.061,-73.532|40.065,-73.574"


## Time How Long it Takes for 100 records

In [131]:
%%timeit -n 3 -r 1
data_rounded.loc[:99, 'l'].apply(lambda x: row_proc(x))

7.82 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)


## Multithreading

In [132]:
import threading
from queue import Queue
import requests
import bs4
from timeit import default_timer as timer

### Time Multithreading with 100 threads

In [133]:
%%timeit -n 3 -r 1

print_lock = threading.Lock()

l_list = list(data_rounded.loc[:99, 'l'])

# Create a new queue
l_queue = Queue()

# List to hold results
run_l = []
distances = []
durations = []
origins = []
dests = []

# Function to take an element from the queue and execute task
def process_queue():
    while True:
        # Get next element from the queue
        current_l = l_queue.get()
        # Record the coordinates
        run_l.append(current_l)
        
        # Run the function
        r = row_proc(current_l)
        
        # Record the results
        distances.append(r[0])
        durations.append(r[1])
        origins.append(r[2])
        dests.append(r[3])
        
        # Signal to the queue that the task is done
        l_queue.task_done()

# Start 100 threads
for i in range(100):
    t = threading.Thread(target = process_queue)
    # Set daemon to be true
    t.daemon = True
    t.start()
    
# Put each element of the list on the queue
for current_l in l_list:
    l_queue.put(current_l)

# Execture the queue
l_queue.join()

end = timer()

1.07 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)


A seven time speed-up!

In [135]:
len(data_rounded) * (1 / 100) / 3600

7.759416666666667

#### Multithreading for real

In [137]:
import time
for i in range(10):
    time.sleep(1)
    print(f'{100 * (i / 10)}% complete', end = '\r')

90.0% complete

In [None]:
start = timer()

print_lock = threading.Lock()

l_list = list(data_rounded['l'])

# Create a new queue
l_queue = Queue()

# List to hold results
run_l = []
distances = []
durations = []
origins = []
dests = []
TRACKER = 0

# Function to take an element from the queue and execute task
def process_queue():
    while True:
        global TRACKER
        TRACKER += 1
        # Get next element from the queue
        current_l = l_queue.get()
        # Record the coordinates
        run_l.append(current_l)
        
        # Run the function
        r = row_proc(current_l)
        
        # Record the results
        distances.append(r[0])
        durations.append(r[1])
        origins.append(r[2])
        dests.append(r[3])
        
        if TRACKER % 100000 == 0:
            time_per_record = (timer() - start) / TRACKER 
            print(f'{round(100 * (TRACKER / len(data_rounded)), 2)}% complete.', end = '\r')
            estimated_time = ((len(data_rounded) - TRACKER) * time_per_record) / 3600
            print(f'Estimated time remaining: {estimated_time} hours.', end = '\r')
            
        # Signal to the queue that the task is done
        l_queue.task_done()

# Start 100 threads
for i in range(100):
    t = threading.Thread(target = process_queue)
    # Set daemon to be true
    t.daemon = True
    t.start()
    
# Put each element of the list on the queue
for current_l in l_list:
    l_queue.put(current_l)

# Execture the queue
l_queue.join()

end = timer()

7.159759288892707% complete..

In [125]:
test = pd.read_csv('input/test.csv', 
                  usecols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 
                             'dropoff_longitude', 'passenger_count'])
test_rounded = test.round(3).groupby(['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'])['passenger_count'].\
                            count().reset_index().\
                            drop(columns = 'passenger_count')

len(test_rounded)

9870

In [127]:
test_rounded['pickup'] = test_rounded['pickup_latitude'].astype(str) + "," + test_rounded['pickup_longitude'].astype(str)
test_rounded['dropoff'] = test_rounded['dropoff_latitude'].astype(str) + "," + test_rounded['dropoff_longitude'].astype(str)
test_rounded['l'] = test_rounded['pickup'] + "|" + test_rounded['dropoff']
test_rounded.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup,dropoff,l
0,40.573,-74.221,40.569,-74.217,"40.573,-74.221","40.569,-74.217","40.573,-74.221|40.569,-74.217"
1,40.583,-74.252,40.588,-74.263,"40.583,-74.252","40.588,-74.263","40.583,-74.252|40.588,-74.263"
2,40.588,-73.974,40.593,-73.973,"40.588,-73.974","40.593,-73.973","40.588,-73.974|40.593,-73.973"
3,40.605,-73.98,40.603,-73.973,"40.605,-73.98","40.603,-73.973","40.605,-73.98|40.603,-73.973"
4,40.623,-73.988,40.623,-73.988,"40.623,-73.988","40.623,-73.988","40.623,-73.988|40.623,-73.988"


In [128]:
row_proc(test_rounded.loc[1, 'l'])

(2.0,
 4.0,
 ['443 Federal Blvd, Carteret, NJ 07008, USA'],
 ['1392 Rahway Ave, Avenel, NJ 07001, USA'])

In [88]:
print(f'{round(end - start, 2)} seconds elapsed.')

31.57 seconds elapsed.


In [86]:
len(test_rounded)

2995

In [82]:
test_rounded.loc[:10]

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup,dropoff,distance,duration,l
0,40.57,-74.22,40.57,-74.22,"40.57,-74.22","40.57,-74.22",1.0,1.0,"40.57,-74.22|40.57,-74.22"
1,40.58,-74.25,40.59,-74.26,"40.58,-74.25","40.59,-74.26",3.6,8.0,"40.58,-74.25|40.59,-74.26"
2,40.59,-73.97,40.59,-73.97,"40.59,-73.97","40.59,-73.97",1.0,1.0,"40.59,-73.97|40.59,-73.97"
3,40.6,-73.98,40.6,-73.97,"40.6,-73.98","40.6,-73.97",1.5,6.0,"40.6,-73.98|40.6,-73.97"
4,40.62,-73.99,40.62,-73.99,"40.62,-73.99","40.62,-73.99",1.0,1.0,"40.62,-73.99|40.62,-73.99"
5,40.63,-73.98,40.63,-73.98,"40.63,-73.98","40.63,-73.98",1.0,1.0,"40.63,-73.98|40.63,-73.98"
6,40.63,-73.97,40.63,-73.98,"40.63,-73.97","40.63,-73.98",1.8,9.0,"40.63,-73.97|40.63,-73.98"
7,40.63,-73.95,40.59,-73.93,"40.63,-73.95","40.59,-73.93",5.8,17.0,"40.63,-73.95|40.59,-73.93"
8,40.63,-73.9,40.64,-73.9,"40.63,-73.9","40.64,-73.9",1.8,5.0,"40.63,-73.9|40.64,-73.9"
9,40.63,-73.88,40.63,-73.89,"40.63,-73.88","40.63,-73.89",1.2,5.0,"40.63,-73.88|40.63,-73.89"


In [83]:
%%timeit -n 3 -r 2
test_rounded.loc[:9]['l'].apply(lambda x: row_proc(x))

568 ms ± 17.6 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [74]:
distances

[1.0, 1.5, 3.6, 1.0, 1.0, 1.8, 1.0, 1.8, 5.8, 1.2]

In [75]:
durations

[1.0, 6.0, 8.0, 1.0, 1.0, 9.0, 1.0, 5.0, 17.0, 5.0]

In [76]:
dests

[['2420 West St, Brooklyn, NY 11223, USA'],
 ['2027 E 2nd St, Brooklyn, NY 11223, USA'],
 ['3 Terminal Way, Avenel, NJ 07001, USA'],
 ['Waterfront Fitness Trail, Carteret, NJ 07008, USA'],
 ['1774 64th St, Brooklyn, NY 11204, USA'],
 ['1762 46th St, Brooklyn, NY 11204, USA'],
 ['1762 46th St, Brooklyn, NY 11204, USA'],
 ['1243 E 95th St, Brooklyn, NY 11236, USA'],
 ['17 Lacon Ct, Brooklyn, NY 11229, USA'],
 ['1806 E 92nd St, Brooklyn, NY 11236, USA']]