In [1]:
# notebook setup
import datetime as dt
import os
import sys

from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd

sys.path.insert(0, os.path.abspath('../..'))

import gps

InteractiveShell.ast_node_interactivity = 'all'

# bottom left, top right of the greater Seattle area
min_lat, min_lon = 47.324739, -122.440531
max_lat, max_lon = 48.100469, -121.850196

dbscan_p1_params = dict(
    min_samples=155,
    eps=.005,
    n_jobs=3
)

df = pd.read_csv('some_data.csv', parse_dates=['ts'])
df.head()

  return f(*args, **kwds)
  return f(*args, **kwds)


Welcome, Luke Waninger!

Welcome, Luke Waninger!



Unnamed: 0,lat,lon,ts
0,47.66133,-122.29343,2018-06-07 00:00:08.668
1,47.66123,-122.29655,2018-06-07 00:00:24.000
2,47.66125,-122.29673,2018-06-07 00:00:25.000
3,47.66087,-122.29904,2018-06-07 00:00:40.000
4,47.66067,-122.29951,2018-06-07 00:00:56.000


## The GSCAP GPS scripts 
provide a number of  miscellaneous, clustering, and api methods.

### Miscellaneous

#### `as_pydate(str)`
Use this method for converting a number of various string representations of dates (all forms in BRIGHTEN raw data) into a Python datetime data type.

In [2]:
str_dates = ['28-07-16 05:29:20', '28-07-16 09:45:18']

for sd in str_dates:
    d = gps.as_pydate(sd)
    print(type(d), d.isoformat())

<class 'datetime.datetime'> 2016-07-28T05:29:20
<class 'datetime.datetime'> 2016-07-28T09:45:18


#### `isnum(x)`
A common procedure when processing raw data is converting strings to numbers. This method verifies the given x can be converted to a float datatype. This is most useful when converting within a list comprehension.

In [3]:
s = ['1', '-1.', 'a', '5.4', '5.a']
t = [float(x) if gps.isnum(x) else np.nan for x in s]; t

[1.0, -1.0, nan, 5.4, nan]

#### `dd_from_zip(zipcode)` and `zip_from_dd(lat, lon)`
A mapping table of zipcodes to latitudes and longitudes is downloaded from the GSCAP source when the script begins. You can take advantage of these two conversion methods to go back and forth between the two. One thing to remember when using these conversion is that you may incur resolution loss. For example, an infinite amount of lat, lon pairs exist within any given zipcode boundary. When converting from lat, lon to zip you lose that resolution. When coming back from zip to lat, lon you'll get the center point for the given zipcode, which no longer has the resolution you would need for performing reliable clustering algorithms or API lookups.

In [4]:
zips = [98115, 97236, 47579]
dds = [gps.dd_from_zip(zc) for zc in zips]
dds

del zips
zips = [gps.zip_from_dd(*coordinate_pair) for coordinate_pair in dds]
zips

[(47.68575, -122.28159), (45.48321, -122.51051), (38.11094, -86.91513)]

[98115, 97236, 47579]

### Stream processing

A number of functions are available for preprocessing the GPS stream. All methods taking `gps_records` require the records to be given as a list of `gps.GPS` where as methods taking `records` need a Pandas dataframe containing three collumns: lat, lon, and ts where ts is the timestamp. `gps.GPS` is a namedtuple defined in gps.py and contains the same three fields: lat, lon, and ts.

#### `resample_gps_intervals(records)`
Currently, this method only resamples at one-minute intervals taking the mean of each lat, long within the interval. The timestamp returned is the minute for which the sampling occurred.

In [5]:
records = pd.DataFrame([
    (47.10, 122.10, dt.datetime.now()),
    (47.11, 122.11, dt.datetime.now()-dt.timedelta(seconds=15)),
    (47.12, 122.12, dt.datetime.now()-dt.timedelta(seconds=30)),
    (47.13, 122.13, dt.datetime.now()-dt.timedelta(seconds=65)),
], columns=['lat', 'lon', 'ts'])

gps.resample_gps_intervals(records)

Unnamed: 0,lat,lon,ts
0,47.13,122.13,2018-11-11 09:51:00
1,47.11,122.11,2018-11-11 09:52:00


#### `impute_stationary_coordinates(records, freq, metrics)` 
This method can be used to upsample the number of points in a stationary location. This is used to account for specific data collection methods that don't take samples if the research participant hasn't moved location. Additionally, this helps when using density based clustering techniques to find a participants frequently visited locations.
* freq - is a given interval as defined in the Pandas timeseries methods. Read more in the [Pandas Docs](https://pandas.pydata.org/pandas-docs/stable/timeseries.html)
* metrics - is a boolean value which tells the function whether or not to return the records with the velocity metrics used to locate stationary positions.

In [6]:
records = pd.DataFrame([
    (47.10, 122.10, dt.datetime.now()),
    (47.10, 122.10, dt.datetime.now()-dt.timedelta(minutes=30)),
], columns=['lat', 'lon', 'ts'])

gps.impute_stationary_coordinates(records, freq='10Min', metrics=False)
gps.impute_stationary_coordinates(records, freq='5Min', metrics=True)

Unnamed: 0,lat,lon,ts
0,47.1,122.1,2018-11-11 09:22:33.036198
1,47.1,122.1,2018-11-11 09:32:33.036198
2,47.1,122.1,2018-11-11 09:42:33.036198
3,47.1,122.1,2018-11-11 09:52:33.036190


Unnamed: 0,binning,displacement,lat,lon,time_delta,ts,velocity
0,stationary,,47.1,122.1,,2018-11-11 09:22:33.036198,
1,stationary,0.0,47.1,122.1,300.0,2018-11-11 09:27:33.036198,0.0
2,stationary,0.0,47.1,122.1,300.0,2018-11-11 09:32:33.036198,0.0
3,stationary,0.0,47.1,122.1,300.0,2018-11-11 09:37:33.036198,0.0
4,stationary,0.0,47.1,122.1,300.0,2018-11-11 09:42:33.036198,0.0
5,stationary,0.0,47.1,122.1,300.0,2018-11-11 09:47:33.036198,0.0
6,stationary,0.0,47.1,122.1,299.0,2018-11-11 09:52:33.036190,0.0


#### `geo_distance(lat1, lon1, lat2, lon2)` 
Computes the geographic distance in meters between two latitude, longitude pairs.

In [7]:
pair1 = (47.1, 122.1)
pair2 = (47.2, 122.2)

gps.geo_distance(*pair1, *pair2)

13447.27857197738

#### `geo_pairwise_distances(x, n_jobs)`
Accepts a list of coordinate pairs and computes the pairwise geographic distance between each pair. Set `n_jobs` to -1 to use all available cores.

In [8]:
x = [
    (47.111, 122), 
    (47.112, 122),
    (47.113, 122)
]

gps.geo_pairwise_distances(x, as_array=False, n_jobs=-1)
gps.geo_pairwise_distances(x, as_array=True, n_jobs=-1)

Unnamed: 0,lat1,lon1,lat2,lon2,distance
0,47.111,122,47.112,122,111.2
1,47.111,122,47.113,122,222.4
2,47.112,122,47.113,122,111.2


array([111.2, 111.2, 222.4])

#### `discrete_velocity(coordinate_a, coordinate_b)`
Calculate the velocity binning between two coordinate pairs.

In [9]:
now, td = dt.datetime.now(), dt.timedelta(minutes=1)
lat, lon = -33.8666199, 151.1958527
coordinate_pairs = []

a, b = (lat, lon, now), (lat + 0.0001, lon + 0.0001, now+td)
coordinate_pairs.append(a)
coordinate_pairs.append(b)
gps.discrete_velocity(a, b)

a, b = (lat, lon, now), (lat + 0.001, lon + 0.001, now+td)
coordinate_pairs.append(a)
coordinate_pairs.append(b)
gps.discrete_velocity(a, b)

a, b = (lat, lon, now), (lat + 0.01, lon + 0.01, now+td)
coordinate_pairs.append(a)
coordinate_pairs.append(b)
gps.discrete_velocity(a, b)

a, b = (lat, lon, now), (lat + 0.1, lon + 0.1, now+td)
coordinate_pairs.append(a)
coordinate_pairs.append(b)
gps.discrete_velocity(a, b)

a, b = (lat, lon, now), (lat + 1, lon + 1, now+td)
coordinate_pairs.append(a)
coordinate_pairs.append(b)
gps.discrete_velocity(a, b)

{'displacement': 14.5,
 'time_delta': 60,
 'velocity': 0.241,
 'binning': 'stationary'}

{'displacement': 144.5,
 'time_delta': 60,
 'velocity': 2.409,
 'binning': 'active'}

{'displacement': 1445.3,
 'time_delta': 60,
 'velocity': 24.089,
 'binning': 'powered_vehicle'}

{'displacement': 14456.5,
 'time_delta': 60,
 'velocity': 240.941,
 'binning': 'high_speed_transportation'}

{'displacement': 144872.1,
 'time_delta': 60,
 'velocity': 2414.535,
 'binning': 'anomaly'}

#### `process_velocities(gps_records)`
Use this to calculate distance, time_deltas, and velocities - continuous and semantic - between a list of gps records.

In [10]:
records = pd.DataFrame(coordinate_pairs, columns=['lat', 'lon', 'ts'])
gps.process_velocities(records)

Unnamed: 0,lat,lon,ts,binning,displacement,time_delta,velocity
0,-33.86662,151.195853,2018-11-11 09:52:34.294985,stationary,,,
1,-33.86652,151.195953,2018-11-11 09:53:34.294985,stationary,14.5,60.0,0.241
2,-33.86662,151.195853,2018-11-11 09:52:34.294985,stationary,14.5,60.0,0.241
3,-33.86562,151.196853,2018-11-11 09:53:34.294985,active,144.5,60.0,2.409
4,-33.86662,151.195853,2018-11-11 09:52:34.294985,active,144.5,60.0,2.409
5,-33.85662,151.205853,2018-11-11 09:53:34.294985,powered_vehicle,1445.3,60.0,24.089
6,-33.86662,151.195853,2018-11-11 09:52:34.294985,powered_vehicle,1445.3,60.0,24.089
7,-33.76662,151.295853,2018-11-11 09:53:34.294985,high_speed_transportation,14456.5,60.0,240.941
8,-33.86662,151.195853,2018-11-11 09:52:34.294985,high_speed_transportation,14456.5,60.0,240.941
9,-32.86662,152.195853,2018-11-11 09:53:34.294985,anomaly,144872.1,60.0,2414.535


### Clustering

#### `gps_dbscan(gps_records, parameters)`
This function is used to perform the unsupervised clustering algorithm DBSCAN and the provided records. Parameters should be a dict and contain both min_samples and eps. See the [scikit-learn docs](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) for more information.

In [11]:
gps_records = [
    gps.GPS(t.lat, t.lon, t.ts) for t in df.itertuples()
]

labels, clusters = gps.gps_dbscan(gps_records, dbscan_p1_params)

labels
pd.DataFrame(clusters)

array([-1, -1, -1, ...,  0,  0,  0])

Unnamed: 0,cid,lat,lat_IQR,lat_max,lat_min,lat_range,lat_std,lon,lon_IQR,lon_max,lon_min,lon_range,lon_std,max_distance_from_center
0,0,47.67723,0.00018,47.68021,47.6695,0.01071,0.00144,-122.31961,0.00241,-122.31308,-122.32822,0.01514,0.00117,884.632
1,1,45.5646,0.00061,45.56508,45.56088,0.0042,0.00043,-122.53376,0.00036,-122.53304,-122.53645,0.00341,0.00037,464.019
2,2,45.49465,0.00053,45.50092,45.48952,0.0114,0.00098,-122.47981,0.00041,-122.47438,-122.48696,0.01258,0.00156,776.458


#### `estimate_home_location(records, parameters)`
This method takes the same parameters as gps_dbscan above but only uses the points with timestamps between midnight and 6am for the clustering. Returns two variables, the first is the cluster and second is the list of indices assigned to the home location.

In [12]:
home, idx = gps.estimate_home_location(df, dbscan_p1_params)

pd.DataFrame(home, index=[1])
idx[:10]

Unnamed: 0,lat,lon,cid,lat_range,lat_IQR,lat_min,lat_max,lat_std,lon_range,lon_IQR,lon_min,lon_max,lon_std,max_distance_from_center
1,47.67723,-122.31961,0,0.01071,0.00018,47.6695,47.68021,0.00144,0.01514,0.00241,-122.32822,-122.31308,0.00117,884.632


[582, 583, 584, 585, 586, 587, 588, 589, 590, 591]

#### `estimate_work_location(records, parameters)`
This method takes the same parameters as gps_dbscan above but only uses the points with timestamps between 9am and 5pm during weekdays for the clustering. Returns two variables, the first is the cluster and second is the list of indices assigned to the work location.

In [13]:
work, idx = gps.estimate_work_location(df, dbscan_p1_params)

pd.DataFrame(work, index=[1])
idx[:10]

Unnamed: 0,lat,lon,cid,lat_range,lat_IQR,lat_min,lat_max,lat_std,lon_range,lon_IQR,lon_min,lon_max,lon_std,max_distance_from_center
1,47.67764,-122.31958,0,0.00353,0.00042,47.67441,47.67794,0.00045,0.0046,0.00145,-122.32264,-122.31804,0.00034,369.511


[601, 602, 603, 604, 605, 606, 607, 608, 609, 610]

#### `get_clusters_with_context(records, parameters)`
Again, this takes the same arguments as above.  This method partitions the records into three sets - home, work, and everything else. - and performs the separate clustering on each partition. Two variables are returned, the records supplied with an additional row assigning cluster cids and the clusters themselves.

In [14]:
records = gps.process_velocities(df)
records, clusters = gps.get_clusters_with_context(records, dbscan_p1_params)

records.head()
clusters

Unnamed: 0,binning,cid,displacement,lat,lon,time_delta,ts,velocity,distance_from_home
585,stationary,home,26.7,47.67695,-122.32147,122.0,2018-06-07 08:18:05.846,0.219,152.912
587,stationary,home,42.3,47.6779,-122.31966,121.0,2018-06-07 08:22:07.862,0.349,58.635
588,stationary,home,14.5,47.67777,-122.31966,133.0,2018-06-07 08:24:21.799,0.109,44.445
589,stationary,home,4.0,47.67774,-122.31963,121.0,2018-06-07 08:26:22.830,0.033,40.724
590,stationary,home,0.0,47.67774,-122.31963,161.0,2018-06-07 08:29:04.764,0.0,40.724


Unnamed: 0,cid,lat,lon,name,categories
0,x0,45.49469,-122.47984,nap,nap
1,home,47.67738,-122.31953,home,home
2,work,47.67769,-122.31957,work,work


####  `get_next_phase_clusters(records, clusters, params)`
To find clusters with less prominence you may want to do a second round of clustering with the remaining unlabeled, stationary points. 

In [15]:
dbscan_p2_params = dict(
    min_samples=75,
    eps=.01,
    n_jobs=3
)

records, clusters = gps.get_next_phase_clusters(records, clusters, dbscan_p2_params, min_distance=1)
clusters

Unnamed: 0,cid,lat,lon,name,categories
0,x0,45.49469,-122.47984,nap,nap
1,home,47.67738,-122.31953,home,home
2,work,47.67769,-122.31957,work,work
0,x1,47.54944,-121.99346,nap,nap
0,x2,45.51253,-122.67882,nap,nap
0,x3,45.50082,-122.67771,nap,nap
0,x4,45.5646,-122.53374,nap,nap


### Place lookup

The place methods are currently built to allow calls to either Yelp or Google for contextual lookup. Both are processed via the same call and require either to be given with in the form of a list of `gps.PlaceRequest`. Each request is cached to a backend sqlite file so that the same request doesn't have to be submitted more than once. Results are returned a dictionary containing three keys:
1. `hits` tells you how many of your requests were returned from the cache
2. `misses` tells you how many went through the API endpoint 
3. `request` is a dataframe of the resulting places

An API key must be supplied with each request as they are never cached within the GPS module.

An enum is implemented within the gps module that allows easy selection of an API source. The possibilities are: 
1. `gps.ApiSource.YELP` 
2. `gps.ApiSource.GOOGLE'

With either, a value for `rankby` should also be supplied.  For Google, only prominence is currently implemented - `gps.GmapsRankBy.PROMINENCE`. A number of options are available for Yelp:
* `gps.YelpRankBy.BEST_MATCH`
* `gps.YelpRankBy.RATING`
* `gps.YelpRankBy.REVIEW_COUNT`
* `gps.YelpRankBy.DISTANCE`

#### `request_nearby_places(request, n_jobs, force, progress_qu)`
* `n_jobs`, as always, specificies the number of cores to use of simultaneous calls.
* set `force` to true in order to make the request and overwrite whatever may exist in the cache.
* supply a multiprocessing.Queue from the utils.progress_qu method as the progress_qu in order to track completion.

In [16]:
yelp_key = ''

# setup a list of requests to query the nearby places from 
# Yelp within a range of 50 meters and order by the BEST_MATCH
request = [
    gps.PlaceRequest(
        lat=t.lat, lon=t.lon, radius=50, 
        source=gps.ApiSource.YELP,
        rankby=gps.YelpRankBy.BEST_MATCH,
        key=yelp_key
    )
    for t in clusters.loc[
        (clusters.cid != 'home') & (clusters.cid != 'work')
    ].itertuples()
]

results = gps.request_nearby_places(request)
print(f'hits: {results["hits"]}, misses: {results["misses"]}')
results['request']

hits: 5, misses: 0


Unnamed: 0,dtRetrieved,lat,lon,radius,source,name,rank_order,categories,major_categories
0,2018-11-11 09:52:43.704567,45.49469,-122.47984,50.0,Yelp,not found,-1.0,none,none
1,2018-11-11 09:52:43.708266,47.54944,-121.99346,50.0,Yelp,not found,-1.0,none,none
2,2018-11-11 09:52:43.717772,45.51253,-122.67882,50.0,Yelp,Keller Fountain Park,0.0,parks,park
3,2018-11-11 09:52:43.720934,45.50082,-122.67771,50.0,Yelp,not found,-1.0,none,none
4,2018-11-11 09:52:43.729173,45.5646,-122.53374,50.0,Yelp,Starbucks,0.0,coffee,dining_out


In [19]:
gmaps_key = ''

# setup a list of requests to query the nearby places from 
# Google within a range of 50 meters and order by the PROMINENCE
request = [
    gps.PlaceRequest(
        lat=t.lat, lon=t.lon, radius=50, 
        source=gps.ApiSource.GMAPS,
        rankby=gps.GmapsRankBy.PROMINENCE,
        key=gmaps_key
    )
    for t in clusters.loc[
        (clusters.cid != 'home') & (clusters.cid != 'work')
    ].itertuples()
]

results = gps.request_nearby_places(request)

print(f'hits: {results["hits"]}, misses: {results["misses"]}')
results['request']

hits: 5, misses: 0


Unnamed: 0,dtRetrieved,lat,lon,radius,source,name,rank_order,categories,major_categories
0,2018-11-11 09:54:04.076878,45.49469,-122.47984,50.0,Google Places,Peregrin Co,1.0,finance,finance
1,2018-11-11 09:54:04.080093,47.54944,-121.99346,50.0,Google Places,not found,-1.0,none,none
2,2018-11-11 09:54:04.087619,45.51253,-122.67882,50.0,Google Places,Keller Auditorium,1.0,other,other
3,2018-11-11 09:54:04.090122,45.50082,-122.67771,50.0,Google Places,not found,-1.0,none,none
4,2018-11-11 09:54:04.096825,45.5646,-122.53374,50.0,Google Places,Specialty Building Products NW,1.0,general_contractor,repair


### Clustering and daily features

#### `get_cluster_times(records, clusters)`
Calculates daily entry and exit times a research participant makes into their defined clusters.

In [20]:
entries = gps.get_cluster_times(records, clusters)
entries.head()

Unnamed: 0,cid,lat,lon,time_in,time_out,duration,midpoint,date,tod,tod_bin,cname,category
0,x1,47.54944,-121.99346,2018-06-07 04:13:16.587,2018-06-07 07:23:47.797,03:10:31.210000,2018-06-07 05:48:32.192000,2018-06-07,05:48:32.192000,early_morning,nap,nap
1,home,47.67738,-122.31953,2018-06-07 08:22:07.862,2018-06-07 09:00:00.009,00:37:52.147000,2018-06-07 08:41:03.935500,2018-06-07,08:41:03.935500,early_morning,home,home
2,work,47.67769,-122.31957,2018-06-07 09:05:34.984,2018-06-07 09:07:16.824,00:01:41.840000,2018-06-07 09:06:25.904000,2018-06-07,09:06:25.904000,morning,work,work
3,home,47.67738,-122.31953,2018-06-07 09:52:56.891,2018-06-07 10:32:11.729,00:39:14.838000,2018-06-07 10:12:34.310000,2018-06-07,10:12:34.310000,morning,home,home
4,work,47.67769,-122.31957,2018-06-07 11:15:03.200,2018-06-07 11:53:16.910,00:38:13.710000,2018-06-07 11:34:10.055000,2018-06-07,11:34:10.055000,morning,work,work


#### `get_daily_metrics(records, entries)`
Uses the records set and  daily features

In [21]:
daily_metrics = gps.get_daily_metrics(records, entries)
daily_metrics.head()

Unnamed: 0,came_to_work,date,distance_brunch,distance_high_speed_transportation,distance_powered_vehicle,distance_walking,hours_accounted_for,hours_at_home,hours_at_work,hours_brunch,...,hours_of_sleep,hours_powered_vehicle,hours_spent_in_top_3_clusters,hours_stationary,hours_stationary_non_home_work,hours_walking,hours_without_data,location_variance,max_distance_from_home,number_of_clusters
0,True,2018-06-07,0.0,34906.5,339727.4,3752.0,24,2.745,1.673,0.0,...,,3.521,0.0,23.612,7.11,1.024,0,-1.081595,73764.184,3
1,True,2018-06-08,0.0,0.0,12224.0,1515.0,23,7.686,0.597,0.0,...,0.6,0.335,0.0,30.377,4.865,0.393,1,-3.872976,3090.939,2
2,False,2018-06-09,0.0,0.0,1001.1,226.0,21,18.689,0.0,0.0,...,0.2,0.014,0.0,23.951,0.0,0.064,3,-6.115475,547.313,1
3,False,2018-06-10,0.0,0.0,276.3,2297.0,17,22.781,0.0,0.0,...,0.317,0.006,0.0,23.359,0.0,0.551,7,-5.467346,758.101,1
4,True,2018-06-11,0.0,3986.0,78507.9,476.0,22,12.278,1.784,0.0,...,,1.045,0.0,30.463,1.775,0.127,2,-1.520118,32636.472,2
