# GeoProcessing:

## DBScan: Cluster Analysis of every user's location
## Speed Processing (Check if a user's reported speeds are even possible!)

Using DBScan, find spatial clusters relevant to each user, typically home, work, or school

In [1]:
import os, json, matplotlib, iso8601
from shapely.geometry import shape
from multiprocessing import Pool; 
import numpy as np; import pandas as pd; import geopandas as gpd
from dbscan_python import dbscan
from geodistance import geodistance
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
users_with_tweets_in_zone_a = sorted(os.listdir('../working_data/all_tweets_from_zoneA_users/'))
len(users_with_tweets_in_zone_a)

21951

In [3]:
"""
    Input: User .geojsonl file
    Returns: DataFrame with speed

    #Process
    1. Loads all of a user's tweets (parseable json per line, parse, strip down to details, sort by time)
    3. Computes time, distance, and speed between each tweet
    4. Puts it all into a DataFrame
"""

def worker_function(user_geojsonl_file):
#     EPS     = 100     #Max. Distance for points in the cluster...                                                                                                                   
#     MIN_PTS = 5       #Minimum Points in a cluster  
    tweets = []
    for line in open('../working_data/all_tweets_from_zoneA_users/'+user_geojsonl_file,'r'):
        t = json.loads(line.strip())
        stripped = {
                'geometry': shape(t['geometry']),
                'coords'  : t['geometry']['coordinates'],
                'date'    : iso8601.parse_date(t['properties']['postedTime']),
                'text'    : t['properties']['body'],
                'user'    : t['properties']['actor']['preferredUsername']
            }
        tweets.append(stripped)
    if len(tweets)==1:
        return None
    #Ensure we're sorted by time:
    tweets.sort(key=lambda t: t['date'])

#     #Clustering:
#     points = [t['coords'] for t in tweets]
#     m = np.array([ [p[1] for p in points], [p[0] for p in points] ])
#     clusters = dbscan.dbscan(m, EPS, MIN_PTS)
    
#     #Iterate through, assign clusters & calculate distance
#     for idx, t in enumerate(tweets):
#         t['cluster'] = clusters[idx]
#         if idx>0:
#             p1 = t['geometry']
#             p2 = tweets[idx-1]['geometry']
#             t['geo_delta'] = geodistance.distanceHaversine(p1.y,p1.x,p2.y,p2.x)[0]*1000
    
    #Now we can finally do the dataframe bullshit...
    df = gpd.GeoDataFrame(tweets)
    df['time_delta'] = df['date'].diff()
    df['speed'] = df.apply(lambda row: row['geo_delta']/ (row['time_delta'] / np.timedelta64(1, 's')), axis=1)
    return df

In [5]:
#Parallel runtime
from __future__ import division    
pool = Pool(24)
for i, _ in enumerate(pool.imap_unordered(worker_function, users_with_tweets_in_zone_a), 1):
    if i%100==0:
        print(str(i)+"..",end="")
pool.close()
pool.join()

KeyError: ('geo_delta', 'occurred at index 0')

Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/util.py", line 185, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/multiprocessing/pool.py", line 531, in _terminate_pool
    outqueue.put(None)                  # sentinel
  File "/usr/lib/python3.4/multiprocessing/queues.py", line 366, in put
    with self._wlock:
  File "/usr/lib/python3.4/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt: 
Process ForkPoolWorker-34:
Process ForkPoolWorker-46:
Process ForkPoolWorker-43:
Process ForkPoolWorker-44:
Process ForkPoolWorker-37:
Process ForkPoolWorker-36:
Process ForkPoolWorker-40:
Process ForkPoolWorker-45:
Process ForkPoolWorker-25:
Process ForkPoolWorker-35:
Process ForkPoolWorker-31:
Process ForkPoolWorker-39:
Process ForkPoolWorker-41:
Process ForkPoolWorker-32:
Process ForkPoolWorker-30:
Process ForkPoolWor

In [None]:
user_dfs = res.get()

In [None]:
len(user_dfs)

In [None]:
dfs = [df for df in user_dfs if type(df) != type(None) ]

In [None]:
len(dfs)

In [None]:
# Will Move this to temporal processing after we can serialize these dataframes better :) 
def check_before_and_after(df):
    _before   = datetime.datetime(2012,10,29,8,0,0,tzinfo=pytz.UTC) #Noon
    _landfall = datetime.datetime(2012,10,30,0,0,0,tzinfo=pytz.UTC) #8pm EST
    _after    = datetime.datetime(2012,10,31,0,0,0,tzinfo=pytz.UTC) #8pm EST
    
    tweets_during_landfall_p1 = df[df['date' > _before]]
    tweets_during_landfall    = tweets_during_landfall_p1[tweets_during_landfall_p1['date'] < _after]
    
    if len(tweets_during_landfall) > 1:
        return tweets_during_landfall
    else:
        return None
check_before_and_after(dfs[0])

In [None]:
#Run it all in parallel?
#http://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
# pool = Pool(24)
# res = pool.map_async(check_before_and_after, dfs)
# pool.close()
# pool.join()

In [None]:
user_dfs = res2.get()
dfs = [df for df in user_dfs if type(df) != type(None) ]

In [None]:
# Get these usernames into a CSV 
# default_handler #should be able to call the .to_json() method with this argument to serialize the dates?