# GeoProcessing:

## DBScan: Cluster Analysis of every user's location
## Speed Processing (Check if a user's reported speeds are even possible!)

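Using DBSCAN, find the spatial clusters relevant to each user (typically home, work, or school), then compute the time, distance, and speed between consecutive tweets so that physically impossible speeds can be spotted.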

In [1]:
import os, json, matplotlib, iso8601, sys, time, datetime, pytz
from shapely.geometry import shape
from multiprocessing import Pool, Manager;
import numpy as np; import pandas as pd; import geopandas as gpd
from dbscan_python import dbscan
from geodistance import geodistance
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Alphabetically sorted names of every entry in the zone-A tweet directory
# (presumably one .geojsonl file per user -- confirm against the export step).
zone_a_tweet_dir = '../working_data/all_tweets_from_zoneA_users/'
zone_a_entries = os.listdir(zone_a_tweet_dir)
users_with_tweets_in_zone_a = sorted(zone_a_entries)
len(users_with_tweets_in_zone_a)

21951

In [4]:
"""
    Input: User .geojsonl file
    Returns: DataFrame with speed & clusters

    #Process
    1. Loads all of a user's tweets (parseable json per line, parse, strip down to details, sort by time)
    2. Computes Clusters based on below EPS & MIN_PTS
    3. Computes time, distance, and speed between each tweet
    4. Puts it all into a DataFrame

"""

def worker_function(args, data_dir='../working_data/all_tweets_from_zoneA_users/',
                    eps=100, min_pts=3):
    """Load one user's tweets, cluster them with DBSCAN and compute speeds.

    Input:
        args     -- (user_geojsonl_file, q) tuple, packed into a single argument
                    so the function can be handed to Pool.map_async.
                    user_geojsonl_file is the name of a file inside data_dir
                    holding one parseable JSON tweet per line; q is a queue that
                    receives one status code per completed call.
        data_dir -- directory containing the per-user .geojsonl files.
        eps      -- DBSCAN max. distance for points in a cluster (documented as
                    meters by the original constant; the real unit depends on
                    dbscan_python -- TODO confirm).
        min_pts  -- DBSCAN minimum number of points in a cluster.

    Returns: GeoDataFrame of the user's tweets sorted by time, with the columns
    geometry, coords, date, text, user, cluster, geo_delta, time_delta and speed,
    or None when the user cannot be processed.

    Status codes put on q:
        0 -- a DataFrame is returned
        1 -- fewer than two tweets, so there is nothing to cluster or compare
        2 -- every cluster label is missing (None): DBSCAN could not cluster
    """
    user_geojsonl_file, q = args

    tweets = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(os.path.join(data_dir, user_geojsonl_file), 'r') as tweet_file:
        for line in tweet_file:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing in json.loads
                continue
            t = json.loads(line)
            tweets.append({
                'geometry': shape(t['geometry']),
                'coords'  : t['geometry']['coordinates'],
                'date'    : iso8601.parse_date(t['properties']['postedTime']),
                'text'    : t['properties']['body'],
                'user'    : t['properties']['actor']['preferredUsername']
            })

    # Zero or one tweet: no pair of tweets to measure a distance or speed between.
    if len(tweets) < 2:
        q.put(1)
        return None

    #Ensure we're sorted by time:
    tweets.sort(key=lambda t: t['date'])

    #Clustering: 2 x N matrix, first row = first coordinate, second row = second coordinate
    points = [t['coords'] for t in tweets]
    m = np.matrix([ [p[0] for p in points], [p[1] for p in points] ])
    clusters = dbscan.dbscan(m, eps, min_pts)

    #Iterate through, assign clusters & calculate distance to the previous tweet
    for idx, t in enumerate(tweets):
        t['cluster'] = clusters[idx]
        if idx>0:
            p1 = t['geometry']
            p2 = tweets[idx-1]['geometry']
            # presumably distanceHaversine returns kilometers first, hence *1000 for meters -- TODO confirm
            t['geo_delta'] = geodistance.distanceHaversine(p1.y,p1.x,p2.y,p2.x)[0]*1000

    # The first tweet has no 'geo_delta' key, so its cell is filled with NaN here.
    df = gpd.GeoDataFrame(tweets)

    # If clusters were all None, then return nothing, it couldn't cluster!
    if len(df.cluster.value_counts()) < 1:
        q.put(2)
        return None

    df['time_delta'] = df['date'].diff()
    # Vectorised float division: two tweets with the same timestamp give inf/NaN
    # here, whereas the former row-wise Python division could raise ZeroDivisionError
    # and abort the whole parallel run.
    elapsed_seconds = (df['time_delta'] / np.timedelta64(1, 's')).astype(float)
    df['speed'] = df['geo_delta'].astype(float) / elapsed_seconds
    q.put(0)
    return df

In [None]:
#Parallel runtime: run worker_function over every user file in 30 processes.
p = Pool(30)
m = Manager()
q = m.Queue()   # workers push one status code per finished user; its size is the progress counter

args = [(i, q) for i in users_with_tweets_in_zone_a]

try:
    result = p.map_async(worker_function, args)

    # monitor loop: report progress every 2 seconds until all workers are done
    while not result.ready():
        size = q.qsize()
        # load.log is overwritten each time, so it always holds the latest progress line
        with open("load.log",'w') as log:
            log.write("\rProcessed: {0}, {1:.3g}%\n".format(size, size/len(args)*100))
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/len(args)*100))
        time.sleep(2)

    # Re-raises here any exception that occurred inside a worker.
    values = result.get()
except BaseException:
    # A worker failed or the run was interrupted: stop the 30 processes instead
    # of leaving them alive in the kernel.
    p.terminate()
    raise
else:
    p.close()
finally:
    p.join()

# worker_function returns None for users it could not process
users = [i for i in values if i is not None]
nones = [i for i in values if i is None]

Processed: 3431, 15.6%

In [None]:
# Number of users for which worker_function returned a DataFrame (not None)
len(users)

In [None]:
# Sanity check: plot the first user's result to confirm the GeoDataFrame (and its geometry column) works
users[0].plot()

In [None]:
# Output directory for the per-user exports. exist_ok=True keeps this cell
# re-runnable: os.mkdir raised FileExistsError once the directory existed.
os.makedirs('../working_data/clustered_three_pts_with_speed_2/', exist_ok=True)

In [None]:
def safe_json_export(args, out_dir='../working_data/clustered_three_pts_with_speed_2/'):
    """Write one user's DataFrame to <out_dir>/<lower-cased user name>.geojson.

    Input:
        args    -- (df, q) tuple, packed into a single argument so the function
                   can be handed to Pool.map_async. df needs the columns date,
                   time_delta and user and a to_json() accepting ensure_ascii;
                   q is a queue that receives 1 once the file is written.
        out_dir -- directory the file is written to; it must already exist.

    Returns: None. The caller's DataFrame is not modified (a copy is converted).
    """
    df, q = args
    df = df.copy()

    # Replace the values that are converted below with plain strings / floats for the export.
    # NOTE(review): the literal 'Z' suffix claims UTC -- confirm the dates are UTC.
    df['date'] = df['date'].apply(lambda t: datetime.datetime.strftime(t,'%Y-%m-%dT%H:%M:%SZ'))
    df['time_delta'] = df['time_delta'] / np.timedelta64(1, 's')   # seconds as float

    # The file is named after the user of the first row
    uName = df['user'].iloc[0].lower()

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must be
    # explicit; the locale default can fail with UnicodeEncodeError on such text.
    with open(os.path.join(out_dir, uName + '.geojson'), 'w', encoding='utf-8') as oFile:
        oFile.write(df.to_json(ensure_ascii=False))
    q.put(1)

In [None]:
#Parallel runtime: export every user's DataFrame in 30 processes.
p = Pool(30)
m = Manager()
q = m.Queue()   # safe_json_export pushes 1 per written file; its size is the progress counter

args = [(i, q) for i in users]

try:
    result = p.map_async(safe_json_export, args)

    # monitor loop: report progress twice a second until all exports are done
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/len(args)*100))
        time.sleep(0.5)

    # Without this call an exception raised inside a worker would be silently
    # dropped and missing output files would go unnoticed.
    result.get()
except BaseException:
    # A worker failed or the run was interrupted: stop the processes instead of
    # leaving them alive in the kernel.
    p.terminate()
    raise
else:
    p.close()
finally:
    p.join()

Scrap Below... 

---

In [21]:
# Will Move this to temporal processing after we can serialize these dataframes better (DONE) :)
def check_before_and_after(df):
    """Return the rows of df posted inside the fixed landfall window, or None.

    Input:   df -- DataFrame with a timezone-aware 'date' column.
    Returns: the rows whose date is strictly after 2012-10-29 08:00 UTC and
             strictly before 2012-10-31 00:00 UTC, if more than one row
             qualifies; otherwise None.

    NOTE(review): the original comment labelled the window start "Noon", but
    08:00 UTC is not noon in UTC or US Eastern time -- confirm the intended hour.
    """
    utc = datetime.timezone.utc
    window_start = datetime.datetime(2012, 10, 29, 8, 0, 0, tzinfo=utc)
    window_end   = datetime.datetime(2012, 10, 31, 0, 0, 0, tzinfo=utc)

    # Both bounds are exclusive; one boolean mask instead of two chained filters
    in_window = (df['date'] > window_start) & (df['date'] < window_end)
    tweets_during_landfall = df[in_window]

    # A single tweet (or none) in the window is not returned
    if len(tweets_during_landfall) > 1:
        return tweets_during_landfall
    return None
# Smoke test on the first DataFrame.
# NOTE(review): `dfs` is not defined in any cell visible here -- presumably the
# list of per-user DataFrames; confirm before re-running on a fresh kernel.
check_before_and_after(dfs[0])

In [22]:
# One entry per DataFrame in dfs: its landfall-window rows, or None
atLandfall = list(map(check_before_and_after, dfs))

In [25]:
# Keep only the entries for which check_before_and_after returned rows
atLandfallReal = [x for x in atLandfall if x is not None]

In [29]:
# Largest result first: order by number of rows, descending
landfallers = sorted(atLandfallReal, key=len, reverse=True)