# GeoProcessing:

## DBScan: Cluster Analysis of every user's location
## Speed Processing (Check if a user's reported speeds are even possible!)

Using DBScan, find spatial clusters relevant to each user, typically home, work, or school

In [239]:
import os, json, matplotlib, iso8601, sys
from shapely.geometry import shape
from multiprocessing import Pool; 
import numpy as np; import pandas as pd; import geopandas as gpd
from dbscan_python import dbscan
from geodistance import geodistance
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
users_with_tweets_in_zone_a = sorted(os.listdir('../working_data/all_tweets_from_zoneA_users/'))
len(users_with_tweets_in_zone_a)

21951

In [4]:
"""
    Input: User .geojsonl file
    Returns: DataFrame with speed & clusters

    #Process
    1. Loads all of a user's tweets (parseable json per line, parse, strip down to details, sort by time)
    2. Computes Clusters based on below EPS & MIN_PTS
    3. Computes time, distance, and speed between each tweet
    4. Puts it all into a DataFrame
"""

def worker_function(user_geojsonl_file):
    EPS     = 100     #Max. Distance for points in the cluster...                                                                                                                   
    MIN_PTS = 5       #Minimum Points in a cluster  
    tweets = []
    for line in open('../working_data/all_tweets_from_zoneA_users/'+user_geojsonl_file,'r'):
        t = json.loads(line.strip())
        stripped = {
                'geometry': shape(t['geometry']),
                'coords'  : t['geometry']['coordinates'],
                'date'    : iso8601.parse_date(t['properties']['postedTime']),
                'text'    : t['properties']['body'],
                'user'    : t['properties']['actor']['preferredUsername']
            }
        tweets.append(stripped)
    if len(tweets)==1:
        return None
    #Ensure we're sorted by time:
    tweets.sort(key=lambda t: t['date'])

    #Clustering:
    points = [t['coords'] for t in tweets]
    m = np.array([ [p[1] for p in points], [p[0] for p in points] ])
    clusters = dbscan.dbscan(m, EPS, MIN_PTS)
    
    #Iterate through, assign clusters & calculate distance
    for idx, t in enumerate(tweets):
        t['cluster'] = clusters[idx]
        if idx>0:
            p1 = t['geometry']
            p2 = tweets[idx-1]['geometry']
            t['geo_delta'] = geodistance.distanceHaversine(p1.y,p1.x,p2.y,p2.x)[0]*1000
    
    #Now we can finally do the dataframe bullshit...
    df = gpd.GeoDataFrame(tweets)
    df['time_delta'] = df['date'].diff()
    df['speed'] = df.apply(lambda row: row['geo_delta']/ (row['time_delta'] / np.timedelta64(1, 's')), axis=1)
    return df

In [5]:
#Parallel runtime
pool = Pool(24)
res = pool.map_async(worker_function, users_with_tweets_in_zone_a)
pool.close()
pool.join()

In [6]:
user_dfs = res.get()

In [7]:
len(user_dfs)

21951

In [227]:
dfs = [df for df in user_dfs if type(df) != type(None) ]

In [228]:
len(dfs)

20533

In [235]:
def safe_json_export(df):
    path = '../working_data/clustered_with_speed/'
    df = df.copy()
    df['date'] = df['date'].apply(lambda t: datetime.datetime.strftime(t,'%Y-%m-%dT%H:%M:%SZ'))
    df['time_delta'] = df['time_delta'] / np.timedelta64(1, 's')
    uName = df.head(1).user.values[0].lower()
    with open(path+uName+'.geojson','w') as oFile:
        oFile.write(df.to_json(ensure_ascii=False))

In [240]:
#Parallel runtime
pool = Pool(24)
for i, _ in enumerate(pool.imap_unordered(safe_json_export,  dfs[1:]), 1):
    sys.stderr.write('\rdone {0:%}'.format(i/len(dfs)))
pool.close()
pool.join()

done 99.995130%

In [15]:
import datetime, pytz

In [21]:
# Will Move this to temporal processing after we can serialize these dataframes better :) 
def check_before_and_after(df):
    _before   = datetime.datetime(2012,10,29,8,0,0,tzinfo=pytz.UTC) #Noon
    _landfall = datetime.datetime(2012,10,30,0,0,0,tzinfo=pytz.UTC) #8pm EST
    _after    = datetime.datetime(2012,10,31,0,0,0,tzinfo=pytz.UTC) #8pm EST
    
    tweets_during_landfall_p1 = df[df['date'] > _before]
    tweets_during_landfall    = tweets_during_landfall_p1[tweets_during_landfall_p1['date'] < _after]
    
    if len(tweets_during_landfall) > 1:
        return tweets_during_landfall
    else:
        return None
check_before_and_after(dfs[0])

In [22]:
atLandfall = [check_before_and_after(x) for x in dfs]

In [25]:
atLandfallReal = [x for x in atLandfall if type(x) != type(None)]

In [29]:
landfallers = sorted(atLandfallReal, key=lambda x: len(x), reverse=True)

In [54]:
landfallers[0].head(1).user.values[0]

'OhOhOhItsJoHnNy'

In [83]:
len(landfallers)

3289

In [57]:
with open('landfallers.csv','w') as oFile:
    for u in landfallers:
        oFile.write(u.head(1).user.values[0] + "," + str(len(u))+"\n")

In [79]:
x = landfallers[0]

In [82]:
impossible_speeds = []

In [85]:
for u in landfallers:
    if len(u[u.speed > 240]) > 0:
        impossible_speeds.append(u)

In [89]:
impossible_speeds[1].sort(['speed'], ascending=False)

Unnamed: 0,cluster,coords,date,geo_delta,geometry,text,user,time_delta,speed
7678,1.0,"[-73.69155924, 40.67355939]",2012-10-29 19:32:38+00:00,43.469351,POINT (-73.69155924 40.67355939),@NWOY2J14 thanks Emmet always so helpful,Andrewthemark,00:00:00,inf
7617,1.0,"[-73.6914688, 40.67339842]",2012-10-29 16:42:39+00:00,2212.611105,POINT (-73.6914688 40.67339842),My parents haven't gone food shopping in two w...,Andrewthemark,00:00:37,59.800300
7683,1.0,"[-73.69156565, 40.67344721]",2012-10-29 19:43:30+00:00,1366.277394,POINT (-73.69156565 40.67344721),@yeahimkristin @nickredmachine thanks mom,Andrewthemark,00:00:39,35.032754
7616,,"[-73.70763115, 40.6577251]",2012-10-29 16:42:02+00:00,2211.322445,POINT (-73.70763115 40.6577251),@SpellingFucker nope. My dad was supposed to g...,Andrewthemark,00:01:09,32.048151
7681,1.0,"[-73.69156509, 40.67332976]",2012-10-29 19:39:43+00:00,991.237181,POINT (-73.69156509 40.67332976),@PSDoubleHoSeven it's a joke Parker,Andrewthemark,00:00:39,25.416338
7722,1.0,"[-73.69149302, 40.67348894]",2012-10-29 21:51:51+00:00,224.586776,POINT (-73.69149302 40.67348894),Welp. #BlackOut,Andrewthemark,00:00:11,20.416980
7682,1.0,"[-73.70452144, 40.68082512]",2012-10-29 19:42:51+00:00,1374.197222,POINT (-73.70452143999999 40.68082512),@HEELWrestling y joel y,Andrewthemark,00:03:08,7.309560
7679,1.0,"[-73.69839532, 40.66922998]",2012-10-29 19:34:32+00:00,751.095968,POINT (-73.69839532 40.66922998),@SpellingFucker maybe,Andrewthemark,00:01:54,6.588561
7710,1.0,"[-73.69197708, 40.6730437]",2012-10-29 21:07:07+00:00,56.586044,POINT (-73.69197708 40.6730437),@Figure4HedgeHog yes,Andrewthemark,00:00:15,3.772403
7653,1.0,"[-73.69135128, 40.67362364]",2012-10-29 17:40:42+00:00,74.125276,POINT (-73.69135128000001 40.67362364),Poor New Jersey #PrayForMark,Andrewthemark,00:00:22,3.369331


In [None]:
#Run it all in parallel?
#http://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
# pool = Pool(24)
# res = pool.map_async(check_before_and_after, dfs)
# pool.close()
# pool.join()

In [None]:
user_dfs = res2.get()
dfs = [df for df in user_dfs if type(df) != type(None) ]

In [None]:
# Get these usernames into a CSV 
# default_handler #should be able to call the .to_json() method with this argument to serialize the dates?

In [137]:
test = dfs[1].copy()

In [98]:
def to_str(x):
    return str(x)

In [183]:
test = dfs[1].copy()

In [194]:
test.text

0     I hate being in the same room of my brother &a...
1     Okay let's get this show on the road, hopefull...
2     Shit I'm to early, now I have to wait for the ...
3     Tired as hell just came from the parade, no af...
4     What are the odds of seeing your EX at  a labo...
5     I don't know what to do next, I want to go to ...
6     I get the best sleep when I'm faded, I wake up...
7                        @TheComplexOne88 hiiii buddy 👋
8     I miss interesting conversations, Thays ones y...
9                                    Porn = 👍👍👍....💤💤💤💤
10    @TheComplexOne88 I'm great just exhausted &amp...
Name: text, dtype: object