# Temporal Clustering

In [6]:
import psycopg2, multiprocessing, psycopg2.extras, os, json, sys, time
from multiprocessing import Pool, Manager 
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Folder (relative to this notebook) that holds the JSON feature files to load.
working_data = "../working_data/clustered_with_speed/"

In [8]:
# os.listdir returns entries in arbitrary order, so sort them to make the
# ordering of `files` (and everything indexed from it later) reproducible.
# Only regular files are kept: a sub-directory cannot be opened by the loader.
# Append [:100] to the result to work on a small subset while debugging.
files = sorted(
    name for name in os.listdir(working_data)
    if os.path.isfile(os.path.join(working_data, name))
)

In [9]:
def loader_function(args):
    """Load one JSON feature file into a GeoDataFrame.

    Parameters
    ----------
    args : tuple
        ``(uFile, q)``: the file name inside ``working_data`` and a queue that
        receives one token once the file has been processed (the caller uses
        the queue size as a progress counter).

    Returns
    -------
    geopandas.GeoDataFrame
        One row per entry of the file's ``features`` list, built from that
        feature's ``properties`` with ``geometry`` converted by ``shape`` and
        ``date`` converted to ``pd.Timestamp``.
    """
    uFile, q = args
    # Read from the same directory that was listed to build the file names, and
    # close the handle deterministically instead of leaving it to the GC.
    with open(os.path.join(working_data, uFile), 'r') as f:
        u = json.load(f)
    tweets = []
    for t in u['features']:
        props = t['properties']
        props['geometry'] = shape(t['geometry'])
        props['date'] = pd.Timestamp(props['date'])
        tweets.append(props)
    q.put(1)
    return gpd.GeoDataFrame(tweets)

In [13]:
#Parallel runtime
# Load every file in a worker pool. Each worker puts one token on the shared
# queue per finished file, so the queue size doubles as a progress counter.
p = Pool(24)
m = Manager()
q = m.Queue()

args = [(i, q) for i in files]
try:
    result = p.map_async(loader_function, args)

    # monitor loop: poll until every task has finished
    while not result.ready():
        size = q.qsize()
        # Fixed-point percentage; the general format rendered 100 as "1e+02".
        sys.stderr.write("\rProcessed: {0}, {1:.1f}%".format(size, 100.0 * size / len(args)))
        time.sleep(0.5)
    users = result.get()  # re-raises here if any worker failed
    sys.stderr.write("\rProcessed: {0}, 100.0%\n".format(len(args)))
finally:
    # Always release the worker and manager processes, also on error or
    # interrupt, so repeated runs do not pile up idle processes.
    p.terminate()
    p.join()
    m.shutdown()

Processed: 20533, 1%

In [544]:
# Number of frames currently held in `users`.
len(users)

20533

# Identifying Temporal Clusters
Using the same DBSCAN algorithm, we can find the common tweeting times for every spatial cluster.

In [24]:
import numpy as np
import dbscan
# `imp` is deprecated and no longer ships with recent Python releases; take
# reload from importlib where it exists and fall back to imp otherwise.
try:
    from importlib import reload
except ImportError:
    from imp import reload
# Re-import the local module so edits to dbscan.py are picked up without
# restarting the kernel.
reload(dbscan)

<module 'dbscan' from '/home/anderstj/Twitter-Movement-Derivation/TimeProcessing/dbscan.py'>

In [26]:
def worker_function(args):
    """Attach a ``time_cluster`` column to one frame.

    Parameters
    ----------
    args : tuple
        ``(userDF, q)``: a frame with a ``date`` column and a queue that
        receives one token when the frame is done (used by the caller as a
        progress counter).

    Returns
    -------
    The same ``userDF`` object, with ``time_cluster`` set to whatever
    ``dbscan.dbscan`` returns for the frame's timestamps.
    """
    userDF, q = args
    # Integer timestamps divided down by 1e9 in one vectorized step.
    # Assumes `date` is datetime64[ns], so the result is whole seconds -- TODO confirm.
    seconds = userDF.date.astype(np.int64) // 1000000000
    # dbscan is handed a single-row 2-D array (shape 1 x n_rows).
    times = np.array([seconds.values])
    # NOTE(review): 4*3600 and 2 are presumably the neighbourhood radius
    # (4 hours, in seconds) and the minimum cluster size -- verify in dbscan.py.
    userDF['time_cluster'] = dbscan.dbscan(times, 4*3600, 2)
    q.put(1)
    return userDF

In [27]:
#Parallel runtime
# Run the temporal clustering for every frame in a worker pool. Each worker
# puts one token on the shared queue per finished frame, so the queue size
# doubles as a progress counter.
p = Pool(24)
m = Manager()
q = m.Queue()

args = [(i, q) for i in users]
try:
    result = p.map_async(worker_function, args)

    # monitor loop: poll until every task has finished
    while not result.ready():
        size = q.qsize()
        # Fixed-point percentage; the general format rendered 100 as "1e+02".
        sys.stderr.write("\rProcessed: {0}, {1:.1f}%".format(size, 100.0 * size / len(args)))
        time.sleep(0.5)

    users = result.get()  # re-raises here if any worker failed
    sys.stderr.write("\rProcessed: {0}, 100.0%\n".format(len(args)))
finally:
    # Always release the worker and manager processes, also on error or
    # interrupt, so repeated runs do not pile up idle processes.
    p.terminate()
    p.join()
    m.shutdown()

Processed: 20533, 1e+02%

In [35]:
# Preview only the first rows; a full frame can be arbitrarily long.
users[0].head(10)

Unnamed: 0,cluster,coords,date,geo_delta,geometry,speed,text,time_delta,user,time_cluster
0,,"[-73.95891598, 40.710655]",2012-09-01 00:55:56+00:00,,POINT (-73.95891598 40.710655),,Bobby Bigtimes's birthday dinner (@ Traif w/ 5...,,CarrinoAnthony,1.0
1,,"[-73.986764, 40.719607]",2012-09-05 23:21:52+00:00,2549.433529,POINT (-73.98676399999999 40.719607),0.00598,@HGTV Reunion. Hangin w/ @real_genevieve &amp;...,426356.0,CarrinoAnthony,2.0
2,,"[-73.99077645, 40.73639438]",2012-09-07 13:33:31+00:00,1897.0453,POINT (-73.99077645 40.73639438),0.013797,Kick'in off S3. So pumped to start shooting ne...,137499.0,CarrinoAnthony,3.0
3,,"[-74.04568997, 40.71875291]",2012-09-08 18:59:30+00:00,5025.955606,POINT (-74.04568997 40.71875291),0.047433,"Beers, Wings &amp; Football w/@cols6. Now all ...",105959.0,CarrinoAnthony,4.0
4,,"[-74.05938719, 40.72143496]",2012-09-09 16:10:14+00:00,1192.239339,POINT (-74.05938719 40.72143496),0.015637,Beautiful day! Taking some measurements on my ...,76244.0,CarrinoAnthony,5.0
5,,"[-73.98633448, 40.76004046]",2012-09-10 22:06:53+00:00,7503.798836,POINT (-73.98633448 40.76004046),0.069609,Opening night of @ChaplinBway w/@cols6 @real_g...,107799.0,CarrinoAnthony,6.0
6,,"[-73.87122, 40.773839]",2012-09-11 12:46:29+00:00,9815.137778,POINT (-73.87121999999999 40.773839),0.185977,"I'm at LaGuardia Airport (LGA) (East Elmhurst,...",52776.0,CarrinoAnthony,7.0
7,,"[-74.00150299, 40.70700073]",2012-09-11 14:13:17+00:00,13255.730781,POINT (-74.00150299000001 40.70700073),2.545263,Brooklyn Bridge enroute to LGA @ brooklyn bri...,5208.0,CarrinoAnthony,7.0
8,,"[-73.999191, 40.732381]",2012-09-13 01:28:25+00:00,2828.875408,POINT (-73.999191 40.732381),0.022291,Oh it has been waaaaay too long. (@ Babbo) htt...,126908.0,CarrinoAnthony,2.0
9,,"[-73.98842922, 40.71774704]",2012-09-15 21:44:13+00:00,1862.871564,POINT (-73.98842922 40.71774704),0.00758,Highly recommend checking out the #lowline exh...,245748.0,CarrinoAnthony,4.0


In [33]:
# Pairwise correlations for the first ten frames. The frames also carry text,
# coordinate and geometry columns, which corr() rejects in recent pandas, so
# restrict to numeric/boolean columns explicitly (works on old and new pandas).
corrs = [u.select_dtypes(include=['number', 'bool']).corr() for u in users[:10]]

In [34]:
# Show the list of correlation matrices held in `corrs`.
corrs

[              geo_delta     speed  time_delta  time_cluster
 geo_delta      1.000000 -0.002907    0.273859      0.078790
 speed         -0.002907  1.000000   -0.318680      0.221705
 time_delta     0.273859 -0.318680    1.000000     -0.020059
 time_cluster   0.078790  0.221705   -0.020059      1.000000,
             geo_delta     speed  time_delta
 geo_delta    1.000000  0.997830   -0.729878
 speed        0.997830  1.000000   -0.683282
 time_delta  -0.729878 -0.683282    1.000000,
             geo_delta     speed  time_delta
 geo_delta    1.000000  0.079296   -0.569180
 speed        0.079296  1.000000   -0.827751
 time_delta  -0.569180 -0.827751    1.000000,
             geo_delta     speed  time_delta
 geo_delta    1.000000  0.988160   -0.848646
 speed        0.988160  1.000000   -0.824503
 time_delta  -0.848646 -0.824503    1.000000,
               geo_delta  speed  time_delta  time_cluster
 geo_delta           1.0   -1.0         1.0           NaN
 speed              -1.0    1.0    

In [None]:
# `corrs` is a list of correlation matrices, which can be neither sorted nor
# plotted directly, so reduce each matrix to a single coefficient first.
# NOTE(review): the column pair below is an assumption -- confirm which
# correlation this figure was meant to show.
corr_row, corr_col = 'speed', 'time_delta'

pair_values = []
for c in corrs:
    # Some matrices lack columns (e.g. no time_cluster) or contain NaN entries.
    if corr_row in c.index and corr_col in c.columns:
        value = c.loc[corr_row, corr_col]
        if pd.notnull(value):
            pair_values.append(value)
pair_values.sort()

fig, ax = plt.subplots(1)
# Sorted coefficient on x against its rank on y.
ax.plot(pair_values, range(len(pair_values)))
ax.set_xlabel('corr({0}, {1})'.format(corr_row, corr_col))
ax.set_ylabel('rank')
ax.set_title('Sorted per-frame correlation');