# DBScan Clustering

1. Using DBScan, find spatial clusters relevant to each user, typically home, work, or school.
1. Also compute the speed between each pair of consecutive geo-tagged tweets.

**Input**: Directory of Twitterers (in `geojsonl` format)<br>
**Output**: Directory of Twitterers in a GeoPandas GeoDataFrame (written to JSON)

In [1]:
# input_directory  = "/data/chime/geo/matthew/brevard_zone_a_users/"
# output_directory = "/data/chime/geo/matthew/brevard_zone_a_clustered_with_speed_gdf"

In [2]:
# Directory that is listed to discover the per-user input files (one file per user).
input_directory  = "/data/chime/geo/zone_a_users_full_contextual"
# Directory the per-user .geojson results are written to (created later if missing).
output_directory = "/data/chime/geo/zone_a_full_contexual/stage1"

Should be safe to "run all" if the above directories are set :) 

In [3]:
import os, json, matplotlib, iso8601, sys, time, datetime, pytz
import numpy as np; import pandas as pd; import geopandas as gpd
from shapely.geometry import shape, mapping, MultiPoint
from multiprocessing import Pool, Manager;
from dbscan_python import dbscan
from geodistance import geodistance
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Collect the per-user input files in a stable (alphabetical) order.
users_in = os.listdir(input_directory)
users_in.sort()
found_message = "Found {0} users in {1}"
print(found_message.format(len(users_in), input_directory))

Found 1195 users in /data/chime/geo/zone_a_users_full_contextual


In [5]:
# # #sample?
# users_in = np.random.choice(users_in, 100, replace=False)
# len(users_in)

# 1. Read Users In... and process

In [6]:
def worker_function(args):
    """
    Load one user's tweets, cluster the geo-tagged ones and compute speeds.

    Input: tuple of (user .geojsonl file name, input directory, progress queue)
    Returns: DataFrame with speed & clusters sorted by date, or None if the
             user was skipped (see the status codes below)

    #Process
    1. Loads all of a user's tweets (parseable json per line, parse, strip down to details, sort by time)
    2. Computes Clusters based on below EPS & MIN_PTS
    3. Computes time, distance, and speed between each tweet
    4. Puts it all into a DataFrame

    #Status codes (exactly one is put on the queue per call)
    0: success
    1: fewer than MIN_PTS geo-tagged tweets
    2: no tweet was assigned to a cluster (every label is negative)

    """
    #Unpack the arguments
    user_geojsonl_file, input_directory, q = args

    EPS     = 100     #Max. Distance for points in the cluster... (In meters)
    MIN_PTS = 3       #Minimum Points in a cluster...

    geo_tweets     = []
    non_geo_tweets = []

    # Use a context manager so the handle is closed even if a line fails to
    # parse; pool workers are reused for many files.
    with open(input_directory+"/"+user_geojsonl_file,'r') as tweet_file:
        for line in tweet_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of crashing in json.loads
            tweet = json.loads(line)
            geometry = tweet['geometry']
            props = tweet['properties']
            common = {
                    'date'    : pd.Timestamp(props['postedTime']),
                    'text'    : props['body'],
                    'user'    : props['actor']['preferredUsername'],
                    'uid'     : props['actor']['id'].split(":")[2],
                    'tweet_id': props['id'].split(":")[2]
                }
            if geometry:
                # geometry/coords go first so the key order of geo-tagged
                # records stays the same as before.
                stripped = {
                        'geometry': shape(geometry),
                        'coords'  : geometry['coordinates']
                    }
                stripped.update(common)
                geo_tweets.append(stripped)
            else:
                non_geo_tweets.append(common)

    #Exit case 1: There aren't enough points
    if len(geo_tweets)<MIN_PTS:
        q.put(1)
        return None

    #Ensure we're sorted by time (Safety measure, probably taking performance hit)
    geo_tweets.sort(key=lambda t: t['date'])

    #Clustering: a 2 x N matrix (first coordinate row, second coordinate row)
    points = [t['coords'] for t in geo_tweets]
    clusters = dbscan.dbscan(np.matrix([ [p[0] for p in points], [p[1] for p in points] ]), EPS, MIN_PTS)

    #Iterate through tweets & clusters to assign cluster & calculate distances
    for idx, t in enumerate(geo_tweets):
        t['cluster'] = clusters[idx]
        if idx>0:
            p1 = t['geometry']
            p2 = geo_tweets[idx-1]['geometry']
            # Distance to the previous geo-tagged tweet.
            # NOTE(review): the *1000 presumably converts km to meters -- confirm against geodistance.
            t['geo_delta'] = geodistance.distanceHaversine(p1.y,p1.x,p2.y,p2.x)[0]*1000

    #Now we can put all the tweets into a DataFrame!
    df = gpd.GeoDataFrame(geo_tweets+non_geo_tweets)

    # If clusters were all -1, then return nothing, it couldn't cluster!
    if not len(df.query('cluster>=0'))>=1:
        q.put(2)
        return None

    # The diff is taken in the current row order (time-sorted geo tweets first,
    # then the non-geo tweets), i.e. before the final sort below. For geo rows
    # it is therefore the gap to the previous geo tweet, matching geo_delta.
    df['time_delta'] = df['date'].diff()

    # Vectorized division: rows without a geo_delta stay NaN, and a zero time
    # gap yields inf/NaN rather than depending on per-row Python division.
    elapsed_seconds = df['time_delta'] / np.timedelta64(1, 's')
    df['speed'] = df['geo_delta'] / elapsed_seconds

    q.put(0)
    return df.sort_values(by='date').reset_index(drop=True)

In [None]:
# Parallel runtime: run worker_function over every user file in a process
# pool and report progress on stderr while waiting.
p = Pool(30)
m = Manager()
q = m.Queue()  # every finished worker call puts one status code here

args = [(i,input_directory,q) for i in users_in]
total = max(len(args), 1)  # keeps the percentage defined when there are no users

try:
    result = p.map_async(worker_function, args)

    sys.stderr.write("Processing {0} users...\n".format(len(users_in)))

    # monitor loop
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100))
        time.sleep(1)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/total*100))

    values = result.get()  # re-raises any exception raised inside a worker
    p.close()
except BaseException:
    # Also covers KeyboardInterrupt: stop the workers instead of leaving them behind.
    p.terminate()
    raise
finally:
    p.join()

users = [i for i in values if i is not None]
nones = [i for i in values if i is None]

Processing 1195 users...
Processed: 1193, 99.8%Process ForkPoolWorker-2:
Process ForkPoolWorker-14:
Process ForkPoolWorker-26:
Process ForkPoolWorker-4:
Process ForkPoolWorker-10:
Process ForkPoolWorker-5:
Process ForkPoolWorker-13:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Process ForkPoolWorker-1:
Process ForkPoolWorker-19:
Process ForkPoolWorker-18:
Process ForkPoolWorker-21:
Process ForkPoolWorker-30:
Process ForkPoolWorker-22:
Process ForkPoolWorker-25:
Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Process ForkPoolWorker-17:
Process ForkPoolWorker-15:
Process ForkPoolWorker-9:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most rece

KeyboardInterrupt: 

Process ForkPoolWorker-33:
Process ForkPoolWorker-32:
Process ForkPoolWorker-35:
Process ForkPoolWorker-34:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/p

In [None]:
# Report how many users produced a DataFrame and how many returned None.
summary = "Successfully processed {0} users\n{1} Users failed"
print(summary.format(len(users), len(nones)))

In [None]:
def calculate_cluster_centroids(args):
    """
    Attach the centroid of each cluster to every row of one user's DataFrame.

    Input: tuple of (user DataFrame with 'cluster' and 'geometry' columns,
           progress queue or None)
    Returns: the DataFrame left-merged on 'cluster' with an added
             'cluster_center' column

    Rows labelled -1 (the "couldn't cluster" label) get no center. If a queue
    is given, 1 is put on it once the centroids are computed.
    """
    df, q = args

    # One row per cluster label: the centroid of all points carrying that label.
    centroids = df.groupby('cluster', as_index=False).aggregate({'geometry':lambda x: MultiPoint(list(x)).centroid})
    centroids = centroids.rename(columns={'geometry' : 'cluster_center'})

    # Blank out the center of the -1 label with a boolean mask. This replaces
    # DataFrame.set_value (removed from pandas) and avoids making 'cluster'
    # both an index name and a column, which is ambiguous for merge(on=...).
    centroids.loc[centroids['cluster'] == -1, 'cluster_center'] = None

    if q is not None:
        q.put(1)
    return df.merge(centroids, on='cluster', how='left')

In [None]:
#Parallel runtime: compute cluster centroids for every successfully processed
#user in a process pool, reporting progress on stderr and in load.log.
p = Pool(30)
m = Manager()
q = m.Queue()  # every finished worker call puts a 1 here

args = [(i,q) for i in users]
total = max(len(args), 1)  # keeps the percentage defined when there are no users

try:
    result = p.map_async(calculate_cluster_centroids, args)

    sys.stderr.write("Processing {0} users...\n".format(len(users)))

    # monitor loop
    while not result.ready():
        size = q.qsize()
        # load.log is rewritten each tick so it always holds the latest status line.
        with open("load.log",'w') as log:
            log.write("\rProcessed: {0}, {1:.3g}%\n".format(size, size/total*100))
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100))
        time.sleep(1)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/total*100))

    _users = result.get()  # re-raises any exception raised inside a worker
    p.close()
except BaseException:
    # Also covers KeyboardInterrupt: stop the workers instead of leaving them behind.
    p.terminate()
    raise
finally:
    p.join()

# 2. Write these users to Disk

In [None]:
# Create the output directory (and any missing parents). exist_ok avoids the
# check-then-create race and is a no-op when the directory already exists.
os.makedirs(output_directory, exist_ok=True)

In [None]:
def safe_mapping(p):
    """
    Convert a geometry to its GeoJSON-like mapping, tolerating missing values.

    Input: a geometry, None, or a NaN placeholder (scalar or array-like)
    Returns: mapping(p), or None when p is None or contains NaN
    """
    # Identity test: `== None` is evaluated element-wise on array-like input.
    if p is None:
        return None
    try:
        contains_nan = bool(np.any(np.isnan(p)))
    except TypeError:
        # numpy cannot coerce this object to numbers, so it is not a NaN
        # placeholder; hand it to mapping() as-is.
        contains_nan = False
    if contains_nan:
        return None
    return mapping(p)

In [None]:
def safe_json_export(args):
    """
    Write one user's DataFrame to <path>/<username>.geojson as a FeatureCollection.

    Input: tuple of (user DataFrame, output directory, progress queue or None)
    Returns: None

    Each row becomes one Feature: its 'geometry' column is the feature
    geometry and every other column goes into 'properties'. The file name is
    the lower-cased 'user' value of the first row. If a queue is given, 1 is
    put on it after the file is written.
    """
    df, path, q = args
    df = df.copy() #Safety measure for atomic conversions...
    df['date'] = df['date'].apply(lambda t: datetime.datetime.strftime(t,'%Y-%m-%dT%H:%M:%SZ'))
    df['time_delta'] = df['time_delta'] / np.timedelta64(1, 's')  # timedelta -> seconds
    # The cluster center is stored in the properties as a JSON *string*.
    df['cluster_center'] = df['cluster_center'].apply(lambda c: json.dumps(safe_mapping(c)))
    uName = df['user'].iloc[0].lower() # Grab username, always make it lowercase for sorting safety :)

    # Cast to object before replacing missing values: a float column cannot
    # hold None, so without the cast NaN can survive and be written to the file.
    clean = pd.DataFrame(df).astype(object)
    clean = clean.where(pd.notnull(clean), None)

    geojson = {"type":"FeatureCollection","features":[]}
    for _, row in clean.iterrows():
        properties = row.to_dict()
        geom = safe_mapping(properties.pop('geometry'))
        geojson['features'].append({'type':'Feature',
                                    'geometry':geom,
                                    'properties':properties})

    with open(os.path.join(path, uName+'.geojson'),'w') as oFile:
        json.dump(geojson, oFile)

    if q is not None:
        q.put(1)

In [None]:
#Parallel runtime: export every user's DataFrame to output_directory in a
#process pool, reporting progress on stderr.
p = Pool(30)
m = Manager()
q = m.Queue()  # every finished export puts a 1 here

args = [(i, output_directory, q) for i in _users]
total = max(len(args), 1)  # keeps the percentage defined when there are no users

try:
    result = p.map_async(safe_json_export, args)

    sys.stderr.write("Exporting {0} users to {1}\n".format(len(_users),output_directory))

    # monitor loop
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100))
        time.sleep(0.5)
    p.close()
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/total*100))

    # Surface export failures: without this an exception raised inside a
    # worker is never re-raised and the export looks successful.
    result.get()
except BaseException:
    # Also covers KeyboardInterrupt: stop the workers instead of leaving them behind.
    p.terminate()
    raise
finally:
    p.join()