# DBScan Clustering

1. Using DBSCAN, find the spatial clusters relevant to each user (typically home, work, or school).
1. Also compute the speed between each pair of consecutive geo-tagged tweets.

**Input**: Directory of Twitterers (in `geojsonl` format)<br>
**Output**: Directory with one GeoJSON file per Twitterer (a FeatureCollection built from a GeoPandas GeoDataFrame)

### This notebook runs all processing in ONE worker function; if it hangs, the per-user results already written to disk are still available

In [1]:
# input_directory  = "/data/chime/geo/matthew/brevard_zone_a_users/"
# output_directory = "/data/chime/geo/matthew/brevard_zone_a_clustered_with_speed_gdf"

In [3]:
# Source folder holding one .geojsonl file per user.
input_directory = "/data/chime/geo2/NJ/AtlanticCity/"
# Destination folder for the per-user clustered output.
output_directory = "/data/chime/geo2/PROCESSED/NJ/AtlanticCity_Stage1"

Should be safe to "run all" if the above directories are set :) 

In [4]:
import os, json, matplotlib, iso8601, sys, time, datetime, pytz
import numpy as np; import pandas as pd; import geopandas as gpd
from shapely.geometry import shape, mapping, MultiPoint
from multiprocessing import Pool, Manager;
from dbscan_python import dbscan
from geodistance import geodistance
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Every entry of the input directory is treated as one user's file; sort for a stable order.
users_in = os.listdir(input_directory)
users_in.sort()
print("Found {0} users in {1}".format(len(users_in), input_directory))

Found 1697 users in /data/chime/geo2/NJ/AtlanticCity/


In [6]:
# # #sample?
# users_in = np.random.choice(users_in, 100, replace=False)
# len(users_in)

# Major Worker Function

In [7]:
def _strip_tweet(t):
    """Reduce one parsed tweet feature to the handful of fields this notebook keeps."""
    props = t['properties']
    stripped = {}
    # Geometry keys go first so geo-tagged records keep their original key order.
    if t.get('geometry'):
        stripped['geometry'] = shape(t['geometry'])
        stripped['coords']   = t['geometry']['coordinates']
    stripped['date']     = pd.Timestamp(props['postedTime'])
    stripped['text']     = props['body']
    stripped['user']     = props['actor']['preferredUsername']
    stripped['uid']      = props['actor']['id'].split(":")[2]
    stripped['tweet_id'] = props['id'].split(":")[2]
    return stripped


def _notify(q, code):
    """Report one finished user on the progress queue, if a queue was supplied."""
    if q is not None:
        q.put(code)


def worker_function(args):
    """
    Input: Tuple of (user .geojsonl file name, input_directory, output_directory, q)
           q is a progress queue (one item is put per finished user); it may be None.
    Returns: DataFrame with speed & clusters, or None if the user could not be clustered

    #Process
    1. Loads all of a user's tweets (parseable json per line, parse, strip down to details, sort by time)
    2. Computes Clusters based on below EPS & MIN_PTS
    3. Computes time, distance, and speed between each tweet
    4. Puts it all into a DataFrame
    5. Finds Cluster Centroids
    6. Writes <lowercased username>.geojson (a FeatureCollection) to output_directory

    Queue codes: 1 = too few geo-tagged tweets OR success, 2 = no cluster found.
    """
    #Unpack the arguments
    user_geojsonl_file, input_directory, output_directory, q = args

    # NOTE(review): the points handed to dbscan below are the raw GeoJSON coordinates;
    # confirm that dbscan_python really interprets EPS in meters for such input.
    EPS     = 200     #Max. Distance for points in the cluster... (In meters)
    MIN_PTS = 5       #Minimum Points in a cluster...

    geo_tweets     = []
    non_geo_tweets = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(os.path.join(input_directory, user_geojsonl_file), 'r') as in_file:
        for line in in_file:
            line = line.strip()
            if not line:
                continue   # tolerate blank / trailing empty lines
            tweet = _strip_tweet(json.loads(line))
            if 'geometry' in tweet:
                geo_tweets.append(tweet)
            else:
                non_geo_tweets.append(tweet)

    #Exit case 1: There aren't enough points
    if len(geo_tweets) < MIN_PTS:
        _notify(q, 1)
        return None

    #Ensure we're sorted by time (Safety measure, probably taking performance hit)
    geo_tweets.sort(key=lambda t: t['date'])

    #Clustering: dbscan receives a 2 x N matrix (row 0 = first coordinate, row 1 = second)
    coords = [t['coords'] for t in geo_tweets]
    clusters = dbscan.dbscan(np.matrix([ [c[0] for c in coords], [c[1] for c in coords] ]), EPS, MIN_PTS)

    #Iterate through tweets & clusters to assign cluster & calculate distances
    for idx, t in enumerate(geo_tweets):
        t['cluster'] = clusters[idx]
        if idx > 0:
            p1 = t['geometry']
            p2 = geo_tweets[idx-1]['geometry']
            # Distance to the previous geo-tagged tweet; the helper's result is scaled by 1000.
            t['geo_delta'] = geodistance.distanceHaversine(p1.y, p1.x, p2.y, p2.x)[0]*1000

    #Now we can put all the tweets into a DataFrame!
    df = gpd.GeoDataFrame(geo_tweets + non_geo_tweets)

    #Exit case 2: clusters were all -1, it couldn't cluster!
    if not (df['cluster'] >= 0).any():
        _notify(q, 2)
        return None

    # NOTE(review): the diff is taken BEFORE the date sort below, i.e. over the sorted
    # geo-tagged rows followed by the non-geo rows in file order. It therefore lines up
    # with geo_delta for geo rows only; confirm that this is the intended meaning.
    df['time_delta'] = df['date'].diff()
    seconds = df['time_delta'] / np.timedelta64(1, 's')
    # Vectorised division; a zero time delta gives NaN instead of an exception or inf.
    df['speed'] = df['geo_delta'] / seconds.where(seconds != 0)
    df = df.sort_values(by='date').reset_index(drop=True)

    #Calculate Cluster Centroids (the noise label -1 has no meaningful centre)
    members = {}
    for t in geo_tweets:
        members.setdefault(t['cluster'], []).append(t['geometry'])
    center_json = {}
    for label, geoms in members.items():
        center = None if label == -1 else MultiPoint(geoms).centroid
        center_json[label] = json.dumps(safe_mapping(center))
    # Rows without a cluster (non-geo tweets) get the JSON literal "null".
    df['cluster_center'] = [center_json.get(label, 'null') for label in df['cluster']]

    #Now write it out
    df['date'] = df['date'].apply(lambda t: t.strftime('%Y-%m-%dT%H:%M:%SZ'))
    df['time_delta'] = seconds.reindex(df.index) if False else df['time_delta'] / np.timedelta64(1, 's')
    uName = df['user'].iloc[0].lower() # Grab username, always make it lowercase for sorting safety :)

    geojson = {"type": "FeatureCollection", "features": []}
    for _, row in df.iterrows():
        # NaN is not valid JSON, so every missing cell becomes None (-> null).
        properties = {key: (None if isinstance(value, float) and value != value else value)
                      for key, value in row.to_dict().items()}
        geom = safe_mapping(properties.pop('geometry'))
        geojson['features'].append({'type': 'Feature',
                                    'geometry': geom,
                                    'properties': properties})

    with open(os.path.join(output_directory, uName + '.geojson'), 'w') as oFile:
        json.dump(geojson, oFile)

    _notify(q, 1)

    return df

In [8]:
def safe_mapping(p):
    """
    Input: A geometry object, or a missing value (None / float NaN)
    Returns: The result of mapping(p), or None when p is missing
    """
    # Identity test: `p == None` would be routed through the object's own __eq__.
    if p is None:
        return None
    # A missing cell taken from a DataFrame shows up as float NaN, and NaN is the only
    # float that differs from itself. np.isnan() is avoided because it can raise
    # TypeError when handed a geometry object NumPy cannot coerce to numbers.
    if isinstance(p, float) and p != p:
        return None
    return mapping(p)

In [9]:
# Create the output directory (and any missing parents). exist_ok avoids the
# check-then-create race and makes re-running this cell harmless.
os.makedirs(output_directory, exist_ok=True)

# Run the super function

In [None]:
# Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()   # workers put one item per finished user, so its size is the progress counter

args = [(i,input_directory,output_directory,q) for i in users_in]
total = len(args)

sys.stderr.write("Processing {0} users...\n".format(len(users_in)))

try:
    result = p.map_async(worker_function, args)

    # monitor loop: wake up once per second, or immediately when the map finishes
    while not result.ready():
        size = q.qsize()
        # guard the percentage against an empty user list
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100 if total else 100.0))
        result.wait(1)
    size = q.qsize()
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100 if total else 100.0))

    # re-raises here if any worker raised
    values = result.get()
except BaseException:
    # On a worker error or a manual interrupt, stop the workers instead of leaking them.
    p.terminate()
    raise
else:
    p.close()
finally:
    # Always reap the worker processes.
    p.join()

users = [i for i in values if i is not None]
nones = [i for i in values if i is None]

Processing 1697 users...
Processed: 1536, 90.5%

In [None]:
# Number of users for which worker_function returned None (too few geo-tagged tweets, or no cluster found).
len(nones)

In [None]:
# Number of users for which worker_function returned a DataFrame (clustered and written to disk).
len(users)