In [155]:
import psycopg2, multiprocessing, psycopg2.extras, os, json, sys, time, scipy, datetime
from multiprocessing import Pool, Manager 
from collections import Counter
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import shape
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Temporal Clustering

In [89]:
# Where the geo-clustered per-user files are read from, and where this stage writes.
input_directory = "/data/chime/geo/sandy_zone_a/clustered_with_speed_gdf"
output_directory = "/data/chime/geo/sandy_zone_a/stage_2_temporal_clusters_with_home"

# Storm window as compact YYYYMMDDHHMM strings (they are spliced directly into
# DataFrame.query expressions further down), plus parsed Timestamp versions.
_landfall_str, _start_str, _end_str = '201210300000', '201210290000', '201210310000'
_landfall, _start, _end = map(pd.Timestamp, (_landfall_str, _start_str, _end_str))

## 0. Load all the geo-clustered tweets

In [90]:
# Every entry in the input directory is treated as one user's file; sorting keeps
# the processing order deterministic between runs.
users_in = os.listdir(input_directory)
users_in.sort()
print("Found {0} users in {1}".format(len(users_in), input_directory))

Found 4022 users in /data/chime/geo/sandy_zone_a/clustered_with_speed_gdf


In [92]:
def loader_function(args):
    """Load one user's GeoJSON file into a GeoDataFrame of tweets.

    Parameters
    ----------
    args : tuple
        ``(uFile, path, q)``: the file name, the directory that contains it,
        and a queue that receives a ``1`` once the file has been parsed (the
        parent process uses the queue size as a progress counter).

    Returns
    -------
    geopandas.GeoDataFrame
        One row per feature. Columns are the feature's ``properties`` plus a
        ``geometry`` column built from the feature geometry; ``date`` is parsed
        into a ``pandas.Timestamp``.
    """
    uFile, path, q = args
    # Context manager so the handle is closed even if parsing fails; the
    # GeoJSON specification (RFC 7946) mandates UTF-8, so read it as such
    # instead of relying on the locale default.
    with open(os.path.join(path, uFile), 'r', encoding='utf-8') as fp:
        u = json.load(fp)
    tweets = []
    for t in u['features']:
        t['properties']['geometry'] = shape(t['geometry'])
        t['properties']['date'] = pd.Timestamp(t['properties']['date'])
        tweets.append(t['properties'])
    q.put(1)
    return gpd.GeoDataFrame(tweets)

In [98]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()

# NOTE(review): only the first 200 listed files are loaded here -- this looks
# like a development limit; confirm before relying on a full run.
args = [(i, input_directory, q) for i in users_in[:200]]
# Denominator for the progress percentage; never zero, so an empty input
# directory reports 0% instead of raising ZeroDivisionError.
_n_jobs = max(len(args), 1)

try:
    result = p.map_async(loader_function, args)

    # monitor loop: each worker puts one item on q per file it finishes
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/_n_jobs*100))
        time.sleep(0.5)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/_n_jobs*100))
    # get() re-raises any exception that occurred inside a worker
    users = result.get()
finally:
    # Always release the worker processes, including when a worker failed or
    # the cell was interrupted. On the success path all tasks are already done.
    p.terminate()
    p.join()

Processed: 200, 100%

In [99]:
# Each entry of `users` is one user's frame, so the row counts add up to the
# number of tweets loaded.
print("{0} total tweets for all users".format(sum(map(len, users))))

4249 total tweets for all users


# Identifying Temporal Clusters
Use a custom _worker_ function to find specific time clusters

## 1. Enough Tweets?

Ensure that we have the following for each user:
1. Geo-Cluster Information (If no geo-clusters are available, remove the user)
2. Enough Tweets (At least *A* Tweet during the storm)

In [100]:
def time_cluster(t):
    """Get the timecluster: the 4-hour time-of-week bucket for a timestamp.

    The timestamp is converted to the "EST" zone and bucketed by 4-hour block
    starting at midnight. Monday-Friday map to 1..6; Saturday/Sunday map to
    7..12.

    Parameters
    ----------
    t : pandas.Timestamp
        Timezone-aware timestamp (``tz_convert`` raises ``TypeError`` on a
        naive one).

    Returns
    -------
    int
        Bucket id in the range 1..12.
    """
    # NOTE(review): "EST" is a fixed UTC-5 offset with no daylight saving --
    # confirm that this, rather than a DST-aware zone, is what is wanted.
    local = t.tz_convert("EST")
    hour = local.hour//4 + 1
    if local.weekday() > 4:
        return 6 + hour
    return hour
    
def worker_function(args):
    """Tag one user's tweets with their time cluster, if they tweeted in the storm window.

    ``args`` is ``(userDF, q)``. When the frame has no row with a ``date``
    strictly between ``_start_str`` and ``_end_str``, a ``1`` is put on ``q``
    and ``None`` is returned. Otherwise a ``day_cluster`` column is added
    (``time_cluster`` applied to every ``date``), a ``0`` is put on ``q`` and
    the frame is returned.
    """
    userDF, q = args

    storm_window = "date > %s & date < %s"%(_start_str, _end_str)
    tweets_in_window = userDF.query(storm_window)

    #If no tweets around the time of the storm, then fail.
    if len(tweets_in_window) == 0:
        q.put(1)
        return None

    userDF['day_cluster'] = userDF.date.apply(time_cluster)
    q.put(0)
    return userDF

In [101]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()

args = [(i, q) for i in users]
# Denominator for the progress percentage; never zero, so an empty user list
# reports 0% instead of raising ZeroDivisionError.
_n_jobs = max(len(args), 1)

try:
    result = p.map_async(worker_function, args)

    # monitor loop: every worker call puts exactly one item on q (1 for a
    # rejected user, 0 for an accepted one), so qsize() counts finished users
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/_n_jobs*100))
        time.sleep(0.5)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/_n_jobs*100))

    # get() re-raises any exception that occurred inside a worker
    values = result.get()
finally:
    # Always release the worker processes, including when a worker failed or
    # the cell was interrupted. On the success path all tasks are already done.
    p.terminate()
    p.join()

# Workers return None for users without tweets in the storm window.
x = [i for i in values  if i is not None]
nones = [i for i in values  if i is None]

Processed: 200, 100%

In [102]:
# `x` holds the frames the worker returned; `nones` holds its None results.
_summary_template = "Successfully processed {0} users\n{1} Users failed"
print(_summary_template.format(len(x), len(nones)))

Successfully processed 15 users
185 Users failed


In [103]:
# Largest frames (most tweets) first.
_users = sorted(x, key=len, reverse=True)

## 2. Group the clusters and find which geo-clusters correspond with home hours

In [136]:
def rank_clusters(df):
    """Pick a user's candidate home geo-cluster from their pre-storm tweets.

    Rows dated before ``_start_str`` are grouped by ``cluster``. Each group's
    ``day_cluster`` values are summarised, and the groups are ranked by how
    many distinct ``day_cluster`` values they contain. The top-ranked cluster
    is returned as the home cluster only if it has at least one tweet in
    day_cluster 1, 2 or 6 (the "HomeTimes").

    There is definitely room for the logic in _this_ function to improve, but for now it looks good :)

    Parameters
    ----------
    df : DataFrame
        Must provide ``date``, ``cluster`` and ``day_cluster`` columns.

    Returns
    -------
    tuple
        ``(home_cluster, summary)``. ``(None, None)`` when there are no
        pre-storm groups. ``home_cluster`` is ``None`` when the top-ranked
        cluster has no tweets in the home-time buckets. ``summary`` is the
        per-cluster table with columns ``tweets``, ``Number Unique Times``,
        ``day_cluster_counts`` and ``HomeTimes``.
    """
    gb_geo = df.query('date < '+_start_str).groupby('cluster')
    if len(gb_geo) < 1:
        return (None,None)
    # A list of (column name, function) pairs instead of a dict: dict-based
    # renaming on a SeriesGroupBy was deprecated in pandas 0.20 and raises
    # SpecificationError from pandas 1.0, while this form works on both.
    _agged = gb_geo['day_cluster'].agg([
        ("tweets", "count"),
        ("Number Unique Times", "nunique"),
        ("day_cluster_counts", lambda t: Counter(t)),
        ("HomeTimes", lambda t: any(t==1) or any(t==2) or any(t==6)),
    ]).sort_values('Number Unique Times', ascending=False)
    hc = None
    # If the highest-rated cluster (by number of unique time buckets) includes
    # HomeTimes, then return that.
    if _agged.HomeTimes.values[0]:
        hc = _agged.index[0]
    return hc, _agged

In [151]:
vals= []
len_users = len(_users)
user_collection = []

user_meta_collection = []
for idx, U in enumerate(_users):

    hc = rank_clusters(U)[0]
    # A negative cluster id is not accepted as a home (presumably the
    # clustering step's noise label -- TODO confirm).
    if hc is not None and hc < 0:
        hc = None

    user_meta_collection.append({
            'user':U['user'].values[0],
            'uid' :U['uid'].values[0],
            'tweets':len(U),
            'home_cluster': hc
        })

    # Assign the scalar so it is broadcast to every row. Wrapping it in
    # pd.Series(hc) creates a one-element Series with index label 0, and index
    # alignment then fills only the row labelled 0, leaving the rest NaN.
    U['home_cluster_id'] = np.nan if hc is None else hc

    sys.stderr.write("\r{0} of {1}".format(idx+1, len_users))

# Explicit columns so the frame has a `home_cluster` column even when no users
# were processed.
_user_meta = pd.DataFrame(user_meta_collection,
                          columns=['user', 'uid', 'tweets', 'home_cluster'])
# notnull() also works when every value is None (object dtype), where
# np.isnan would raise TypeError.
_has_home = _user_meta.home_cluster.notnull()
sys.stderr.write("\n\nClustered: {0}, Failed: {1}".format(_has_home.sum(), (~_has_home).sum()))

1 of 152 of 153 of 154 of 155 of 156 of 157 of 158 of 159 of 1510 of 1511 of 1512 of 1513 of 1514 of 1515 of 15

Clustered: 3, Failed: 12

In [167]:
# Keep only the users whose frame carries at least one non-null home cluster id.
users_with_homes = [
    frame for frame in _users
    if frame['home_cluster_id'].count() > 0
]

Export these results

In [168]:
# makedirs(exist_ok=True) creates missing parent directories and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(output_directory, exist_ok=True)

In [169]:
# The per-user summary table is written before the individual user files.
meta_path = output_directory + '/' + 'temporal_clustered_user_meta.json'
with open(meta_path, 'w') as metaOut:
    metaOut.write(_user_meta.to_json())

## 3.  Write these users to disk

In [170]:
def safe_json_export(args):
    """Write one user's tweets to ``<path>/<username>.geojson``.

    Parameters
    ----------
    args : tuple
        ``(df, path, q)``: the user's frame, the output directory, and an
        optional queue. ``df`` is copied, so the caller's frame is not
        modified. ``q`` may be ``None``; otherwise it receives a ``1`` after
        the file has been written.

    The file name is the lower-cased ``user`` value of the first row.
    """
    df, path, q = args
    df = df.copy()
    # Dates become strings so they can be serialised. The trailing "Z" is a
    # literal, so this assumes the timestamps are already UTC -- TODO confirm.
    df['date'] = df['date'].apply(lambda t: datetime.datetime.strftime(t,'%Y-%m-%dT%H:%M:%SZ'))
    uName = df.head(1).user.values[0].lower()
    # ensure_ascii=False emits raw non-ASCII characters, so the handle must be
    # UTF-8 explicitly; the locale default may not be able to encode them.
    with open(path+"/"+uName+'.geojson','w', encoding='utf-8') as oFile:
        oFile.write(df.to_json(ensure_ascii=False))
    if q is not None:
        q.put(1)

In [172]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()

args = [(i, output_directory, q) for i in users_with_homes]
# Denominator for the progress percentage; never zero, so "no users with a
# home cluster" reports 0% instead of raising ZeroDivisionError.
_n_jobs = max(len(args), 1)

try:
    result = p.map_async(safe_json_export, args)

    sys.stderr.write("Exporting {0} users to {1}\n".format(len(args),output_directory))

    # monitor loop: each worker puts one item on q per file it has written
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/_n_jobs*100))
        time.sleep(0.5)
    # The export returns nothing useful, but get() is what re-raises an
    # exception from a worker; without it a failed export passes silently.
    result.get()
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/_n_jobs*100))
finally:
    # Always release the worker processes, including when a worker failed or
    # the cell was interrupted. On the success path all tasks are already done.
    p.terminate()
    p.join()

Exporting 3 users to /data/chime/geo/sandy_zone_a/stage_2_temporal_clusters_with_home
Processed: 3, 100%

<hr>
<hr>
# End of Processing 

<hr><hr>
# Beginning of Visual Inspection