# Turn a directory into something that can be easily visualized on a map!

In [1]:
import pandas as pd; import numpy as np; from multiprocessing import Pool, Manager; import numpy as np;
import fiona, shapely; from osgeo import ogr; from shapely.geometry import mapping, shape
import matplotlib.pyplot as plt; import seaborn as sns
import matplotlib, os, json, sys, time, datetime

In [11]:
# Directory whose entries are listed and read as per-user input files.
input_directory  = "/data/chime/geo/sandy_new_jersey_geovulnerable_contextual_stage2"
# Directory that the per-user `.geojson` files are written into.
output_directory = "/data/www/chime/movement-derivation/sandy_new_jersey_geo_vulnerable_contextual"

In [12]:
# Collect the input directory's entries in sorted order, leaving out the
# `temporal_clustered_user_meta.json` entry so that every remaining name is
# treated as one user's file.
users_in = sorted(
    entry
    for entry in os.listdir(input_directory)
    if entry != "temporal_clustered_user_meta.json"
)
print("Found {0} users in {1}".format(len(users_in), input_directory))

Found 329 users in /data/chime/geo/sandy_new_jersey_geovulnerable_contextual_stage2


In [22]:
def loader_function(args):
    """Load one user's line-delimited GeoJSON file into a date-sorted DataFrame.

    Parameters
    ----------
    args : tuple
        ``(user_geojsonl_file, input_directory, q)``: the file name, the
        directory that contains it, and an optional queue (or ``None``) that
        receives a ``1`` once the file has been parsed. Everything is packed
        into one tuple so the function can be handed to ``Pool.map_async``.

    Returns
    -------
    pandas.DataFrame
        One row per record, sorted by ``date``, with the columns ``geometry``,
        ``date``, ``text``, ``user``, ``c`` (cluster, 0 when absent), ``s``
        (speed, 0 when absent), ``time`` (``date`` formatted as
        ``%Y-%m-%dT%H:%M:%SZ``; the trailing ``Z`` is a literal, no timezone
        conversion happens here) and ``m`` (minutes since the earliest
        record, truncated to an int).

    Raises
    ------
    KeyError
        If a record lacks a required field; the message names the file and
        the line number so the offending record can be found.

    Notes
    -----
    The file is expected to contain at least one record.
    """
    user_geojsonl_file, input_directory, q = args
    path = os.path.join(input_directory, user_geojsonl_file)

    tweets = []
    # `with` guarantees the handle is closed even when a line fails to parse.
    with open(path, 'r') as in_file:
        for line_number, line in enumerate(in_file, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            t = json.loads(line)
            try:
                props = t['properties']
                tweets.append({
                    'geometry': t['geometry'] or None,
                    'date': pd.Timestamp(props['postedTime']),
                    'text': props['body'],
                    'user': props['actor']['preferredUsername'],
                    'c': props.get('cluster', 0),
                    's': props.get('speed', 0),
                })
            except KeyError as err:
                # Still a KeyError, but with enough context to locate the record.
                raise KeyError("{0}, line {1}: record is missing field {2}".format(
                    path, line_number, err)) from err

    if q is not None:
        q.put(1)

    df = pd.DataFrame(tweets).sort_values(by='date')

    # sort_values keeps the original index labels, so label 0 is the first
    # line of the file, not necessarily the earliest record. Select the
    # earliest record by position instead.
    first_date = df['date'].iloc[0]
    df['time'] = df['date'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
    df['m']    = df['date'].apply(lambda x: int((x - first_date).total_seconds()/60))

    return df

In [23]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()

args = [(i, input_directory, q) for i in users_in]
result = p.map_async(loader_function, args)

# monitor loop
while True:
    if result.ready():
        break
    else:
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/len(args)*100))
        time.sleep(0.5)
sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/len(args)*100))
users = result.get()
p.close()

Processed: 0, 0%

KeyError: 'postedTime'

# Write `geojson` files for the web

In [8]:
# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# removes the race between checking for it and creating it.
os.makedirs(output_directory, exist_ok=True)

In [9]:
def pad0(num):
    """Return ``num`` as a string, prefixed with "0" when ``num`` is below 10."""
    text = str(num)
    return "0" + text if num < 10 else text

In [10]:
def write_contextual_stream_geojson(args):
    """Write one user's DataFrame to ``<user>.geojson`` as a FeatureCollection.

    Parameters
    ----------
    args : tuple
        ``(u, output_directory, q)``: a DataFrame with the columns
        ``geometry``, ``time``, ``m``, ``user``, ``text``, ``c`` and ``s``,
        the directory to write into, and an optional queue (or ``None``)
        that receives a ``1`` once the file has been written. Everything is
        packed into one tuple so the function can be handed to
        ``Pool.map_async``.

    Returns
    -------
    int
        Always ``1``.

    Notes
    -----
    The output file is named after the lower-cased ``user`` value of the
    first row.
    """
    u, output_directory, q = args
    # Positional lookup, so the file name does not depend on the frame's
    # index labels.
    file_stem = u['user'].iloc[0].lower()

    features = []
    for _, row in u.iterrows():
        # Missing geometries (None/NaN) are written as JSON null.
        if pd.notnull(row['geometry']):
            geom = row['geometry']
        else:
            geom = None
        features.append({
            'type': 'Feature',
            'geometry': geom,
            'properties': {
                'time': row['time'],
                'm': row['m'],
                'user': row['user'],
                'text': row['text'],
                'c': row['c'],
                's': row['s'],
            },
        })
    geojson = {'type': 'FeatureCollection', 'features': features}

    with open(os.path.join(output_directory, file_stem + ".geojson"), 'w') as oFile:
        json.dump(geojson, oFile)

    if q is not None:
        q.put(1)
    return 1
#     start = "{0}-{1}-{2}T{3}:{4}".format(first_date.year,pad0(first_date.month),pad0(first_date.day),pad0(first_date.hour),pad0(first_date.minute))
#     end   = last_date.date()
#     print(file + "\t" +"http://www.localhost:4000/geojson-tweets?geojson=http://epic-analytics.cs.colorado.edu:9000/"+output_directory.replace('/data/www/','')+"/{0}.geojson&start={1}&end={2}&unit=minutes".format(file, start, end))

In [12]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()

args = [(i, output_directory, q) for i in users]
result = p.map_async(write_contextual_stream_geojson, args)

# monitor loop
while True:
    if result.ready():
        break
    else:
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/len(args)*100))
        time.sleep(0.5)
sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/len(args)*100))
result.get()
p.close()

Processed: 1197, 100%

## Deprecated
Use the function below only if you are working with `.geojsonl` files.