# Turn a directory of per-user tweet GeoJSON files into web-ready `geojson` that can be easily visualized on a map!

In [16]:
import pandas as pd; import numpy as np; from multiprocessing import Pool, Manager; import numpy as np;
import fiona, shapely; from osgeo import ogr; from shapely.geometry import mapping, shape
import matplotlib.pyplot as plt; import seaborn as sns
import matplotlib, os, json, sys, time, datetime

In [17]:
# Directory containing one JSON file per user; read by the loading cells below.
input_directory  = "/data/chime/geo/zone_a_full_contexual/stage2/"
# Directory that receives one .geojson file per user for the web.
output_directory = "/data/www/chime/movement-derivation/sandy_zone_a_contextual/"

In [18]:
# Gather the per-user file names in sorted order, leaving out the aggregate
# metadata file that lives in the same directory.
users_in = sorted(
    entry
    for entry in os.listdir(input_directory)
    if entry != "temporal_clustered_user_meta.json"
)
print("Found {0} users in {1}".format(len(users_in), input_directory))

Found 1188 users in /data/chime/geo/zone_a_full_contexual/stage2/


In [19]:
def loader_function(args):
    """Load one user's feature-collection JSON file into a date-sorted DataFrame.

    Parameters
    ----------
    args : tuple
        ``(uFile, path, q)`` packed into one argument so the function can be
        used with ``Pool.map_async``:

        * ``uFile`` -- file name of the user's JSON file.
        * ``path``  -- directory that contains ``uFile``.
        * ``q``     -- queue that receives a ``1`` when the file has been
          processed (used as a progress counter), or ``None`` to skip this.

    Returns
    -------
    pandas.DataFrame
        One row per feature, built from the feature's ``properties`` and
        sorted by ``date``, with these columns added/overwritten:

        * ``geometry`` -- shapely geometry built from the feature geometry, or
          ``None`` when the feature has no geometry.
        * ``date``     -- ``pd.Timestamp`` parsed from the ``date`` property.
        * ``time``     -- ``date`` formatted as ``%Y-%m-%dT%H:%M:%SZ``.
        * ``h``, ``m`` -- whole hours / minutes elapsed since the user's
          earliest tweet.
    """
    uFile, path, q = args

    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(path, uFile), 'r') as fh:
        u = json.load(fh)

    tweets = []
    for t in u['features']:
        props = t['properties']
        if t.get('geometry'):
            props['geometry'] = shape(t['geometry'])
        else:
            # Guarantee the column exists even when no tweet is geotagged.
            props.setdefault('geometry', None)
        props['date'] = pd.Timestamp(props['date'])
        tweets.append(props)

    df = pd.DataFrame(tweets)
    df = df.sort_values(by='date')

    # Positional lookup: after sorting, the *label* 0 still points at the
    # first row of the file, which is not necessarily the earliest tweet.
    first_date = df['date'].iloc[0]

    # NOTE(review): the trailing "Z" is a literal, so this assumes the
    # timestamps are UTC -- confirm against the input data.
    df['time'] = df['date'].apply(lambda x: x.strftime('%Y-%m-%dT%H:%M:%SZ'))
    df['h']    = df['date'].apply(lambda x: int((x - first_date).total_seconds()/3600))
    df['m']    = df['date'].apply(lambda x: int((x - first_date).total_seconds()/60))

    if q is not None:
        q.put(1)
    return df

In [20]:
# Debug: run the loader in-process on the first user file (passing None as the
# queue skips the progress counter) to inspect the resulting DataFrame.
loader_function((users_in[0], input_directory, None))

Unnamed: 0,cluster,cluster_center,coords,date,day_cluster,geo_delta,geometry,home_cluster_id,speed,text,time_delta,tweet_id,uid,user,time,h,m
0,,,,2012-09-01 04:13:15+00:00,6,,,2.0,,“@huny: michael jackson dance pordy (@ Le Pois...,-7861391.0,241750544262242304,17418108,11Zette17,2012-09-01T04:13:15Z,0,0
1,,,,2012-09-01 04:14:03+00:00,6,,,,,Butterflies.,48.0,241750745878241280,17418108,11Zette17,2012-09-01T04:14:03Z,0,0
2,,,,2012-09-01 04:20:48+00:00,6,,,,,#spotted @and_wayne!,405.0,241752442436452353,17418108,11Zette17,2012-09-01T04:20:48Z,0,7
3,,,,2012-09-01 04:40:23+00:00,6,,,,,Spotted the big sis @dj_diva + cousin @Carnegro,1175.0,241757371699978240,17418108,11Zette17,2012-09-01T04:40:23Z,0,27
4,,,,2012-09-01 04:45:50+00:00,6,,,,,Enjoy Yourself.,327.0,241758742423674880,17418108,11Zette17,2012-09-01T04:45:50Z,0,32
5,,,,2012-09-01 04:53:45+00:00,6,,,,,Finally spotted my @huny!,475.0,241760734059573248,17418108,11Zette17,2012-09-01T04:53:45Z,0,40
6,,,,2012-09-01 05:08:03+00:00,7,,,,,Baby Be Mine. #favoriteMJJsong,858.0,241764334584356864,17418108,11Zette17,2012-09-01T05:08:03Z,0,54
7,1.0,"{""type"": ""Point"", ""coordinates"": [-74.00054383...","[-74.00054383, 40.72988293]",2012-09-01 05:22:29+00:00,7,,POINT (-74.00054383 40.72988293),,,The fact they are playing Cash Money's Greates...,,241767964918034432,17418108,11Zette17,2012-09-01T05:22:29Z,1,69
8,,,,2012-09-01 06:34:42+00:00,7,,,,,I could dance to Michael Jackson all night...h...,5199.0,241786141005725696,17418108,11Zette17,2012-09-01T06:34:42Z,2,141
9,-1.0,,"[-73.9999298, 40.72852827]",2012-09-01 06:35:30+00:00,7,159.269799,POINT (-73.9999298 40.72852827),,0.036355,Unified for Heal the World @ Le Poisson Rouge...,4381.0,241786342252634112,17418108,11Zette17,2012-09-01T06:35:30Z,2,142


In [21]:
# Parallel runtime: load every user file in a pool of worker processes.
# Each worker puts a token on the managed queue when it finishes a file, so the
# queue size doubles as a progress counter. The context managers make sure the
# pool workers and the manager's server process are cleaned up even when a
# worker raises.
with Manager() as m, Pool(30) as p:
    q = m.Queue()

    args = [(i, input_directory, q) for i in users_in]
    result = p.map_async(loader_function, args)

    # Guard the percentage against an empty input list.
    total = max(len(args), 1)

    # monitor loop; wait() returns as soon as the work is done instead of
    # always sleeping the full interval.
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100))
        result.wait(0.5)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/total*100))

    # get() re-raises any exception thrown inside a worker.
    users = result.get()

Processed: 1188, 100%

In [22]:
# Show the last rows of the first loaded user's DataFrame.
users[0].tail()

Unnamed: 0,cluster,cluster_center,coords,date,day_cluster,geo_delta,geometry,home_cluster_id,speed,text,time_delta,tweet_id,uid,user,time,h,m
5700,,,,2012-12-01 02:28:43+00:00,6,,,,,Yellow yachts...,1762.0,274701529838141440,17418108,11Zette17,2012-12-01T02:28:43Z,2182,130935
5701,,,,2012-12-01 02:30:48+00:00,6,,,,,Swag Surfin in Manhattan,125.0,274702054113542144,17418108,11Zette17,2012-12-01T02:30:48Z,2182,130937
5702,,,,2012-12-01 03:10:21+00:00,6,,,,,Dirty South Good Time w/ deerene_lcc + @tdotwh...,2373.0,274712009151492099,17418108,11Zette17,2012-12-01T03:10:21Z,2182,130977
5703,,,,2012-12-01 03:36:14+00:00,6,,,,,...I'm a Southern Gurl... http://t.co/HXT3qlcX,1553.0,274718523564105730,17418108,11Zette17,2012-12-01T03:36:14Z,2183,131002
5704,-1.0,,"[-73.997361, 40.737604]",2012-12-01 03:56:26+00:00,6,532.815569,POINT (-73.997361 40.737604),,0.044184,"Why are her sliders this big tho.., @ Good Stu...",12059.0,274723604321669120,17418108,11Zette17,2012-12-01T03:56:26Z,2183,131023


# Write `geojson` files for the web

In [23]:
# Create the output directory (and any missing parents). exist_ok avoids the
# race between checking for the directory and creating it.
os.makedirs(output_directory, exist_ok=True)

In [24]:
def _is_missing(value):
    """Return True for None/NaN/NaT scalars; non-scalar objects are never missing."""
    return bool(pd.api.types.is_scalar(value) and pd.isnull(value))


def _json_safe(value):
    """Convert a cell value into something ``json.dump`` writes as valid JSON.

    Missing values and non-finite floats become ``None`` (JSON ``null``),
    numpy scalars are unwrapped to native Python numbers, and anything else is
    returned unchanged.
    """
    if _is_missing(value):
        return None
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and not np.isfinite(value):
        # json.dump would emit Infinity/NaN, which JSON parsers reject.
        return None
    return value


def write_geojson(args):
    """Write one user's DataFrame to ``<output_directory>/<user>.geojson``.

    Parameters
    ----------
    args : tuple
        ``(u, output_directory, q)`` packed into one argument so the function
        can be used with ``Pool.map_async``:

        * ``u`` -- DataFrame with the columns ``user``, ``time``, ``h``, ``m``,
          ``text``, ``speed`` and ``cluster``, and optionally ``geometry``.
        * ``output_directory`` -- directory the file is written to.
        * ``q`` -- queue that receives a ``1`` once the file is written (used
          as a progress counter), or ``None`` to skip this.

    Returns
    -------
    int
        Always ``1``.

    Notes
    -----
    The file name is the lower-cased ``user`` value of the first row, so two
    users whose names differ only by case would write to the same file.
    """
    u, output_directory, q = args

    # Positional lookup: the frame is date-sorted, so label 0 is not
    # guaranteed to exist or to be the first row.
    file_stem = u['user'].iloc[0].lower()

    features = []
    for _, row in u.iterrows():
        geometry = row.get('geometry')
        geom = None if _is_missing(geometry) else mapping(geometry)
        features.append({'type':'Feature',
                         'geometry':geom,
                         'properties':{'time':_json_safe(row['time']),
                                       'h':_json_safe(row['h']),
                                       'm':_json_safe(row['m']),
                                       'user':_json_safe(row['user']),
                                       'text':_json_safe(row['text']),
                                       's':_json_safe(row['speed']),
                                       'c':_json_safe(row['cluster'])
                                       }
                         })
    geojson = {'type':'FeatureCollection', 'features':features}

    with open(os.path.join(output_directory, file_stem+".geojson"),'w') as oFile:
        json.dump(geojson,oFile)

    if q is not None:
        q.put(1)
    return 1

In [25]:
# Parallel runtime: write one .geojson file per user in a pool of worker
# processes. Each worker puts a token on the managed queue when it finishes a
# user, so the queue size doubles as a progress counter. The context managers
# make sure the pool workers and the manager's server process are cleaned up
# even when a worker raises.
with Manager() as m, Pool(30) as p:
    q = m.Queue()

    args = [(i, output_directory, q) for i in users]
    result = p.map_async(write_geojson, args)

    # Guard the percentage against an empty input list.
    total = max(len(args), 1)

    # monitor loop; wait() returns as soon as the work is done instead of
    # always sleeping the full interval.
    while not result.ready():
        size = q.qsize()
        sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(size, size/total*100))
        result.wait(0.5)
    sys.stderr.write("\rProcessed: {0}, {1:.3g}%".format(q.qsize(), q.qsize()/total*100))

    # get() re-raises any exception thrown inside a worker.
    result.get()

Processed: 1188, 100%