# Are users' home locations in Zone A?

In [1]:
import pandas as pd; import numpy as np; from multiprocessing import Pool, Manager; import numpy as np; import geopandas as gpd
import matplotlib.pyplot as plt; import seaborn as sns
import matplotlib, os, json, sys, time

#### Import the ZoneA Geometry

In [2]:
import fiona, shapely; from osgeo import ogr; from shapely.geometry import mapping, shape
# Pull the first feature out of the Zone A shapefile. The context manager
# closes the collection as soon as the feature has been read, and
# next(iter(c)) replaces the deprecated Collection.next() call.
with fiona.open('../ZoneA_Geometry/ZoneA/OGRGeoJSON.shp','r') as c:
    pol = next(iter(c))
# buffer(0) rebuilds the polygon (presumably to clean up invalid rings);
# the validity flag is shown as the cell output to confirm the result.
zoneA = shape(pol['geometry']).buffer(0)
zoneA.is_valid

True

#### Import the user metadata DF

In [3]:
# Read the per-user metadata table (HomeCluster, tweets, user) from JSON.
_user_meta = pd.read_json('../working_data/temporal_clustered_user_meta.json')

In [4]:
# Put the rows in index order, then preview the first three.
_user_meta = _user_meta.sort_index()
_user_meta.head(3)

Unnamed: 0,HomeCluster,tweets,user
0,1.0,14231,Andrewthemark
1,3.0,4934,frankieciv608
2,2.0,4566,NewYorkPuck


#### Import all of the individual user dataframes

In [5]:
# File names in the stage-2 directory (presumably one file per user);
# os.listdir returns them in arbitrary order, so sort for a stable order.
user_names = sorted(os.listdir('../working_data/spatiotemporal_clustered_stage2/'))
len(user_names)

4859

In [6]:
#Loader for the per-user files in the spatiotemporal_clustered_stage2 working directory
def loader_function(args):
    """Load one user's GeoJSON file into a GeoDataFrame.

    args : tuple ``(uFile, q)``
        uFile -- file name inside ../working_data/spatiotemporal_clustered_stage2/
        q     -- queue that receives one token when the file has been parsed;
                 the caller uses its size as a progress counter.

    Returns a GeoDataFrame with one row per feature: the feature's
    ``properties`` plus a shapely ``geometry`` and a ``date`` parsed to
    ``pd.Timestamp``.
    """
    uFile, q = args
    # Context manager so the file handle is released in every worker call.
    with open("../working_data/spatiotemporal_clustered_stage2/"+uFile,'r') as user_file:
        u = json.load(user_file)
    tweets = []
    for t in u['features']:
        t['properties']['geometry'] = shape(t['geometry'])
        t['properties']['date'] = pd.Timestamp(t['properties']['date'])
        tweets.append(t['properties'])
    q.put(1)
    return gpd.GeoDataFrame(tweets)

In [7]:
#Parallel runtime
p = Pool(30)
m = Manager()
q = m.Queue()   # each worker puts one token per finished file; qsize() is the progress count

args = [(i, q) for i in user_names]
try:
    result = p.map_async(loader_function, args)
    p.close()   # everything is submitted; no further tasks will be added

    # monitor loop: poll every 2 seconds until all files are loaded
    while not result.ready():
        size = q.qsize()
        # 100.0 forces true division, so the percentage is also right on Python 2
        progress = "\rProcessed: {0}, {1:.3g}%".format(size, 100.0*size/len(args))
        # load.log is overwritten on each poll and always holds the latest status
        with open("load.log",'w') as log:
            log.write(progress + "\n")
        sys.stderr.write(progress)
        time.sleep(2)

    users = result.get()   # re-raises here if any worker failed
finally:
    # Reap the worker processes whether loading succeeded, failed or was interrupted
    p.terminate()
    p.join()

Processed: 4859, 100%

In [8]:
# Largest user frames first (in-place sort by row count).
users.sort(key=len, reverse=True)

`users` is a list of user dataframes. Now find which users have _home locations_ in Zone A.

## Calculate centroids of all user clusters

In [9]:
from shapely.geometry import MultiPoint

In [10]:
def calculate_cluster_centroids(userDF):
    """Attach the centroid of each cluster to every row of a user frame.

    input :  User DataFrame with 'cluster' and 'geometry' columns
    output:  copy of the User DataFrame with additional column "cluster_center";
             rows whose cluster is NaN are relabelled -1 and get a NaN center
    """
    df = userDF.copy()
    # Give unclustered rows the sentinel label -1 so groupby keeps them
    # (NaN keys would be dropped from the grouping).
    df['cluster'] = df['cluster'].replace(np.nan, -1)
    centroids = (
        df.groupby('cluster', as_index=False)
          .aggregate({'geometry': lambda x: MultiPoint(list(x)).centroid})
          .rename(columns={'geometry': 'cluster_center'})
    )
    # The -1 group is not a real cluster, so blank its center. A boolean mask
    # replaces the removed DataFrame.set_value call, does nothing when there
    # is no -1 group, and keeps 'cluster' a plain column so the merge key
    # below is unambiguous.
    centroids.loc[centroids['cluster'] == -1, 'cluster_center'] = np.nan
    return df.merge(centroids, on='cluster')

In [11]:
# Run calculate_cluster_centroids over every user frame, collecting the
# results in the same order as `users`.
len_users = len(users)
_users_with_centroids = []
for idx, u in enumerate(users):
    _users_with_centroids.append( calculate_cluster_centroids(u) )
    # "\r" rewrites the same stderr line; idx is 0-based, so the last value
    # shown is len_users - 1.
    sys.stderr.write("\r{0} of {1}".format(idx,len_users))

4858 of 4859

In [12]:
# Spot-check one user's frame (position 100) for the new cluster_center column.
_users_with_centroids[100]

Unnamed: 0,cluster,coords,date,day_cluster,geo_delta,geometry,speed,text,time_delta,user,cluster_center
0,1,"[-74.01399389, 40.70330583]",2012-10-19 15:12:49+00:00,3,,POINT (-74.01399388999999 40.70330583),,@CaleOng 💋💋💋💋💋💋💋,,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
1,1,"[-74.0139728, 40.70336705]",2012-10-19 15:15:41+00:00,3,7.035672,POINT (-74.0139728 40.70336705),0.040905,“@48tweetsofpower: Crush your enemy totally.”,172.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
2,1,"[-74.0139389, 40.70337817]",2012-10-19 15:17:50+00:00,3,3.113689,POINT (-74.0139389 40.70337817),0.024137,"“@itsMichaelJ: If someone tells you, ""you can'...",129.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
3,1,"[-74.01414353, 40.7033991]",2012-10-19 15:20:20+00:00,3,17.405865,POINT (-74.01414353 40.7033991),0.116039,@labellagee83 I 👏can't 👏RT 👏your stuff👏 boo🙍,150.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
4,1,"[-74.01416371, 40.70338103]",2012-10-19 15:20:57+00:00,3,2.632680,POINT (-74.01416371000001 40.70338103),0.071154,Wow this month is FINISHO👏,37.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
5,1,"[-74.01401962, 40.70334224]",2012-10-19 15:22:21+00:00,3,12.889379,POINT (-74.01401962 40.70334224),0.153445,I need this year to go by now...I have shit to...,84.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
6,1,"[-74.01404023, 40.70334252]",2012-10-19 15:22:45+00:00,3,1.737629,POINT (-74.01404023000001 40.70334252),0.072401,I can use a glass of cold pink moscato,24.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
7,1,"[-74.0140187, 40.70329655]",2012-10-19 15:23:24+00:00,3,5.424264,POINT (-74.01401869999999 40.70329655),0.139084,&amp; Thy Shall Have...🙌,39.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
8,1,"[-74.01400036, 40.70329241]",2012-10-19 15:24:33+00:00,3,1.613081,POINT (-74.01400036 40.70329241),0.023378,@CaleOng youuuuuuuuu sometimes meeeeeeeee (ur ...,69.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)
9,1,"[-74.0139813, 40.70321576]",2012-10-19 15:30:57+00:00,3,8.673208,POINT (-74.0139813 40.70321576),0.022586,@CaleOng &amp; that's a FACT! Loll You know Im...,384.0,EffinwitESH,POINT (-74.01406915122905 40.70328859256985)


Now merge `_user_meta` with each user's home-cluster center location.

In [13]:
# Placeholder column; it stays None for users that never get a home-cluster center.
_user_meta['hc_center'] = None

In [14]:
# Copy each user's home-cluster centroid into _user_meta['hc_center'].
len_users = len(_users_with_centroids)
for idx, u in enumerate(_users_with_centroids):
    user_name = u.user[0]
    u_m = _user_meta[_user_meta.user == user_name]
    # Keep the metadata row label in its own name so the loop counter `idx`
    # is not overwritten and the progress line below counts iterations.
    meta_label = u_m.index[0]
    hc = u_m.HomeCluster.values[0]
    # Only positive HomeCluster values name a real cluster; NaN fails this test too.
    if hc > 0:
        center = u[u.cluster == hc].cluster_center.values[0]
        # Address the cell by index label and column name instead of
        # treating the label as a row position and hard-coding column 3.
        _user_meta.at[meta_label, 'hc_center'] = center
    sys.stderr.write("\r{0} of {1}".format(idx, len_users))

4858 of 4859

In [15]:
# Preview the first two rows, including the hc_center column.
_user_meta.head(2)

Unnamed: 0,HomeCluster,tweets,user,hc_center
0,1.0,14231,Andrewthemark,POINT (-73.69255456263605 40.67338539101333)
1,3.0,4934,frankieciv608,POINT (-73.81751579057131 40.82403772085716)


## Determine whose home cluster center is in Zone A

In [16]:
def insideZoneA(p):
    """Return True when geometry ``p`` lies within the ``zoneA`` polygon.

    p : a geometry exposing ``.within()``, or a missing value (None / NaN).

    Missing values return False instead of raising, so the function can be
    applied to a column that still holds placeholders.
    """
    # Identity test for None avoids invoking the geometry's own __eq__;
    # `p != p` is true only for NaN.
    if p is None or (isinstance(p, float) and p != p):
        return False
    return p.within(zoneA)

In [17]:
# Flag every user whose home-cluster center falls inside Zone A.
_user_meta['inZoneA'] = _user_meta['hc_center'].apply(insideZoneA)

In [18]:
# Preview the first three users flagged as living in Zone A.
_user_meta[_user_meta['inZoneA']].head(3)

Unnamed: 0,HomeCluster,tweets,user,hc_center,inZoneA
4,2.0,3462,NZavaa,POINT (-73.80392010800064 40.81081586865448),True
15,2.0,2573,_an_oak_tree_,POINT (-74.00711654755784 40.74712723907435),True
42,2.0,1718,MyLuvisKING,POINT (-74.01307974015074 40.71548709588853),True


In [19]:
# GeoJSON strings for the first five Zone A home centers.
[json.dumps(mapping(center)) for center in _user_meta.query('inZoneA').head(5).hc_center]

['{"coordinates": [-73.80392010800064, 40.81081586865448], "type": "Point"}',
 '{"coordinates": [-74.00711654755784, 40.74712723907435], "type": "Point"}',
 '{"coordinates": [-74.01307974015074, 40.715487095888534], "type": "Point"}',
 '{"coordinates": [-73.79006214377567, 40.85626400344152], "type": "Point"}',
 '{"coordinates": [-74.0056795964912, 40.74782570175439], "type": "Point"}']

In [20]:
# User names whose home-cluster center is inside Zone A.
vulnerable_users = _user_meta.loc[_user_meta['inZoneA'], 'user'].tolist()

In [None]:
# Create the output directory only when it is missing, so re-running this
# cell does not raise.
if not os.path.isdir('../working_data/geovulnerable_users'):
    os.mkdir('../working_data/geovulnerable_users')

In [214]:
from bson import json_util
# Write the frame of every Zone A user to <lower-cased user name>.geojson.
# Build the set once so each membership test is O(1) instead of a list scan.
vulnerable_lookup = set(vulnerable_users)
for idx, u in enumerate(users):
    user_name = u.user[0]
    if user_name in vulnerable_lookup:
        with open('../working_data/geovulnerable_users/'+user_name.lower()+'.geojson','w') as oFile:
            # json_util.default is passed through as the JSON `default` hook
            oFile.write(u.to_json(default=json_util.default))
    sys.stderr.write("\r{0} processed".format(idx))

4858 processed