In [1]:
import json, iso8601, fiona, pprint

# Load, Clean, Sort, Export from GNIP

This notebook: 

1. Loads raw gnip data
1. Filters for a geotag
1. Identifies all tweets within an area of interest
1. Finds Unique Users
1. Writes full GNIP GEOJSONL files per user for users with at least 3 tweets in the area of interest

## [Click here for Spark Status](http://epic-analytics.cs.colorado.edu:4040/jobs/)

In [2]:
# Read every raw GNIP line from the matching input files.
raw_strings = sc.textFile('/data/chime/matthew/gnip-geo/ws*/*')

# Drop empty lines so that only non-empty strings reach the JSON parser.
strings = raw_strings.filter(lambda line: len(line) > 0)

# Parse each remaining line into a Python object.
jsons  = strings.map(lambda line: json.loads(line))

# Keep only records without a top-level 'info' key
# (presumably those are GNIP bookkeeping records rather than tweets -- TODO confirm).
tweet_jsons = jsons.filter(lambda record: not ('info' in record.keys()))

## Step 1. Load all the Tweets!
### 2 Crucial Steps
1. Many tweets do not carry a `geo` field with a lat/lon that works for our purposes, so those are filtered out first
1. The GNIP `geo` field stores its coordinates backwards from convention, so they are reversed before use

In [3]:
def fix_geo(t):
    """Return a copy of tweet `t` whose `geo` coordinates are in reversed order.

    The notebook notes that the GNIP `geo` field is backwards from convention,
    so the coordinate pair is flipped before any geometry is built from it.

    The input tweet is left untouched: only the top-level dict and its `geo`
    dict are copied (shallow copies), and a new coordinate list is created.
    Reversing in place would make the function non-idempotent -- calling it
    twice on the same object would silently flip the coordinates back.

    Parameters:
        t: dict for one tweet; must contain t['geo']['coordinates'].

    Returns:
        A new dict equal to `t` except that ['geo']['coordinates'] is reversed.

    Raises:
        KeyError: if `t` has no 'geo' key or the geo has no 'coordinates' key.
    """
    fixed_geo = dict(t['geo'])
    fixed_geo['coordinates'] = list(t['geo']['coordinates'])[::-1]
    fixed_tweet = dict(t)
    fixed_tweet['geo'] = fixed_geo
    return fixed_tweet

# Only tweets that carry a 'geo' key can be placed on the map.
tweets_with_geo = tweet_jsons.filter(lambda tweet: 'geo' in tweet)

# Flip each tweet's geo coordinates (see fix_geo).
geo_tweets = tweets_with_geo.map(fix_geo)

### Check that this is working so far

In [4]:
# Pull a single tweet back to the driver and pretty-print it to eyeball the result.
pprint.pprint(geo_tweets.take(1)[0])

{u'actor': {u'displayName': u'DJDADDY',
            u'favoritesCount': 6131,
            u'followersCount': 1016,
            u'friendsCount': 350,
            u'id': u'id:twitter.com:1097299662',
            u'image': u'https://pbs.twimg.com/profile_images/733006233481928704/w7EXSmBe_normal.jpg',
            u'languages': [u'en'],
            u'link': u'http://www.twitter.com/DJDADDY_PRO',
            u'links': [{u'href': u'http://www.djdaddypro.com',
                        u'rel': u'me'}],
            u'listedCount': 11,
            u'location': {u'displayName': u'Hollywood, FL',
                          u'objectType': u'place'},
            u'objectType': u'person',
            u'postedTime': u'2013-01-17T05:47:27.000Z',
            u'preferredUsername': u'DJDADDY_PRO',
            u'statusesCount': 5875,
            u'summary': u'Calienta El Party Con Dj Daddy\u270c\ufe0f',
            u'twitterTimeZone': u'Atlantic Time (Canada)',
            u'utcOffset': u'-10800',
           

## Optional: Export subsets of the data as `.geojsonl` files
This is not by user, this is all tweets in one file (i.e., BIG)

## Step 2: Group tweets by user

In [5]:
# Bucket every geotagged tweet under its author's id; the grouping is reused
# repeatedly below, so mark it for caching straight away.
tweets_gb_user = geo_tweets.groupBy(lambda tweet: tweet['actor']['id']).cache()
# cache() alone does not compute anything: count() is what actually runs the job.
tweets_gb_user.count()

120108

#### Check on the status of this operation; you should see a tuple of `(user_id, iterable)`

In [32]:
# Show one (user_id, iterable-of-tweets) pair from the grouped RDD.
pprint.pprint(tweets_gb_user.take(1)[0])

(u'id:twitter.com:45516704',
 <pyspark.resultiterable.ResultIterable object at 0x7f01db6a0890>)


In [33]:
def write_out_simplified_geo_contextual(tuple_of_uid_tweets):
    """Write one user's tweets, oldest first, as a simplified .geojsonl file.

    Parameters:
        tuple_of_uid_tweets: (user_id, iterable_of_tweets) pair, as produced by
            grouping the tweets by actor id. Every tweet must have a
            'postedTime' that iso8601 can parse.

    The output file is named after the lower-cased preferredUsername of the
    user's earliest tweet and lands in
    ../working_data/simplified_geo_contextual_all_users/.

    NOTE: a function with this same name is defined again further down in the
    notebook; whichever cell ran last wins.
    """
    u_tweets = sorted(tuple_of_uid_tweets[1],
                      key=lambda t: iso8601.parse_date(t['postedTime']))
    fileName = u_tweets[0]['actor']['preferredUsername'].lower()
    # write_simplified_tweets_to_geojsonl already prepends '../working_data/',
    # so only the sub-directory is passed here. Passing the prefix again gave
    # '../working_data/../working_data/...' -- same file, but a confusing path.
    write_simplified_tweets_to_geojsonl('simplified_geo_contextual_all_users/'+fileName,u_tweets)

### Optional: Write the `geo-tweet-streams` for EVERY user 

## Step 3: Geographic Filtering

Load the shapefile for Evacuation Zone (or whatever your bounds should be)

Currently this is primed for Hurricane Sandy, but for Matthew, just add the FL Evac Zone data

In [34]:
from shapely.geometry import mapping, shape
import fiona

In [35]:
# Areas of interest. Each entry is a dict with:
#   name   - label for the zone
#   outDir - directory that per-user output for this zone is written to
#   inFile - GeoJSON file the zone's geometry is read from
# A 'geom' key is added to each dict later, when the geometries are loaded.
GEOVULNERABLE_AREAS = list(
    dict(name=zone_name, outDir=out_dir, inFile=in_file)
    for zone_name, out_dir, in_file in (
        ("FL_Brevard_ZoneA",
         "/data/chime/geo2/FL/brevard_zone_a",
         "../EvacuationZones/Florida/brevard_zone_a_hull.geojson"),
        ("FL_DuvallCounty",
         "/data/chime/geo2/FL/duvall_county",
         "../EvacuationZones/Florida/DuvallCountyPlus_Hull.geojson"),
        ("FL_Indian_Martin_Lucie_Counties",
         "/data/chime/geo2/FL/indian_martin_lucie_counties",
         "../EvacuationZones/Florida/martin_indian_lucie_hull.geojson"),
        ("FL_Martin_Palm_Beach_Counties",
         "/data/chime/geo2/FL/martin_palm_beach_counties",
         "../EvacuationZones/Florida/martin_palm_beach_hull.geojson"),
        ("FL_Martin_Palm_Beach_Counties_Inland",
         "/data/chime/geo2/FL/martin_palm_beach_counties_inland",
         "../EvacuationZones/Florida/martin_palm_beach_inland_hull.geojson"),
        ("FL_Volusia_County",
         "/data/chime/geo2/FL/volusia_county",
         "../EvacuationZones/Florida/VolusiaPlus_Hull.geojson"),
    )
)

In [47]:
## For GeoJSON Files

# For every area of interest: read its GeoJSON feature, attach the parsed
# shapely geometry to the zone dict under 'geom', print a quick sanity report
# (name, area, validity), and re-export the geometry on its own as GeoJSON.
for zone in GEOVULNERABLE_AREAS:
    print(zone['name'])
    # Context manager so the input handle is closed as soon as it is parsed
    # (the bare json.load(open(...)) left it open until garbage collection).
    with open(zone['inFile'],'r') as zone_file:
        zone_geojson = json.load(zone_file)
    # The input is read as a single feature: only its 'geometry' member is used.
    zone['geom'] = shape(zone_geojson['geometry'])
    print(zone['geom'].area)
    print(zone['geom'].is_valid)
    with open("/data/www/jennings/geovulnerable_geoms/"+zone['name']+".geojson",'w') as o:
        json.dump(mapping(zone['geom']),o)

FL_Brevard_ZoneA
0.176479218337
True
FL_DuvallCounty
0.543849526144
True
FL_Indian_Martin_Lucie_Counties
0.102402136596
True
FL_Martin_Palm_Beach_Counties
0.0590595896808
True
FL_Martin_Palm_Beach_Counties_Inland
0.145108061803
True
FL_Volusia_County
0.115224000409
True


In [52]:
# Broadcast the zone list (including the 'geom' entries added when the
# geometries were loaded) so functions run on the workers can read it
# through zonesBroadcast.value.
zonesBroadcast = sc.broadcast(GEOVULNERABLE_AREAS)

### Iterate over each user's grouped tweets and determine which zones (if any) that user belongs in

In [53]:
def checkZoneWriteUser(points, tweets, zone, outDir, minTweets=3):
    """
    Given a list of points, tweets, the zone, and an output directory, check if
    at least `minTweets` of the points land in the zone. If so, write the user
    to disk and return.

    Parameters:
        points:    geometries for the user's tweets; each is passed to
                   zone.contains().
        tweets:    the user's tweets (list of dicts). All of them are written,
                   not only the ones inside the zone. The output file is named
                   after tweets[0]'s lower-cased preferredUsername.
        zone:      object exposing contains(point).
        outDir:    directory the user's .geojsonl file is written into.
        minTweets: number of in-zone points required before the user is
                   written (default 3, the previously hard-coded threshold).
                   Values below 1 behave like 1, because the check only
                   happens after a point has matched.

    Returns:
        True if the threshold was reached and the user was written,
        False otherwise.
    """
    inBounds = 0
    for p in points:
        if zone.contains(p):
            inBounds += 1
            # Stop at the first moment the threshold is met: the remaining
            # points cannot change the outcome.
            if inBounds >= minTweets:
                write_full_tweets_to_geojsonl(outDir+'/'+tweets[0]['actor']['preferredUsername'].lower(),tweets)
                return True
    return False

In [54]:
def processZone(userIterable):
    """Test one user's tweets against every broadcast zone.

    Parameters:
        userIterable: (user_id, iterable_of_tweets) pair from the grouped RDD.
            Each tweet must have a 'geo' member that shapely's shape() accepts.

    For each zone in zonesBroadcast.value, checkZoneWriteUser is called with
    the user's points and tweets, the zone's 'geom' and the zone's 'outDir'.
    A user may therefore be written to more than one zone directory.
    """
    # Materialise the tweets once and derive the points from that list, so the
    # grouped iterable is consumed a single time and points[i] always
    # corresponds to u_tweets[i].
    u_tweets = list(userIterable[1])
    t_points = [shape(t['geo']) for t in u_tweets]

    for zone in zonesBroadcast.value:
        checkZoneWriteUser(t_points, u_tweets, zone['geom'], zone['outDir'])

### Boom! Run it!

In [56]:
# Run processZone once per (user_id, tweets) group; foreach is used for its
# side effect (files written by processZone), so there is no result to display.
# NOTE: write_full_tweets_to_geojsonl is defined in a later section of this
# notebook and must already have been run.
tweets_gb_user.foreach(processZone)

## Available Export Functions (Requires Local Arrays)

> Be sure to run the cells in this section first: the export calls above depend on the functions defined here

In [44]:
def write_full_tweets_to_geojsonl(fileName, tweets):
    """Write tweets to `fileName`.geojsonl, one GeoJSON Feature per line.

    Parameters:
        fileName: output path without the '.geojsonl' extension.
        tweets:   iterable of tweet dicts, each with a 'geo' member.

    Each Feature uses the tweet's 'geo' as its geometry and the complete
    tweet as its properties, so the files can get large.
    """
    target = fileName+'.geojsonl'
    with open(target,'w') as sink:
        for tweet in tweets:
            feature = {
                'type':'Feature',
                'geometry':tweet['geo'],
                'properties':tweet
            }
            line = json.dumps(feature)
            sink.write(line+"\n")

In [45]:
def write_simplified_tweets_to_geojsonl(fileName, tweets):
    """Write a trimmed-down GeoJSON Feature per tweet to ../working_data/`fileName`.geojsonl.

    Parameters:
        fileName: path relative to '../working_data/', without the
                  '.geojsonl' extension (the prefix is added here).
        tweets:   iterable of tweet dicts.

    The tweet's 'location' and the actor's 'location' are optional and are
    written as null when absent. Every other field that is read ('geo', 'id',
    'postedTime', 'body', 'generator', 'verb', 'twitter_entities', and the
    actor's 'preferredUsername', 'id', 'postedTime', 'summary', 'utcOffset')
    must be present, otherwise a KeyError is raised.
    """
    target = '../working_data/'+fileName+'.geojsonl'
    with open(target,'w') as sink:
        for tweet in tweets:
            actor = tweet['actor']
            # .get() yields None for a missing key, matching the optional fields.
            tweet_location = tweet.get('location')
            actor_location = actor.get('location')
            properties = {
                'user':actor['preferredUsername'],
                'uid' :actor['id'],
                'u_loc':actor_location,
                'u_reg':actor['postedTime'],
                'u_sum':actor['summary'],
                'tid' :tweet['id'],
                'loc' :tweet_location,
                'time':tweet['postedTime'],
                'text':tweet['body'],
                'source':tweet['generator'],
                'verb':tweet['verb'],
                'meta':tweet['twitter_entities'],
                'u_utc':actor['utcOffset']
            }
            feature = {
                'type':'Feature',
                'geometry':tweet['geo'],
                'properties':properties
            }
            sink.write(json.dumps(feature)+"\n")

In [46]:
def write_bare_tweets_to_geojsonl(fileName, tweets):
    """Write a minimal GeoJSON Feature per tweet to ../working_data/`fileName`.geojsonl.

    Parameters:
        fileName: path relative to '../working_data/', without the
                  '.geojsonl' extension (the prefix is added here).
        tweets:   iterable of tweet dicts with 'geo', 'postedTime', 'body'
                  and actor 'preferredUsername'.

    Only the user name, the post time and the text are kept as properties.
    """
    target = '../working_data/'+fileName+'.geojsonl'
    with open(target,'w') as sink:
        for tweet in tweets:
            properties = {
                'user':tweet['actor']['preferredUsername'],
                'time':tweet['postedTime'],
                'text':tweet['body']
            }
            feature = {
                'type':'Feature',
                'geometry':tweet['geo'],
                'properties':properties
            }
            sink.write(json.dumps(feature)+"\n")

# Deprecated...

In [65]:
#Set as a broadcast variable for spark
# NOTE(review): this repeats the broadcast already created earlier in the
# notebook and rebinds the same name.
zonesBroadcast = sc.broadcast(GEOVULNERABLE_AREAS)
# The commented-out checks below refer to `zoneBroadcast` (singular) and call
# .contains() directly on its value; zonesBroadcast.value here is a list of
# zone dicts, so they would need adapting before being re-enabled.
# #Testing (For ZoneA):
# in_zone_a = shape({"type": "Point", "coordinates": [-73.75336647033691,40.599095409829815]})
# in_bounds = shape({'type': "Point", 'coordinates': [-73.99154663085938,40.361195540839]})
# print in_bounds
# print "TRUE?", zoneBroadcast.value.contains(in_zone_a)
# print "TRUE?", in_zone_a.within(zoneBroadcast.value)

# out_of_bounds = shape({"type": "Point","coordinates": [-73.99753114562988,40.73093368341445]})
# print out_of_bounds
# print "FALSE?", zoneBroadcast.value.contains(out_of_bounds)

In [None]:
# Keep only the tweets whose point falls inside Zone A.
# NOTE(review): `zoneABroadcast` is not defined anywhere in the visible
# notebook (only `zonesBroadcast` is) -- confirm where it comes from.
# NOTE(review): this filters `tweets_with_geo`, i.e. the tweets from before
# fix_geo was mapped over them -- confirm the coordinate order is the intended one.
inZoneA = tweets_with_geo.filter(lambda t: zoneABroadcast.value.contains( shape(t['geo']) ) )

In [None]:
# Group the in-zone tweets by author id and collect the result to the driver,
# giving a local list of (user_id, iterable_of_tweets) pairs.
inZoneA_gb_user = inZoneA.groupBy(lambda t: t['actor']['id']).collect()

In [None]:
# The first element of each grouped pair is the user id; the list length is
# therefore the number of distinct users with a tweet in the zone.
users_with_at_least_one_tweet_in_zoneA = [u[0] for u in inZoneA_gb_user]
len(users_with_at_least_one_tweet_in_zoneA)

In [None]:
# json.dump(users_with_at_least_one_tweet_in_zoneA, open('../working_data/users_with_at_least_one_tweet_in_zoneA.json','wb'))

### Write out the ZoneA tweets, but first, ensure it's sorted by time

In [None]:
# Sanity check: show a raw 'postedTime' string and what iso8601 parses it into.
# NOTE(review): no cell above this one assigns `tweets`; in the visible
# notebook it is only bound by the write loop in the following cell, so this
# cell depends on execution order.
print(tweets[0]['postedTime'])
iso8601.parse_date(tweets[0]['postedTime'])

In [None]:
# Write one .geojsonl file per user holding that user's in-zone tweets.
for (user, tweets) in inZoneA_gb_user:
    # The section heading says the tweets are sorted by time before writing,
    # but the grouped iterable was previously written in whatever order it
    # arrived. Sort by parsed 'postedTime' so the output is oldest-first and
    # the file name comes from the earliest tweet.
    tweets = sorted(tweets, key=lambda t: iso8601.parse_date(t['postedTime']))
    userName = tweets[0]['actor']['preferredUsername'].lower()
    write_full_tweets_to_geojsonl('../working_data/tweets_in_zone_a_by_user/'+userName, tweets)

### Get all Tweets from these Users (Beyond just those in ZoneA)

In [45]:
# Load the previously exported user ids. The context manager closes the file
# as soon as it is parsed (the bare json.load(open(...)) left it open).
with open('../working_data/users_with_at_least_one_tweet_in_zoneA.json') as users_file:
    users_with_one_tweet_in_zoneA = json.load(users_file)
len(users_with_one_tweet_in_zoneA)

21951

In [45]:
def write_out_simplified_geo_contextual(tuple_of_uid_tweets):
    """Write all of a user's tweets to disk if the user is in the Zone A id list.

    Parameters:
        tuple_of_uid_tweets: (user_id, iterable_of_tweets) pair from the
            grouped RDD.

    Users whose id is not in users_with_one_tweet_in_zoneA are skipped.
    For the rest, the tweets are ordered by parsed 'postedTime' and written
    with write_full_tweets_to_geojsonl (despite the name, the FULL tweets are
    written) under /data/chime/geo/users_with_a_tweet_in_zone_a/.

    NOTE: this redefines a function of the same name from earlier in the
    notebook; whichever cell ran last wins.
    """
    uid = tuple_of_uid_tweets[0]
    if uid not in users_with_one_tweet_in_zoneA:
        return
    ordered = sorted(tuple_of_uid_tweets[1],
                     key=lambda tweet: iso8601.parse_date(tweet['postedTime']))
    screen_name = ordered[0]['actor']['preferredUsername'].lower()
    write_full_tweets_to_geojsonl('/data/chime/geo/users_with_a_tweet_in_zone_a/'+screen_name,ordered)

In [None]:
# Iterate through the tweets grouped by user; when a uid matches a user with a
# tweet in Zone A, write that user's tweets out. foreach is used for its side
# effect (files written by the function), so there is no result to display.
tweets_gb_user.foreach(write_out_simplified_geo_contextual)