In [2]:
import json
import os
import pprint

import fiona
import iso8601

# Load, Clean, Sort, Export from GNIP

This notebook: 

1. Loads raw gnip data
1. Filters for a geotag
1. Identifies all tweets within an area of interest
1. Finds Unique Users
1. Writes full GNIP GEOJSONL files per user for users with at least 1 tweet in the area of interest

## [Spark Status](http://epic-analytics.cs.colorado.edu:4040/jobs/)

In [5]:
# Read every raw GNIP file under the two-level input directory, one record per line
raw_strings = sc.textFile('/data/chime/matthew/gnip-geo/*/*')

# Discard empty lines so they never reach the JSON parser
strings = raw_strings.filter(lambda line: line != "")

# Decode each remaining line from JSON text into a Python object
jsons = strings.map(json.loads)

# Keep only the records that have no top-level 'info' key
tweet_jsons = jsons.filter(lambda record: 'info' not in record.keys())

## Step 1. Load all the Tweets!
### 2 Crucial Steps
1. A lot of tweets do not actually have lat/lon that will work for our purposes
1. The GNIP `geo` field stores its coordinates as `[lat, lon]`, the reverse of the GeoJSON `[lon, lat]` convention, so they must be flipped before use

In [6]:
def fix_geo(t):
    """Return a copy of tweet ``t`` with its ``geo`` coordinate list reversed.

    The input is left untouched: the tweet dict and its ``geo`` dict are
    shallow-copied and only the ``coordinates`` list is rebuilt. Mutating the
    argument in place would make the result depend on how many times the
    function had already been applied to the same object (a second call
    would flip the coordinates back).

    Args:
        t: dict with a ``geo`` entry that holds a ``coordinates`` list.

    Returns:
        A new dict equal to ``t`` except that ``geo['coordinates']`` is in
        reverse order.

    Raises:
        KeyError: if ``t`` has no ``geo`` key or ``geo`` has no ``coordinates``.
    """
    geo = dict(t['geo'])
    geo['coordinates'] = list(reversed(geo['coordinates']))
    fixed = dict(t)
    fixed['geo'] = geo
    return fixed

# Only tweets that carry a 'geo' entry can be placed on a map
tweets_with_geo = tweet_jsons.filter(lambda tweet: 'geo' in tweet.keys())

# Run every remaining tweet through fix_geo to flip its coordinate order
geo_tweets = tweets_with_geo.map(fix_geo)
# geo_tweets.cache()

### Check that this is working so far

In [7]:
# Sanity check: fetch a single geotagged tweet and pretty-print it
pprint.pprint(geo_tweets.take(1)[0])

{u'actor': {u'displayName': u'Francisco Aguirre',
            u'favoritesCount': 16,
            u'followersCount': 161,
            u'friendsCount': 391,
            u'id': u'id:twitter.com:304010396',
            u'image': u'https://pbs.twimg.com/profile_images/378800000377728270/38ef32f591f5da633484526d17bf9bfa_normal.jpeg',
            u'languages': [u'es'],
            u'link': u'http://www.twitter.com/fraconicr',
            u'links': [{u'href': None, u'rel': u'me'}],
            u'listedCount': 2,
            u'location': {u'displayName': u'Tifton, GA',
                          u'objectType': u'place'},
            u'objectType': u'person',
            u'postedTime': u'2011-05-23T19:45:25.000Z',
            u'preferredUsername': u'fraconicr',
            u'statusesCount': 549,
            u'summary': None,
            u'twitterTimeZone': u'Mountain Time (US & Canada)',
            u'utcOffset': u'-21600',
            u'verified': False},
 u'body': u'Bendiciones!!! @ Iglesia VID

## Optional: Export subsets of the data as `.geojsonl` files
This is not by user, this is all tweets in one file (i.e., BIG)

## Step 2: Group tweets by user

In [8]:
# Bucket the geotagged tweets by the id of the account that posted them
tweets_gb_user = geo_tweets.groupBy(lambda tweet: tweet['actor']['id'])
# Mark the grouping for caching because later cells reuse it
tweets_gb_user.cache()

PythonRDD[7] at RDD at PythonRDD.scala:53

#### Check on the status of this operation, should see a tuple of: `(user_id, iterable)`

In [9]:
# Sanity check: pretty-print one (user_id, tweets) group
pprint.pprint(tweets_gb_user.take(1)[0])

(u'id:twitter.com:1637120388',
 <pyspark.resultiterable.ResultIterable object at 0x7f655c1bc250>)


In [43]:
def write_out_simplified_geo_contextual(tuple_of_uid_tweets):
    """Write one user's tweets, oldest first, as a simplified GeoJSONL file.

    ``tuple_of_uid_tweets`` is indexed as ``(user_id, tweets)``; only the
    second element is used. The tweets are ordered by their parsed
    ``postedTime`` and the output file is named after the lower-cased
    ``preferredUsername`` found on the first tweet of that ordering.
    """
    chronological = sorted(
        tuple_of_uid_tweets[1],
        key=lambda tweet: iso8601.parse_date(tweet['postedTime']),
    )
    handle = chronological[0]['actor']['preferredUsername'].lower()
    destination = '../working_data/simplified_geo_contextual_all_users/' + handle
    write_simplified_tweets_to_geojsonl(destination, chronological)

### Optional: Write the `geo-tweet-streams` for EVERY user 

## Step 3: Geographic Filtering

Load the boundary geometry for the evacuation zone (or whatever your bounds should be); here it is read from a GeoJSON file

In [72]:
from shapely.geometry import mapping, shape
import fiona

# Open the boundary file in a context manager so the collection is closed
# as soon as the first feature has been read.
with fiona.open('../EvacuationZones/FL_EvacZones/martin_palm_beach_inland_hull.geojson','r') as c:
    # next(iter(c)) fetches the first feature on both Python 2 and 3
    # (c.next() exists only on Python 2).
    pol = next(iter(c))

# Convert the feature's GeoJSON geometry into a shapely geometry
geom = shape(pol['geometry'])
# Displayed as the cell output: whether shapely considers the geometry valid
geom.is_valid

True

In [73]:
# Share the zone geometry with the workers as a Spark broadcast variable
zoneBroadcast = sc.broadcast(geom)

# Probe with a point that is expected to be inside the zone (originally picked for ZoneA).
# NOTE(review): the recorded output shows False for both "TRUE?" lines, so this
# point is apparently not inside the geometry that is currently loaded - confirm.
in_bounds = shape({'type': "Point", 'coordinates': [-81.6543,30.3181]})
print(in_bounds)
print("%s %s" % ("TRUE?", zoneBroadcast.value.contains(in_bounds)))
print("%s %s" % ("TRUE?", in_bounds.within(zoneBroadcast.value)))

# Probe with a point that is expected to be outside the zone
out_of_bounds = shape({"type": "Point","coordinates": [-73.99753114562988,40.73093368341445]})
print(out_of_bounds)
print("%s %s" % ("FALSE?", zoneBroadcast.value.contains(out_of_bounds)))

POINT (-81.65430000000001 30.3181)
TRUE? False
TRUE? False
POINT (-73.99753114562988 40.73093368341445)
FALSE? False


In [74]:
# Predicate used to keep only the users who have a tweet inside the broadcast zone
def has_tweet_in_bounds(iterable):
    """Report whether any tweet in ``iterable`` lies inside the broadcast zone.

    Each tweet's ``geo`` entry is passed to ``shape`` and the result is tested
    with ``zoneBroadcast.value.contains``. Scanning stops at the first hit, and
    an empty iterable yields ``False``.
    """
    return any(
        zoneBroadcast.value.contains(shape(tweet['geo']))
        for tweet in iterable
    )

### Run the filter for all users

In [75]:
# Keep only the (user_id, tweets) groups in which at least one tweet is in bounds
user_tweets_with_a_tweet_in_zone_A = tweets_gb_user.filter(
    lambda uid_and_tweets: has_tweet_in_bounds(uid_and_tweets[1])
)
# user_tweets_with_a_tweet_in_zone_A.cache() #Probably not necessary; unless we want to do more with it

### Before we actually process any of this, let's figure out what we're going to do with it...

In [76]:
# Directory that receives one GeoJSONL file per user
outdir = '/data/chime/geo/matthew_martin_palm_beach_inland'

# makedirs (unlike mkdir) also creates any missing parent directories.
# NOTE(review): this check runs where the notebook runs; if the Spark
# executors write from other machines the directory must exist there too - confirm.
if not os.path.isdir(outdir):
    os.makedirs(outdir)

In [77]:
def write_user_tweet_tuple_to_full_geojsonl(tuple_of_uid_tweets):
    """Write one user's tweets, oldest first, as a full GeoJSONL file in ``outdir``.

    Args:
        tuple_of_uid_tweets: indexed as ``(user_id, tweets)``; only the second
            element (an iterable of tweet dicts) is used.

    The tweets are sorted by their parsed ``postedTime`` so the file is in
    chronological order, matching the other per-user writers in this
    notebook. The file name is the lower-cased ``preferredUsername`` of the
    earliest tweet, which also makes the name deterministic.
    """
    u_tweets = sorted(
        tuple_of_uid_tweets[1],
        key=lambda t: iso8601.parse_date(t['postedTime']),
    )
    fileName = u_tweets[0]['actor']['preferredUsername'].lower() #Grab the username from the earliest tweet
    write_full_tweets_to_geojsonl(outdir+'/'+fileName,u_tweets)

In [78]:
# Run the per-user writer over every group that passed the in-bounds filter
user_tweets_with_a_tweet_in_zone_A.foreach(write_user_tweet_tuple_to_full_geojsonl)

## Available Export Functions (Requires Local Arrays)
(These functions write tweets in the order they are given, so ALWAYS sort by time before calling them; this could be expensive for large collections, but it's important)

In [21]:
def write_full_tweets_to_geojsonl(fileName, tweets):
    """Write ``tweets`` to ``fileName + '.geojsonl'``, one GeoJSON Feature per line.

    Each feature uses the tweet's ``geo`` entry as its geometry and the
    complete tweet dict as its properties. Tweets are written in the order
    given; an existing file is overwritten.
    """
    target = fileName + '.geojsonl'
    with open(target, 'w') as sink:
        for tweet in tweets:
            feature = {
                'type': 'Feature',
                'geometry': tweet['geo'],
                'properties': tweet,  # the whole GNIP tweet rides along
            }
            line = json.dumps(feature) + "\n"
            sink.write(line)

In [22]:
def write_simplified_tweets_to_geojsonl(fileName, tweets):
    """Write a trimmed-down GeoJSON Feature per tweet to a ``.geojsonl`` file.

    The output path is ``'../working_data/' + fileName + '.geojsonl'``. Each
    feature takes the tweet's ``geo`` as geometry and a fixed selection of
    tweet and actor fields as properties. A missing tweet-level or
    actor-level ``location`` is recorded as ``None``. Tweets are written in
    the order given.
    """
    target = '../working_data/' + fileName + '.geojsonl'
    with open(target, 'w') as sink:
        for tweet in tweets:
            # .get() yields None when the optional 'location' key is absent
            tweet_location = tweet.get('location')
            actor = tweet['actor']
            actor_location = actor.get('location')
            feature = {
                'type': 'Feature',
                'geometry': tweet['geo'],
                'properties': {
                    'user': actor['preferredUsername'],
                    'uid': actor['id'],
                    'u_loc': actor_location,
                    'u_reg': actor['postedTime'],
                    'u_sum': actor['summary'],
                    'tid': tweet['id'],
                    'loc': tweet_location,
                    'time': tweet['postedTime'],
                    'text': tweet['body'],
                    'source': tweet['generator'],
                    'verb': tweet['verb'],
                    'meta': tweet['twitter_entities'],
                    'u_utc': actor['utcOffset'],
                },
            }
            sink.write(json.dumps(feature) + "\n")

In [23]:
def write_bare_tweets_to_geojsonl(fileName, tweets):
    """Write a minimal GeoJSON Feature per tweet to a ``.geojsonl`` file.

    The output path is ``'../working_data/' + fileName + '.geojsonl'``. Each
    feature takes the tweet's ``geo`` as geometry and keeps only the user
    name, posting time and text as properties. Tweets are written in the
    order given.
    """
    target = '../working_data/' + fileName + '.geojsonl'
    with open(target, 'w') as sink:
        for tweet in tweets:
            feature = {
                'type': 'Feature',
                'geometry': tweet['geo'],
                'properties': {
                    'user': tweet['actor']['preferredUsername'],
                    'time': tweet['postedTime'],
                    'text': tweet['body'],
                },
            }
            sink.write(json.dumps(feature) + "\n")

# Deprecated...

In [None]:
#Filter for those tweets in ZoneA
# Uses geo_tweets (coordinates already flipped by fix_geo) rather than the raw
# tweets_with_geo, and the broadcast variable this notebook actually defines
# (zoneBroadcast); the previous name zoneABroadcast is not assigned anywhere.
inZoneA = geo_tweets.filter(lambda t: zoneBroadcast.value.contains( shape(t['geo']) ) )

In [None]:
# Group the in-zone tweets by posting account and bring the groups back to the driver
inZoneA_gb_user = (
    inZoneA
    .groupBy(lambda tweet: tweet['actor']['id'])
    .collect()
)

In [None]:
# The first element of each collected group is the user id
users_with_at_least_one_tweet_in_zoneA = [group[0] for group in inZoneA_gb_user]
len(users_with_at_least_one_tweet_in_zoneA)

In [None]:
# json.dump(users_with_at_least_one_tweet_in_zoneA, open('../working_data/users_with_at_least_one_tweet_in_zoneA.json','wb'))

### Write out the ZoneA tweets, but first, ensure it's sorted by time

In [None]:
# Show the raw timestamp string of the first tweet, then its parsed form.
# NOTE(review): no cell visible above assigns a top-level `tweets`; the loop in
# the following cell is the first to bind it - confirm the intended run order.
print(tweets[0]['postedTime'])
iso8601.parse_date(tweets[0]['postedTime'])

In [None]:
for (user, tweets) in inZoneA_gb_user:
    # Materialise the group in chronological order before writing; the writer
    # itself emits tweets in whatever order it receives them.
    tweets = sorted(tweets, key=lambda t: iso8601.parse_date(t['postedTime']))
    # Name the file after the lower-cased username on the earliest tweet
    userName = tweets[0]['actor']['preferredUsername'].lower()
    write_full_tweets_to_geojsonl('../working_data/tweets_in_zone_a_by_user/'+userName, tweets)

### Get all Tweets from these Users (Beyond just those in ZoneA)

In [45]:
# Read the saved user ids inside a context manager so the file is closed promptly.
with open('../working_data/users_with_at_least_one_tweet_in_zoneA.json') as id_file:
    # Stored as a set so the per-user `in` check done by the writer is a hash
    # lookup instead of a scan over the whole collection.
    # Assumes the file holds a flat JSON list of unique user-id strings - TODO confirm.
    users_with_one_tweet_in_zoneA = set(json.load(id_file))
len(users_with_one_tweet_in_zoneA)

21951

In [45]:
def write_out_simplified_geo_contextual(tuple_of_uid_tweets):
    """Write a listed user's tweets, oldest first, as a full GeoJSONL file.

    ``tuple_of_uid_tweets`` is indexed as ``(user_id, tweets)``. Groups whose
    user id is not in ``users_with_one_tweet_in_zoneA`` are ignored. For the
    rest, the tweets are ordered by parsed ``postedTime`` and handed to
    ``write_full_tweets_to_geojsonl`` (so, despite the function's name, the
    full tweets are written), under a file named after the lower-cased
    ``preferredUsername`` of the earliest tweet.
    """
    if tuple_of_uid_tweets[0] not in users_with_one_tweet_in_zoneA:
        return
    chronological = sorted(
        tuple_of_uid_tweets[1],
        key=lambda tweet: iso8601.parse_date(tweet['postedTime']),
    )
    handle = chronological[0]['actor']['preferredUsername'].lower()
    destination = '/data/chime/geo/users_with_a_tweet_in_zone_a/' + handle
    write_full_tweets_to_geojsonl(destination, chronological)

In [None]:
# Apply the writer to every (user_id, tweets) group; the writer itself skips
# any group whose uid is not in users_with_one_tweet_in_zoneA.
tweets_gb_user.foreach(write_out_simplified_geo_contextual)