In [1]:
import json, iso8601, pprint, os, codecs

# Load, Clean, Sort, Convert GeoLocated data collected from Twitter (GNIP)

This notebook: 

1. Loads raw gnip data
1. Writes full GNIP GEOJSONL files per user

## [Spark Status](http://epic-analytics.cs.colorado.edu:4040/jobs/)

In [2]:
# Directory of raw GNIP files; it is handed to sc.textFile() as-is.
input_directory  = "/data/chime/HurricaneNate/GNIP/"
# Directory that receives the per-user .geojson files.
# NOTE: unlike input_directory this has no trailing slash, so any path built
# from it needs an explicit separator (e.g. os.path.join) rather than plain "+".
output_directory = "/data/chime/geo2/NATE"

## Step 1. Load all the Tweets!

In [3]:
# Load every file under input_directory as an RDD of raw text lines.
raw_strings = sc.textFile(input_directory)

# Filter out the duds: drop empty lines *and* whitespace-only lines, because
# json.loads raises on either and a single bad line would fail the whole job.
strings = raw_strings.filter(lambda x: x.strip() != "")

# Parse each remaining line as one JSON document.
jsons  = strings.map(json.loads)

# Keep only records without a top-level 'info' key (presumably GNIP
# bookkeeping messages rather than tweets -- TODO confirm against the feed).
# Membership is tested on the dict itself; .keys() would build a throwaway
# list for every record on Python 2.
tweets = jsons.filter(lambda x: 'info' not in x)

### Check that this is working so far

In [4]:
# Pull a single parsed record back to the driver and pretty-print it to
# confirm the load/parse steps above produced tweet dictionaries.
pprint.pprint(tweets.take(1)[0])

{u'actor': {u'displayName': u'Townsquare-Tusc-WX',
            u'favoritesCount': 44,
            u'followersCount': 175,
            u'friendsCount': 443,
            u'id': u'id:twitter.com:2828832719',
            u'image': u'https://pbs.twimg.com/profile_images/830328335171407872/xLrhW0lL_normal.jpg',
            u'languages': [u'en'],
            u'link': u'http://www.twitter.com/WX4ALA',
            u'links': [{u'href': u'https://radioweathernetwork.com',
                        u'rel': u'me'}],
            u'listedCount': 2,
            u'location': {u'displayName': u'Tuscaloosa, AL',
                          u'objectType': u'place'},
            u'objectType': u'person',
            u'postedTime': u'2014-10-14T02:27:05.000Z',
            u'preferredUsername': u'WX4ALA',
            u'statusesCount': 2175,
            u'summary': u"Townsquare Media-Tuscaloosa! West, Alabama's ONLY Live & Local Severe & Winter Weather Coverage By LOCAL Meteorologist; Bobby Best!",
            u'

## Step 2: Group tweets by user

In [5]:
# Key every tweet by its author's actor id, giving an RDD of
# (user_id, iterable_of_tweets) pairs.
tweets_gb_user = tweets.groupBy(lambda t: t['actor']['id'])
tweets_gb_user.cache() # Mark the grouped RDD for caching, since more than one later cell runs an action on it.

PythonRDD[7] at RDD at PythonRDD.scala:53

#### Check on the status of this operation, should see a tuple of: `(user_id, iterable)`

In [6]:
# Fetch one (user_id, iterable) pair to confirm the grouping worked.
pprint.pprint(tweets_gb_user.take(1)[0])

(u'id:twitter.com:243844846',
 <pyspark.resultiterable.ResultIterable object at 0x7f494c7007d0>)


In [7]:
def writeGeoJSON(uTuple, out_dir=None, geo_threshold=5):
    """Write one user's tweets to ``<out_dir>/<handle>.geojson``.

    Every tweet becomes a GeoJSON Feature (sorted by ``postedTime``); tweets
    without a ``geo`` entry are kept with a ``null`` geometry. The file is only
    written when the user has strictly more than ``geo_threshold`` geotagged
    tweets.

    Parameters
    ----------
    uTuple : tuple
        ``(user_id, iterable_of_tweets)`` as produced by ``RDD.groupBy``.
        Each tweet is a dict with ``actor.preferredUsername``, ``body``,
        ``postedTime``, ``id`` and, optionally, ``geo``.
    out_dir : str, optional
        Directory to write into. Defaults to the notebook-level
        ``output_directory``.
    geo_threshold : int, optional
        A file is written only if the number of geotagged tweets is greater
        than this value (default 5).

    Returns
    -------
    None
    """
    _user_id, iterable = uTuple

    # postedTime strings are compared as text; this matches chronological
    # order as long as they share one fixed-width timestamp format.
    tweets = sorted(iterable, key=lambda t: t['postedTime'])
    if not tweets:
        return

    # Name the file after the handle on the earliest tweet so the result does
    # not depend on the arbitrary order in which groupBy yields the tweets.
    handle = tweets[0]['actor']['preferredUsername']

    features = []
    geo_count = 0

    for t in tweets:
        geo = t.get('geo')
        if geo:
            geo_count += 1
            # The stored coordinate pair is reversed before being written
            # (presumably [lat, lon] in, GeoJSON [lon, lat] out -- TODO confirm).
            geo = {'type': "Point",
                   'coordinates': list(reversed(geo['coordinates']))}
        features.append({
            'type': 'Feature',
            'geometry': geo,
            'properties': {
                'user': t['actor']['preferredUsername'],
                'text': t['body'],
                'date': t['postedTime'],
                # Keep only the third ':'-separated field of the tweet id.
                'tweetID': t['id'].split(":")[2]
            }
        })

    # Minimum tweet count: skip users with too few geotagged tweets.
    if geo_count > geo_threshold:
        if out_dir is None:
            out_dir = output_directory
        # os.path.join supplies the separator that plain concatenation lost
        # (output_directory has no trailing slash).
        path = os.path.join(out_dir, handle + ".geojson")
        # The context manager guarantees the file is flushed and closed.
        with codecs.open(path, 'w', encoding='utf-8') as out_file:
            json.dump({'type': 'FeatureCollection', 'features': features},
                      out_file)

## Step 3: Write out the per-user `geojson` files

For now just writing the full GNIP files, but in the future, this can probably be streamlined?

In [8]:
# Make sure the output directory exists before any file is written into it.
# isdir (not exists) so a stray regular file at that path is reported here,
# and makedirs (not mkdir) so missing parent directories are created too.
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

In [9]:
# Run writeGeoJSON once per (user_id, tweets) group. foreach is an action that
# returns nothing; the only result is the files the function writes.
tweets_gb_user.foreach( writeGeoJSON )