In [2]:
import json, iso8601, pprint, os, codecs, pandas as pd

Load, Clean, Sort, Convert GNIP Tweets to Per User GeoJSON

This notebook:

- Loads raw gnip data
- Writes one simplified GeoJSON FeatureCollection file per user (`<handle>.geojson`) for users that meet the tweet thresholds

[Spark Status](http://epic-analytics.cs.colorado.edu:4040/)

## EVENT CONFIGURATION

In [14]:
# --- Paths -------------------------------------------------------------
# Directory read with sc.textFile (raw GNIP records, one JSON per line).
input_directory = "/data/chime/Hurricanes_HIM/geo/harvey/GNIP/"
# Directory that receives one <handle>.geojson file per qualifying user.
output_directory = "/data/chime/geo2/HARVEY/"

# --- Thresholds --------------------------------------------------------
# Used twice in writeGeoJSON: as the minimum number of tweets per user and
# as the minimum number of geotagged tweets inside the storm period.
MINIMUM_TWEET_THRESHOLD = 5

# Storm window (UTC); writeGeoJSON treats both ends as exclusive.
STORM_PERIOD_START, STORM_PERIOD_END = (
    pd.Timestamp("2017-08-25T00:00:00Z"),
    pd.Timestamp("2017-08-27T00:00:00Z"),
)

# Display the window as the cell output.
(STORM_PERIOD_START, STORM_PERIOD_END)

(Timestamp('2017-08-25 00:00:00+0000', tz='UTC'),
 Timestamp('2017-08-27 00:00:00+0000', tz='UTC'))

In [15]:
# Load every line found under input_directory as an RDD of strings.
# NOTE(review): `sc` is not defined in the visible cells — presumably the
# PySpark kernel injects the SparkContext; confirm before running elsewhere.
raw_strings = sc.textFile(input_directory)
# Drop blank lines. Whitespace-only lines (e.g. a stray "\r" from CRLF
# files) are dropped too, because json.loads would raise on them.
strings = raw_strings.filter(lambda line: line.strip() != "")
# Parse each remaining line as a JSON document.
jsons  = strings.map(json.loads)
# Keep only records without a top-level 'info' key.
# NOTE(review): presumably these are GNIP's non-tweet summary records — confirm.
tweets = jsons.filter(lambda record: 'info' not in record)

### Step 2: Group tweets by user

In [16]:
# Build (user_id, iterable_of_tweets) pairs keyed on the actor id, and mark
# the result for caching so later actions can reuse it.
tweets_gb_user = tweets.groupBy(lambda tweet: tweet['actor']['id']).cache()
# Show the RDD handle as the cell output.
tweets_gb_user

PythonRDD[16] at RDD at PythonRDD.scala:48

Force Spark to Execute...

In [17]:
# Optional: uncomment to run an action now, so the grouping executes and the cache is populated.
#tweets_gb_user.count()

Check the result of this operation; you should see a tuple of the form (user_id, iterable of tweets):

In [18]:
#pprint.pprint(tweets_gb_user.take(1)[0])

### Step 3: Make GeoJSON

In [19]:
def writeGeoJSON(uTuple):
    """Write one user's tweets to ``<output_directory>/<handle>.geojson``.

    Meant to be applied to each ``(user_id, tweets)`` pair of a grouped RDD.

    A file is written only when the user has at least
    ``MINIMUM_TWEET_THRESHOLD`` tweets overall *and* at least that many
    geotagged tweets posted strictly between ``STORM_PERIOD_START`` and
    ``STORM_PERIOD_END``. The file is a GeoJSON FeatureCollection with one
    Feature per tweet (all of the user's tweets, not only the storm-period
    ones), ordered by ``postedTime``. Tweets without a geotag get a null
    geometry.

    Args:
        uTuple: ``(user_id, iterable_of_tweet_dicts)``. Each tweet dict must
            provide ``['actor']['preferredUsername']``, ``'postedTime'``,
            ``'body'`` and ``'id'``; ``'geo'`` (with ``'coordinates'``) is
            optional.

    Returns:
        bool: ``True`` if a file was written, ``False`` if the user was
        skipped because a threshold was not met.

    Raises:
        KeyError, IndexError: if a tweet lacks a required field or its
            ``'id'`` has fewer than three ``':'``-separated parts. Errors
            are deliberately not swallowed so malformed input is visible.
    """
    _user_id, iterable = uTuple
    tweets = list(iterable)

    # Skip users without enough tweets
    if len(tweets) < MINIMUM_TWEET_THRESHOLD:
        return False

    # Sort BEFORE picking the handle: groupBy gives no ordering guarantee, so
    # taking tweets[0] first would choose an arbitrary handle. After sorting,
    # the handle of the earliest tweet is used for every feature and for the
    # file name, even if the user renamed the account later.
    # The sort key is the raw 'postedTime' string.
    tweets.sort(key=lambda t: t['postedTime'])
    handle = tweets[0]['actor']['preferredUsername']

    features = []
    in_storm_range_with_geo = 0

    for t in tweets:
        geo = t.get('geo')
        geometry = None  # GeoJSON allows a null geometry for unlocated features
        if geo:
            posted = pd.Timestamp(t['postedTime'])
            # Both ends of the storm window are exclusive.
            if STORM_PERIOD_START < posted < STORM_PERIOD_END:
                in_storm_range_with_geo += 1
            # GeoJSON positions are [longitude, latitude]; the reversal
            # presumably converts from GNIP's [lat, lon] order — confirm.
            geometry = {
                'type': "Point",
                'coordinates': list(reversed(geo['coordinates'])),
            }
        features.append({
            'type': 'Feature',
            'geometry': geometry,
            'properties': {
                'user': handle,  # Use the original handle in case a name changed
                'text': t['body'],
                'date': t['postedTime'],
                # Third ':'-separated part of the id string.
                'tweetID': t['id'].split(":")[2],
            },
        })

    # Not enough geotagged tweets inside the storm window: write nothing.
    if in_storm_range_with_geo < MINIMUM_TWEET_THRESHOLD:
        return False

    out_path = os.path.join(output_directory, handle + ".geojson")
    # Context manager guarantees the file is flushed and closed, also on error.
    with codecs.open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump({'type': 'FeatureCollection', 'features': features}, out_file)
    return True

### Step 4: Write out the GeoJSON files (per user)
Very basic simplified tweets

In [20]:
# Create the output directory, including any missing parent directories
# (os.mkdir would fail if a parent did not exist yet).
# NOTE(review): this runs on the driver only; if Spark executors run on other
# hosts without a shared filesystem, the directory will not exist there —
# confirm the cluster layout.
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

In [21]:
# Action: apply writeGeoJSON to every (user_id, tweets) pair for its side
# effect of writing files; foreach returns nothing to the notebook.
tweets_gb_user.foreach(writeGeoJSON)