In [45]:
import json, iso8601, pprint, os

# Load, Clean, Sort, Export from GNIP (Full Contextual Streams)

This notebook: 

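1. Loads the raw GNIP data
1. Fixes the coordinate order of each tweet's `geo` field
1. Groups the tweets by user and sorts each user's tweets by time
1. Writes one full GNIP GeoJSONL file per user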

## [Spark Status](http://epic-analytics.cs.colorado.edu:4040/jobs/)

In [33]:
# Directory that receives the exported `.geojsonl` files (one per user).
output_directory = "/data/chime/geo/zone_a_users_full_contextual"

In [34]:
# Load the raw GNIP files as an RDD of text lines.
raw_strings = sc.textFile("/data/chime/sandy/movement-derivation/ZoneA_Users_with_HL_GNIP/*")

# Filter out the duds: empty and whitespace-only lines cannot be parsed as JSON
# and would otherwise fail the whole job inside json.loads.
strings = raw_strings.filter(lambda line: line.strip() != "")

# Parse every remaining line into a Python object.
jsons = strings.map(json.loads)

# Keep only the tweets by dropping every record that has a top-level 'info' key
# (presumably GNIP bookkeeping records -- confirm against the export format).
# Membership is tested on the dict itself rather than on a fresh key list.
tweet_jsons = jsons.filter(lambda record: 'info' not in record)

## Step 1. Load all the Tweets!
### Crucial Step:
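1. The GNIP `geo` field stores its coordinates as `[latitude, longitude]`, which is backwards from the GeoJSON convention of `[longitude, latitude]`, so each pair is reversed below.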

In [35]:
def fix_geo(t):
    """Reverse the order of a tweet's `geo` coordinates.

    The coordinate list is reversed in place, so applying this function twice
    to the same tweet restores the original order.

    Args:
        t: One parsed GNIP tweet (a dict).

    Returns:
        The same dict `t`. A tweet whose `geo` entry is missing, null, or has
        no coordinate list is returned unchanged instead of raising.
    """
    geo = t.get('geo')
    # Only touch a real coordinate list; anything else is passed through as-is.
    if isinstance(geo, dict):
        coordinates = geo.get('coordinates')
        if isinstance(coordinates, list):
            coordinates.reverse()
    return t
# Apply the coordinate fix to every tweet and cache the result, because both
# the preview cell and the per-user grouping read from this RDD.
tweets = tweet_jsons.map(fix_geo).cache()
tweets

PythonRDD[30] at RDD at PythonRDD.scala:53

### Check that this is working so far

In [36]:
# Sanity check: fetch a single tweet and pretty-print it.
pprint.pprint(tweets.take(1)[0])

{u'actor': {u'displayName': u'PROF | tajuAhmed |',
            u'favoritesCount': 33,
            u'followersCount': 181,
            u'friendsCount': 149,
            u'id': u'id:twitter.com:177665053',
            u'image': u'https://si0.twimg.com/profile_images/2581374515/image_normal.jpg',
            u'languages': [u'en'],
            u'link': u'http://www.twitter.com/mutalib_ahmed',
            u'links': [{u'href': None, u'rel': u'me'}],
            u'listedCount': 0,
            u'location': {u'displayName': u'Bronx - Newyork',
                          u'objectType': u'place'},
            u'objectType': u'person',
            u'postedTime': u'2010-08-12T19:09:07.000Z',
            u'preferredUsername': u'mutalib_ahmed',
            u'statusesCount': 9742,
            u'summary': u'',
            u'twitterTimeZone': u'Quito',
            u'utcOffset': u'-18000',
            u'verified': False},
 u'body': u"@pslymaabanah and broke ass niqqas' smfh",
 u'generator': {u'displayName

## Step 2: Group tweets by user

In [37]:
# Group the tweets under their author's actor id. The grouped RDD is cached
# because the preview cell and the export step both use it.
tweets_gb_user = tweets.groupBy(lambda tweet: tweet['actor']['id']).cache()
tweets_gb_user

PythonRDD[36] at RDD at PythonRDD.scala:53

#### Check the result of this operation; we should see a tuple of `(user_id, iterable)`

In [38]:
# Sanity check: fetch a single group and pretty-print it.
pprint.pprint(tweets_gb_user.take(1)[0])

(u'id:twitter.com:848470111',
 <pyspark.resultiterable.ResultIterable object at 0x7f5ce2f76d90>)


## Available Export Functions (Requires Local Arrays)
(Also ALWAYS sorts by time; this could be expensive for large collections, but it's important)

In [40]:
def write_full_tweets_to_geojsonl(fileName, tweets):
    """Write tweets to `<fileName>.geojsonl`, one GeoJSON Feature per line.

    Features are written in ascending `postedTime` order. Each Feature uses
    the tweet's `geo` value as its geometry (null when the tweet has none) and
    carries the complete GNIP tweet as its properties.

    Args:
        fileName: Output path without the `.geojsonl` extension.
        tweets: Iterable of parsed GNIP tweet dicts. It is not modified.

    Raises:
        KeyError: If a tweet has no `postedTime`.
    """
    # Sort into a new list before opening the file: the caller's collection is
    # left untouched, any iterable is accepted, and a tweet that cannot be
    # sorted no longer leaves a truncated output file behind.
    ordered = sorted(tweets, key=lambda t: iso8601.parse_date(t['postedTime']))
    with open(fileName + '.geojsonl', 'w') as outFile:
        for t in ordered:
            geojson = {
                'type': 'Feature',
                'geometry': t.get('geo'),
                'properties': t  # Full GNIP tweet in the properties
            }
            outFile.write(json.dumps(geojson) + "\n")

In [41]:
def write_simplified_tweets_to_geojsonl(fileName, tweets):
    """Write a trimmed-down view of tweets to `<fileName>.geojsonl`.

    Features are written in ascending `postedTime` order, one per line. The
    geometry is the tweet's `geo` value (null when absent). The properties
    hold a fixed selection of tweet and actor fields; the tweet-level and
    actor-level `location` entries are optional and become null when missing.

    Args:
        fileName: Output path without the `.geojsonl` extension.
        tweets: Iterable of parsed GNIP tweet dicts. It is not modified.

    Raises:
        KeyError: If a tweet lacks any of the other fields read below.
    """
    # Sort into a new list before opening the file so the caller's collection
    # is not reordered and a sort failure cannot truncate an existing file.
    ordered = sorted(tweets, key=lambda t: iso8601.parse_date(t['postedTime']))
    with open(fileName + '.geojsonl', 'w') as outFile:
        for t in ordered:
            actor = t['actor']
            geojson = {
                'type': 'Feature',
                'geometry': t.get('geo'),
                'properties': {
                    'user': actor['preferredUsername'],
                    'uid': actor['id'],
                    'u_loc': actor.get('location'),
                    'u_reg': actor['postedTime'],
                    'u_sum': actor['summary'],
                    'tid': t['id'],
                    'loc': t.get('location'),
                    'time': t['postedTime'],
                    'text': t['body'],
                    'source': t['generator'],
                    'verb': t['verb'],
                    'meta': t['twitter_entities'],
                    'u_utc': actor['utcOffset']
                }
            }
            outFile.write(json.dumps(geojson) + "\n")

In [42]:
def write_bare_tweets_to_geojsonl(fileName, tweets):
    """Write a minimal view of tweets to `<fileName>.geojsonl`.

    Behaves like the other export functions in this notebook: Features are
    written in ascending `postedTime` order, one per line, and a tweet
    without a `geo` entry gets a null geometry instead of raising. The
    properties hold only the username, the posting time and the text.

    Args:
        fileName: Output path without the `.geojsonl` extension.
        tweets: Iterable of parsed GNIP tweet dicts. It is not modified.

    Raises:
        KeyError: If a tweet lacks `postedTime`, `body`, or the actor's
            `preferredUsername`.
    """
    # Sort into a new list before opening the file so the caller's collection
    # is not reordered and a sort failure cannot truncate an existing file.
    ordered = sorted(tweets, key=lambda t: iso8601.parse_date(t['postedTime']))
    with open(fileName + '.geojsonl', 'w') as outFile:
        for t in ordered:
            geojson = {
                'type': 'Feature',
                'geometry': t.get('geo'),
                'properties': {
                    'user': t['actor']['preferredUsername'],
                    'time': t['postedTime'],
                    'text': t['body']
                }
            }
            outFile.write(json.dumps(geojson) + "\n")

## Step 3: Write out the `geojsonl` files

For now we only write the full GNIP files; in the future this step can probably be streamlined.

In [47]:
def write_out_full_contextual(tuple_of_uid_tweets):
    """Export one user's tweets as a full GNIP `.geojsonl` file.

    The output file is named after the lower-cased `preferredUsername` found
    on the user's earliest tweet and is placed inside `output_directory`.

    Args:
        tuple_of_uid_tweets: A `(user_id, tweets)` pair. Only the second
            element, an iterable of parsed GNIP tweets, is used.
    """
    chronological = sorted(
        tuple_of_uid_tweets[1],
        key=lambda tweet: iso8601.parse_date(tweet['postedTime']),
    )
    # The earliest tweet decides the file name.
    username = chronological[0]['actor']['preferredUsername'].lower()
    target = output_directory + '/' + username
    write_full_tweets_to_geojsonl(target, chronological)

In [48]:
# Create the export directory, including any missing parent directories.
# Checking isdir (not exists) means a regular file sitting at this path makes
# makedirs fail right here instead of every later file write failing.
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

In [49]:
# Run the export once per (user_id, tweets) group; this is done purely for its
# side effect of writing one .geojsonl file per user.
# NOTE(review): the files are written by whichever process executes the
# function -- confirm output_directory is reachable from the Spark executors.
tweets_gb_user.foreach( write_out_full_contextual )