# Extract Geo Tagged Tweets from Harvey Collection

In [62]:
import json, sys, pprint, os, urllib
import pandas as pd

## First, go to epic_utils/parallel_get and run: 

    ruby get_geo.rb --start=2017-08-30 "2017 Hurricane Harvey" 24

When that finishes, run:

    cat tweets* > ~/HurricaneHarvey/new_tweets.jsonl
    
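### Once that is all complete, run the cells below

Note: the loading cell reads `all_geo.jsonl` from the working directory, so make sure the concatenated tweets are available under that name before running it.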

In [44]:
# Collect every tweet in the line-delimited JSON dump that carries a non-empty
# 'coordinates' value.
these_tweets = []
count = 0  # number of lines read so far; stays 0 if the file is empty
with open('all_geo.jsonl') as inFile:
    for count, line in enumerate(inFile, start=1):
        try:
            t = json.loads(line.strip())
        except ValueError:
            # json.JSONDecodeError is a ValueError: report the malformed line and
            # keep going. Anything else (KeyboardInterrupt, real bugs) propagates.
            print("FAIL: ")
            pprint.pprint(line.strip())
            continue

        # Only JSON objects can be tweets; skip anything else, and skip tweets
        # whose 'coordinates' is missing or null.
        if not isinstance(t, dict) or not t.get('coordinates'):
            continue

        these_tweets.append(t)
        # Progress is reported on stderr each time another 1000 geotagged tweets are found.
        if len(these_tweets) % 1000 == 0:
            sys.stderr.write("\rFound {0} geotagged of {1} processed".format(len(these_tweets), count))
len(these_tweets)

Found 9000 geotagged of 10935 processed

9616

In [45]:
# Read in previous geojson.
# A context manager closes the file handle deterministically instead of leaving
# it to the garbage collector.
with open('/data/www/jennings/harvey.geojson', 'r') as prevFile:
    previous = json.load(prevFile)
print("Found {0} tweets in previous file".format(len(previous['features'])))
# Tweet ids (stored as strings in each feature's properties) that are already published.
existing_ids = [x['properties']['id'] for x in previous['features']]
print("ids: ", existing_ids[:2])

Found 9616 tweets in previous file
ids:  ['902941559062118400', '902947612852092928']


In [46]:
# Keep only the tweets whose id is not already present in the previous GeoJSON.
# Membership is tested against a set: scanning the existing_ids list once per
# tweet would be quadratic in the number of tweets.
known_ids = set(existing_ids)
new_tweets = [x for x in these_tweets if x['id_str'] not in known_ids]
print("New Tweets: ",len(new_tweets))

New Tweets:  0


In [63]:
def extract_hashtags(t):
    """Return the tweet's hashtags as lower-cased strings prefixed with '#'.

    `t` must contain an 'entities' mapping; when that mapping has no
    'hashtags' key an empty list is returned.
    """
    entities = t['entities']
    if 'hashtags' not in entities:
        return []
    return ['#' + tag['text'].lower() for tag in entities['hashtags']]
def extract_media(t):
    """Return the 'media_url' of every media entity attached to the tweet.

    `t` must contain an 'entities' mapping; when that mapping has no
    'media' key an empty list is returned.
    """
    entities = t['entities']
    if 'media' not in entities:
        return []
    return [item['media_url'] for item in entities['media']]
def tweet_to_feature(t):
    """Convert a geotagged tweet dict into a GeoJSON Point feature.

    The tweet's coordinate pair is passed through unchanged, both as the
    feature geometry and as the 'coords' property. The properties also carry
    the creation time (raw string plus integer epoch seconds), the
    URL-quoted text, the author's screen name and the string tweet id.
    Every hashtag becomes its own property with value 1, and a 'media' list
    is added only when the tweet has media entities.

    Raises KeyError when an expected tweet field is missing.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is
    # loaded; import it explicitly rather than relying on another module
    # having pulled it in as a side effect.
    from urllib.parse import quote_plus

    coords = t['coordinates']['coordinates']
    properties = {
        "created_at" : t['created_at'],
        "text" : quote_plus(t['text']),
        "user" : t['user']['screen_name'],
        "timestamp"  : int(pd.Timestamp(t['created_at']).timestamp()),
        "id"   : t['id_str'],
        "coords" : coords
    }

    # One flag property per hashtag (keys look like '#harvey').
    for tag in extract_hashtags(t):
        properties[tag] = 1

    media = extract_media(t)
    if media:
        properties['media'] = media

    return {
        "type":"Feature",
        "geometry" : {"type" : "Point", "coordinates" : coords},
        "properties": properties
    }

In [64]:
# Uncomment to discard the previously published features and rebuild the collection from scratch:
#previous['features'] = []

In [65]:
# Convert only the tweets that are not yet in the previous GeoJSON. The
# collection built afterwards appends previous['features'], so converting all
# of these_tweets here would duplicate every already-published tweet.
# (For a full rebuild, clear previous['features'] first and iterate over
# these_tweets instead.)
new_features = [tweet_to_feature(t) for t in new_tweets]

In [66]:
# Newly converted features go first, followed by everything from the previous file.
collection = {
    "type": "FeatureCollection",
    "features": new_features + previous['features'],
}
print("New Collection tweets:  ", len(collection["features"]))

New Collection tweets:   9616


In [67]:
# Write to a temporary sibling file and then rename it over the target, so a
# failure part-way through json.dump cannot leave a truncated harvey.geojson
# behind. This file is also what gets read back in as the "previous"
# collection, so losing it would lose the accumulated features.
geojson_path = '/data/www/jennings/harvey.geojson'
with open(geojson_path + '.tmp', 'w') as outFile:
    json.dump(collection, outFile)
os.replace(geojson_path + '.tmp', geojson_path)

In [68]:
# Also emit the collection as line-delimited GeoJSON: one feature per line.
with open("latest.geojsonl", 'w') as oFile:
    for f in collection['features']:
        print(json.dumps(f), file=oFile)

In [69]:
# Tile it locally: build harvey-latest.mbtiles from latest.geojsonl as a layer
# named "harvey-tweets". The cell displays the shell exit status.
# NOTE(review): flag meanings (zoom range -Z0..-z18, base zoom -B8, -r1 drop
# rate, -P parallel read, -f overwrite) are per the tippecanoe README — confirm
# against the installed version.
os.system(
    "tippecanoe -Pf -Z0 -B8 -z18 -o harvey-latest.mbtiles -r1 "
    "--drop-fraction-as-needed --named-layer=harvey-tweets:latest.geojsonl"
)

0

In [70]:
# Hand the generated tileset to the upload-tiles.js script under the name
# "harvey-latest". The cell displays the shell exit status.
os.system(
    "/home/anderstj/upload-tiles.js --name=harvey-latest "
    "harvey-latest.mbtiles"
)

0

<br>
# Export the feature properties as a DataFrame / CSV

In [33]:
import pandas as pd

In [34]:
# One row per feature, one column per property key. Each hashtag is its own
# property, so every distinct hashtag becomes a (mostly empty) column.
df = pd.DataFrame([feature['properties'] for feature in collection['features']])
df.head()

Unnamed: 0,#1000yearevent,#11pm,#13lovelylavenderlady,#19h30rts,#1on1,#24kmagicworldtour,#25ago,#26agosto,#2esport,#2k17hurricaneharvey,...,#안전요원,#주황,#허리케인하비,coords,created_at,id,media,text,timestamp,user
0,,,,,,,,,,,...,,,,"[-97.396381, 27.8005828]",Wed Aug 30 17:10:18 +0000 2017,902941559062118400,,"See our latest #CorpusChristi, TX #job and cli...",1504113018,tmj_TX_transp
1,,,,,,,,,,,...,,,,"[-97.19439697, 33.00699997]",Wed Aug 30 17:34:21 +0000 2017,902947612852092928,[http://pbs.twimg.com/media/DIfqEXbW4AA_qpZ.jpg],Hurricane Harvey Leaves Pets Homeless https://...,1504114461,BLifeWestlake
2,,,,,,,,,,,...,,,,"[-96.62439728, 33.01779938]",Wed Aug 30 17:39:27 +0000 2017,902948897752002564,[http://pbs.twimg.com/media/DIfrPFuWsAMm-tB.jpg],Hurricane Harvey Leaves Pets Homeless https://...,1504114767,BLifeMurphy
3,,,,,,,,,,,...,,,,"[4.4768, 50.501]",Wed Aug 30 17:40:04 +0000 2017,902949052202983424,,⒈ #kernuitstap\n⒉ #USOpen\n⒊ #Harvey\n⒋ Housto...,1504114804,trendinaliaBE
4,,,,,,,,,,,...,,,,"[-96.865092, 32.984314]",Wed Aug 30 17:47:52 +0000 2017,902951015259017216,,For all those affected in the wake of Hurrican...,1504115272,WCTDRE


In [35]:
# Number of rows, i.e. number of features in the collection.
df.shape[0]

9616

In [37]:
# Persist the flattened properties table as CSV (the row index is written as the first column).
df.to_csv(path_or_buf='/data/www/jennings/harvey_geo_tweets.csv')