## Load Contextual Plus Streams and do text searches

In [2]:
import os, sys, gzip, json, pprint, pymongo, pandas as pd
from datetime import datetime
from pymongo import MongoClient

In [189]:
EVENTS = {
    'harvey' : '/data/chime/Hurricanes_HIM/contextual-plus/OfficialSources_HIM/GNIP/', #HarveyIrma CP
    'matthew': '/data/chime/matthew/gnip/',              # This is all keyword collection from Matthew
    'matthew-geo' : '/data/chime/matthew/gnip-geo/all/' # This is all tweets from within the windswath of Matthew
}

In [190]:
def simplify_tweet(t):
    '''Simplify tweet to only specified attributes (e.g. id, text, date, user attributes, etc.)'''
    new_tweet = {'id':t['id'].split(':')[2],
                 'text':t['body'].replace("\n"," ").replace("\r"," ").replace("\t"," "),
                 'created_at':datetime.strptime(t['postedTime'], "%Y-%m-%dT%H:%M:%S.%fZ"),
                 'user':t['actor']['preferredUsername'].replace("\t"," "),
                 'verb':t['verb']
                }        
    return(new_tweet)

In [191]:
def gzip_to_full_tweet(file):
    tweets          = []
    tweet_count     = 0
    error_count     = 0
    info = None
    rt_count = 0
    
    with gzip.open(file, 'rb') as f:
        file_content = f.read()
        
        for idx, line in enumerate(file_content.decode().split("\n")):
            tweet_count += 1
            try:
                t = json.loads(line.strip())
            except json.JSONDecodeError:
                if(line==""):
                    pass
                
            if 'info' in t:
                info = t
            else:
                if t['verb'] != 'share':
                    t = simplify_tweet(t)
                    tweets.append(t)
                else:
                    rt_count+=1

            if idx%1000==0:
                sys.stderr.write("\r{0} tweets parsed      ".format(tweet_count))
#     sys.stderr.write("\r{0} tweets parsed     ".format(tweet_count))
    return tweets, info, rt_count

In [192]:
def get_tweets(base_dir, limit=None):
    
    gnip_files = os.listdir( base_dir )
    print("Found {0} GNIP files, limit: {1}".format(len(gnip_files), limit))
    
    t_count = 0
    rt_count_total = 0
    tweet_array = []
    for idx, file in enumerate(gnip_files[:limit]):
        tweets, info, rt_count = gzip_to_full_tweet(base_dir + file)
        t_count += info['info']['activity_count']
        tweet_array += tweets
        rt_count_total += rt_count
        if idx%2==0:
            sys.stderr.write("\r"+" "*80 + "{0} files; {1} tweets".format(idx+1, t_count))
    print("{0} files processed with {1} tweets, skipped {2} retweets".format(idx+1, t_count, rt_count_total))
    return tweet_array

In [193]:
TWEETS = get_tweets(EVENTS['matthew-geo'], None)

Found 33486 GNIP files, limit: None


                                                                                33473 files; 3432999 tweets

33486 files processed with 3434483 tweets, skipped 0 retweets


1 tweets parsed      1 tweets parsed                                                                                      33475 files; 3433184 tweets1 tweets parsed      1 tweets parsed                                                                                      33477 files; 3433406 tweets1 tweets parsed      1 tweets parsed                                                                                      33479 files; 3433918 tweets1 tweets parsed      1 tweets parsed                                                                                      33481 files; 3433931 tweets1 tweets parsed      1 tweets parsed                                                                                      33483 files; 3434070 tweets1 tweets parsed      1 tweets parsed                                                                                      33485 files; 3434409 tweets1 tweets parsed      

In [None]:
df = pd.DataFrame(TWEETS)
df['date'] = df.created_at.apply(lambda x: x.date())
print(len(df))

# Some DataFrame Analysis

In [183]:
HARVEY_WORDS_160 = ['water','helps','community','storm','call','prepared','informed','hurricane','rains','stay','phone','talking','evacuate','homes','news','leave','rescue','tornado','food','boat','family','neighborhood','church','neighbor','media','daughter','fema','bayou','freeway','warnings','trucks','facebook','weather','emergency','husband','lights','radio','supplies','plans','money','power','channel','stations','wind','damage','forecast','road','volunteers','decision','shelter','driving','ditches','county','situation','devastation','messaging','roofs','alerts','friends','gulf','police','rita','cell','texas','lake','television','seniors','survival','insurance','rising','worry','stuck','drain','electricity','fight','highway','announcing','drainage','governments','grocery','organization','pumps','elders','safe','decide','higher','officials','afraid','danger','fire','horrible','katrina','mayor','rainfall','surge','trapped','deep','disabled','leaders','strong','blowing','helicopter','identifying','relatives','bayous','blessed','contact','hospital','praying','residents','resources','batteries','canned','charge','drown','frustrating','hispanic','hurt','latino','mexican','nature','predict','response','risk','river','univision','attic','deaths','floating','generator','immigrant','preparedness','route','safety','tide','tropical','agency','coverage','disaster','doctors','faith','mandatory','medicine','meteorologist','reservoirs','authorities','broadcasting','canal','dark','engineers','hotel','inundated','landlines','levies','pets','screaming','stranded','underwater','escape']
HARVEY_WORDS_92  = ['water','helps','community','storm','call','prepared','hurricane','rains','stay','phone','talking','evacuate','homes','news','leave','rescue','tornado','boat','family','church','neighbor','media','fema','bayou','warnings','facebook','weather','emergency','lights','radio','plans','money','power','channel','wind','damage','forecast','decision','shelter','devastation','alerts','gulf','police','rita','cell','lake','seniors','rising','stuck','drain','electricity','drainage','governments','safe','decide','higher','officials','afraid','danger','katrina','mayor','rainfall','surge','trapped','deep','disabled','leaders','strong','bayous','blessed','hospital','praying','frustrating','mexican','risk','river','univision','generator','safety','tide','agency','disaster','faith','mandatory','medicine','reservoirs','hotel','levies','pets','stranded','underwater']

In [184]:
df['harvey160'] = df.text.apply(lambda text: pd.Series([x in text for x in HARVEY_WORDS_160]).any())
df['harvey92']  = df.text.apply(lambda text: pd.Series([x in text for x in HARVEY_WORDS_92]).any())

In [185]:
df.head()

Unnamed: 0,created_at,id,text,user,verb,date,harvey160,harvey92
0,2017-09-09 14:10:02,906520074135019522,@MiamiBeachPD @MiamiBeachNews @KyungLahCNN @Ma...,carolem65,post,2017-09-09,False,False
1,2017-09-09 14:10:24,906520167764684800,@weatherchannel Irma hitting Ft Meyers Naples....,LorraineVoytko,post,2017-09-09,True,True
2,2017-09-09 14:10:36,906520216221437953,@JohnMoralesNBC6 Thank you 🐐,Mike91889,post,2017-09-09,False,False
3,2017-09-09 14:10:38,906520226510106625,@CNN Prayers 🙏🏼,PaigeSampere,post,2017-09-09,False,False
4,2017-09-09 14:10:19,906520144607723520,@MarcoIslandPD If on patrol cld u check buildi...,gifford00,post,2017-09-09,False,False


In [220]:
print("All Tweets: {0} | Users: {1}".format(len(df), df.user.nunique()))
print("Tweets in the Harvey 160 Collection: ", len(df.query('harvey160')))
print("Users in the Harvey 160 Collection: ", df.query('harvey160').user.nunique())

All Tweets: 1910995 | Users: 526317
Tweets in the Harvey 160 Collection:  532749
Users in the Harvey 160 Collection:  219723


In [222]:
print("All Tweets: {0} | Users: {1}".format(len(df), df.user.nunique()))
print("Tweets in the Harvey 92 Collection: ", len(df.query('harvey92')))
print("Users in the Harvey 92 Collection: ", df.query('harvey92').user.nunique())

All Tweets: 1910995 | Users: 526317
Tweets in the Harvey 92 Collection:  458242
Users in the Harvey 92 Collection:  197221


532749 1910995 27.878094919138984


# Now for Matthew

In [195]:
matthew = pd.DataFrame(TWEETS)
matthew['date'] = matthew.created_at.apply(lambda x: x.date())
print(len(df))

1910995


In [196]:
matthew.head()

Unnamed: 0,created_at,id,text,user,verb,date
0,2016-08-26 16:02:24,769203351401689088,"I'm at Kebab Café in Fort-de-France, Martiniqu...",Ced_BM,post,2016-08-26
1,2016-08-26 16:02:08,769203282514440193,I don't want Hollywood I want Forbes. #positiv...,greggar_d,post,2016-08-26
2,2016-08-26 16:04:23,769203849420738561,If Martinique nice one more time will have to ...,FRASTWINZMUSIC,post,2016-08-26
3,2016-08-26 16:06:31,769204385910063104,I'm at Chez Moi 😼 https://t.co/ocTN8osd3h,Amigo_stn,post,2016-08-26
4,2016-08-26 16:09:04,769205026858336256,Happy birthday to KING Valmix 👑 @djvalmix #la...,ZoekikiAyiti,post,2016-08-26


In [197]:
MATTHEW_WORDS_160 = ['storm','flooding','house','waters','surge','evacuate','hurricane','information','rains','country','community','level','leave','island','news','weather','coast','moving','damage','risk','forecast','neighborhood','power','rising','zone','media','decision','preparing','hotel','marsh','drain','river','emergency','mandatory','scary','wind','government','tidal','tornado','worst','raise','coastal','events','fema','highest','scare','service','block','drainage','causeway','bottom','hill','katrina','traffic','urgency','management','neighbor','worried','hospital','television','tropical','voluntary','warning','alert','barrier','broadcasting','danger','generators','landmark','newspaper','ocean','climate','computer','disability','nature','parents','plain','radar','roof','sand','underwater','channel','disaster','lower','meetings','notice','pollution',' telephone','together','afraid','approaching','electricity','guard','meteorologist','reporting','survive','warming',' wave','commissioners','connect','country','dark','deep','engineering','gulf','lake','message','patients','responsibility','roots','animals','citizens','depot','devastating','died','dogs','drove','environment','googling','intertidal','landscape','nursing','panic','police','pump','repaired','resources','respect','sewer','siren','soaked','source','spread','stuck','tributaries','washed','website','announcements','authority','break','buses','cellphone','cuts','debris','disadvantage','downstream','extreme','fire','food','foundation','hazard','marshside','recovery','shaking','swept','terrible','shelter','threat','trouble','uncertainty']
MATTHEW_WORDS_95  = ['storm','flooding','house','waters','surge','evacuate','hurricane','information','rains','community','leave','maps','island','news','weather','coast','damage','risk','forecast','power','rising','media','decision','preparing','emergency','mandatory','scary','wind','government','tidal','tornado','coastal','fema','scare',' service','drainage','causway','katrina','traffic','urgency','managment','mother','nieghbor','hospital','television','tropical','volutary','warning','alert','broadcasting','danger','generators','landmark','newspaper','climate',' computer','parents','radar','roof','underwater','airport','disaster','pollution','telephone','afraid','appraoching','electricity','meteorologist','survive',' wave','commissioners','gulf','devastating','died','environmet','googling','intertidal','panic','police','resources','siren','soaked','stuck','website','announcements','authority','buses',' cellphone',' debris','extreme','fire','hazard','recovery','terrible','shelter']

In [198]:
matthew['matthew160'] = matthew.text.apply(lambda text: pd.Series([x in text for x in MATTHEW_WORDS_160]).any())
matthew['matthew92']  = matthew.text.apply(lambda text: pd.Series([x in text for x in MATTHEW_WORDS_95]).any())

In [200]:
matthew.head()

Unnamed: 0,created_at,id,text,user,verb,date,matthew160,matthew92
0,2016-08-26 16:02:24,769203351401689088,"I'm at Kebab Café in Fort-de-France, Martiniqu...",Ced_BM,post,2016-08-26,False,False
1,2016-08-26 16:02:08,769203282514440193,I don't want Hollywood I want Forbes. #positiv...,greggar_d,post,2016-08-26,True,True
2,2016-08-26 16:04:23,769203849420738561,If Martinique nice one more time will have to ...,FRASTWINZMUSIC,post,2016-08-26,False,False
3,2016-08-26 16:06:31,769204385910063104,I'm at Chez Moi 😼 https://t.co/ocTN8osd3h,Amigo_stn,post,2016-08-26,False,False
4,2016-08-26 16:09:04,769205026858336256,Happy birthday to KING Valmix 👑 @djvalmix #la...,ZoekikiAyiti,post,2016-08-26,False,False


In [218]:
print("All Tweets: {0} | Users: {1}".format(len(matthew), matthew.user.nunique()))
print("Tweets in the Matthew 160 Collection: ", len(matthew.query('matthew160')))
print("Users in the Matthew 160 Collection: ", matthew.query('matthew160').user.nunique())

All Tweets: 3434483 | Users: 230807
Tweets in the Matthew 160 Collection:  335858
Users in the Matthew 160 Collection:  73949


In [219]:
print("All Tweets: {0} | Users: {1}".format(len(matthew), matthew.user.nunique()))
print("Tweets in the Matthew 92 Collection: ", len(matthew.query('matthew92')))
print("Users in the Matthew 92 Collection: ", matthew.query('matthew92').user.nunique())

All Tweets: 3434483 | Users: 230807
Tweets in the Matthew 92 Collection:  188716
Users in the Matthew 92 Collection:  50004
