# Filter tweets by time and type and write to file

In [10]:
import json, datetime, iso8601, pytz
from urllib.parse import quote

In [11]:
class TweetFileHandler:
    """Split tweet features into before/during/after files by timestamp.

    On construction three files are opened for writing in the current
    directory (``before.geojsonl``, ``during.geojsonl``, ``after.geojsonl``);
    existing files with those names are truncated. ``process_tweet`` writes
    each tweet as one JSON line to the file matching its time window and
    bumps the corresponding ``*_count`` attribute. ``count_by_box`` is an
    independent tally of tweets per set of Gnip matching rules.

    The handler can be used as a context manager so the three files are
    closed even if processing raises::

        with TweetFileHandler() as handler:
            handler.process_tweet(tweet)
    """

    def __init__(self):
        self.before       = open('before.geojsonl','w')
        self.during       = open('during.geojsonl','w')
        self.after        = open('after.geojsonl', 'w')
        self.before_count = 0
        self.during_count = 0
        self.after_count  = 0
        self.LANDFALL = datetime.datetime(2012,10,29,20,0,0,tzinfo=pytz.utc) #Landfall as UTC
        self.AFTER    = datetime.datetime(2012,11,4,0,0,0,tzinfo=pytz.utc)  #After as UTC
        # Maps a JSON-encoded list of matching rules -> number of tweets seen.
        self.gnip_bounding_boxes = {}

    def __enter__(self):
        """Return the handler itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output files; exceptions are not suppressed."""
        self.close()

    def write_before(self,tweet):
        """Append ``tweet`` as one JSON line to the 'before' file and count it."""
        self.before_count += 1
        self.before.write(json.dumps(tweet)+"\n")

    def write_during(self,tweet):
        """Append ``tweet`` as one JSON line to the 'during' file and count it."""
        self.during_count += 1
        self.during.write(json.dumps(tweet)+"\n")

    def write_after(self,tweet):
        """Append ``tweet`` as one JSON line to the 'after' file and count it."""
        self.after_count += 1
        self.after.write(json.dumps(tweet)+"\n")

    def process_tweet(self, tweet):
        """Write ``tweet`` to the file matching its timestamp.

        ``tweet`` must be a dict with a ``'properties'`` dict holding an
        ISO 8601 ``'time'`` string and a ``'text'`` string. The dict is
        modified in place before being written: the ``'gnip'`` property is
        removed when present and ``'text'`` is percent-encoded with
        ``urllib.parse.quote``.

        Routing: ``time < LANDFALL`` -> before; ``LANDFALL <= time < AFTER``
        -> during; otherwise -> after.

        Raises ``KeyError`` if ``'properties'``, ``'time'`` or ``'text'`` is
        missing, plus whatever ``iso8601.parse_date`` raises for an
        unparsable timestamp.
        """
        properties = tweet['properties']
        # NOTE(review): the comparisons below need a timezone-aware datetime;
        # presumably iso8601.parse_date always returns one - confirm.
        time = iso8601.parse_date(properties['time'])
        # Tolerate tweets that carry no Gnip metadata instead of raising.
        properties.pop('gnip', None)
        properties['text'] = quote(properties['text'])
        if time < self.LANDFALL:
            self.write_before(tweet)
        elif time < self.AFTER:
            self.write_during(tweet)
        else:
            self.write_after(tweet)

    def count_by_box(self,tweet):
        """Increment the tally for this tweet's Gnip matching rules.

        The key is the JSON encoding of
        ``tweet['properties']['gnip']['matching_rules']``, so this must be
        called on a tweet that still has its ``'gnip'`` property (i.e. not
        after ``process_tweet`` has removed it from the same dict).
        """
        key = json.dumps(tweet['properties']['gnip']['matching_rules'])
        self.gnip_bounding_boxes[key] = self.gnip_bounding_boxes.get(key, 0) + 1

    def close(self):
        """Close the three output files."""
        self.before.close()
        self.during.close()
        self.after.close()

In [None]:
# Stream the combined tweet file (one JSON object per line) and route every
# tweet into the before/during/after output files, up to `threshold` tweets.
count = 0
threshold = 10000000
handler = TweetFileHandler()
try:
    with open('../result_data/all_tweets_one_file.jsonl','r') as inFile:
        for line in inFile:
            if count >= threshold:
                break
            # Alternative pass: tally tweets per Gnip rule instead of splitting by time.
#             handler.count_by_box(json.loads(line.strip()))
            handler.process_tweet(json.loads(line.strip()))
            count += 1
            # Progress output: a dot every 100,000 tweets ...
            if count % 100000 == 0:
                print(".",end="")
                # ... and the running total every 1,000,000 tweets. The
                # previous test (`count % 10000`, nested under the check
                # above) was always falsy, so the total was never printed.
                if count % 1000000 == 0:
                    print(count,end="")
finally:
    # Close the output files even if a line fails to parse or process.
    handler.close()

print("")
print(handler.before_count)
print(handler.during_count)
print(handler.after_count)
# for k,v in handler.gnip_bounding_boxes.items():
#     print(json.loads(k)[0]['tag'], v)

In [71]:
# Turn the per-rule tweet tallies collected by handler.count_by_box into a
# GeoJSON FeatureCollection with one rectangular polygon per tally entry,
# then write it to gnip_bboxes_with_counts.geojson.
output = {'type':'FeatureCollection','features':[]}
for rule_key, tweet_total in handler.gnip_bounding_boxes.items():
    # Each key is a JSON-encoded list of rules; only the first rule is used.
    first_rule = json.loads(rule_key)[0]
    # Drop the first 14 characters and the last one, then split on whitespace.
    # NOTE(review): presumably this strips a 'bounding_box:[' ... ']' wrapper
    # around four numbers - confirm against the rule format.
    tokens = first_rule['value'][14:-1].split()
    left = float(tokens[0])
    bottom = float(tokens[1])
    right = float(tokens[2])
    top = float(tokens[3])
    # Closed ring: lower-left, upper-left, upper-right, lower-right, lower-left.
    ring = [
        [left, bottom],
        [left, top],
        [right, top],
        [right, bottom],
        [left, bottom],
    ]
    feature = {
        'type':'Feature',
        'geometry':{'type':'Polygon','coordinates':[ring]},
        'properties':{'center':first_rule['tag'],'tweets':tweet_total},
    }
    output['features'].append(feature)
with open('gnip_bboxes_with_counts.geojson','w') as outFile:
    outFile.write(json.dumps(output))