In [None]:
import glob
import json
from pipeline import PreparedTSD
from tqdm import tqdm_notebook
from contextlib import closing

# trip_pk values of TSDs that sound like multimodal bus/train trips.
# process_tsd_from_filename drops a trip with reportedActivityType == 5
# when its trip_pk is listed here (and it has notes).
excluded_pks = [
    "03fb3470-ea49-4ed7-be56-66134611938e",
    "0444cb36-3db7-402d-9b80-85f2b3113993",
    "0519ec56-c012-417c-b5e1-a2c8e7f080ae",
    "147bb692-f0b9-4e8c-a7ba-7a077043aada",
    "197a8e7b-7cfc-4688-b79d-2275756cb5a1",
    "23895c06-cc06-4e24-ac9a-95429da4c4b5",
    "28d6fbe9-ebc9-42fc-816b-eb9476367719",
    "333ebd72-51ea-4e40-a7f8-f532469ed81d",
    "3f8551d5-5723-4fc6-bb2d-7775b463aabf",
    "3fff1a8b-32bc-4fd9-9a03-c180874c87f9",
    "412a8bde-bbe1-4c15-9229-00813cc5e3b9",
    "41c1a396-c482-485a-b0d2-4bd378a6c655",
    "4b834e98-4898-415a-a0cd-886aa7216d36",
    "7a87ac6b-c092-49e1-8eb8-e4a064aadca9",
    "7a81fbef-aced-4ca5-ae2c-48ed3a0ab903",
    "8d0058a4-b719-4407-9a13-c54095302559",
    "8d1af856-8ba0-477f-a561-e5aa76c53313",
    "91bf0bc5-3344-45d1-a347-76ee38adb2e2",
    "924535c8-e86b-4a2d-8977-e2a0a2797b80",
    "a4d166df-8a53-41c1-87d4-75cda9def7b3",
    "a96b3c4c-557d-4ee8-8b59-03d2b7d55c58",
    "a9a0dbbd-c5c2-4928-80a0-6170109d96b9",
    "aa611005-c2df-4e8a-9de9-8dcb79c6b0ad",
    "b8d552b9-7bdb-463d-829d-d3da97d95798",
    "c2b68ff6-8720-4ad6-8606-24aad26bcd9c",
    "c351ba31-fbda-450b-b565-cf1e95760337",
    "d4c68e11-5c45-4090-82d2-d4a431b43bc7",
    "e43bcfa1-5dd7-47d3-b5cb-bdccf8d96294",
    "e894c2a4-901a-4727-aadf-e7de8d360b59",
    "ed5bb6e0-bf4f-4877-ad5e-88556b47dbfd",
]

def process_tsd_from_filename(fname):
    """Load one TSD JSON file and return the prepared trip, or None to skip it.

    The file is parsed with ``json.load`` and wrapped in ``PreparedTSD``.
    ``None`` is returned (the trip is skipped) when:

    * the file cannot be parsed or ``PreparedTSD`` raises while building it,
    * ``reportedActivityType`` is 5 (reported as a bus trip), the trip has
      notes, and its ``trip_pk`` is listed in ``excluded_pks``,
    * ``reportedActivityType`` is -1, or the trip has no events,
    * an ``IndexError`` / ``KeyError`` is raised while inspecting the trip.

    A trip with ``reportedActivityType == 4`` whose notes contain "run" is
    relabelled in place to activity type 1 before being returned.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The (possibly relabelled) ``PreparedTSD`` instance, or ``None``.

    Raises:
        IOError / OSError: if ``fname`` cannot be opened.
        Exception: any other unexpected error is printed together with its
            traceback and then re-raised.
    """
    with open(fname) as f:
        try:
            tsd = PreparedTSD(json.load(f))
        except Exception:
            # Best-effort load: malformed or unsupported files are skipped.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer silently swallowed.
            return None

    try:
        # The exclusion check below already treats notes as possibly falsy;
        # normalise once so the substring test cannot raise TypeError when
        # notes is None.
        notes = tsd.notes or ""

        if tsd.reportedActivityType == 5:
            if notes and tsd.trip_pk in excluded_pks:
                print("Excluded bus trip: " + notes)
                return None

        if "run" in notes and tsd.reportedActivityType == 4:
            print("Reclassified {} {} to 1: {}".format(
                fname, tsd.reportedActivityType, notes))
            tsd.reportedActivityType = 1

        if tsd.reportedActivityType == -1:
            return None
        if len(tsd.events) == 0:
            return None

        return tsd

    except (IndexError, KeyError):
        # Trips missing expected fields are skipped rather than reported.
        return None
    except Exception as e:
        print("Unknown exception: {}".format(e))
        import traceback
        # print_exc() takes (limit, file); the active exception is picked up
        # implicitly, so it must not be passed as the first argument.
        traceback.print_exc()
        raise
        

from multiprocessing import Pool

# Collect every TSD dump matching the glob and parse them in parallel with a
# worker pool; trips for which the worker returned None are dropped.
glob_expr = './data/tsd.*.jsonl'
files = glob.glob(glob_expr)
num_to_process = len(files)
# num_to_process = 200
tsds = {}

pool = Pool()
try:
    # Results arrive in completion order, not in file order.
    parsed_trips = pool.imap_unordered(process_tsd_from_filename,
                                       files[:num_to_process])
    for tsd in tqdm_notebook(parsed_trips, total=num_to_process):
        if tsd is None:
            continue
        # Keyed by trip_pk, so a repeated trip_pk overwrites the earlier entry.
        tsds[tsd.trip_pk] = tsd
finally:
    # Always tear the workers down, even if parsing raised.
    pool.terminate()

skipped_count = num_to_process - len(tsds)
print("Skipped {} trips".format(skipped_count))
    
# print sorted([tsd.trip_pk for tsd in tsds.values()])
# import pickle

# with open('./data/tsds.pickle', 'w') as outfile:
#     pickle.dump(tsds, outfile, pickle.HIGHEST_PROTOCOL)

In [None]:
# Minimum event.averageSpeed an event must reach to be trusted, keyed by
# reportedActivityType (1 = run, 5 = bus). Units are whatever averageSpeed
# uses -- presumably m/s, TODO confirm.
min_trusted_speed = {1: 1.5, 5: 2.2}

# Only run and bus trips are kept; each gets a ``trustedEventIndices`` list
# holding the positions (in ``tsd.events``) of the events that are not too slow.
trusted_tsds = set()
for tsd in tsds.values():
    speed_floor = min_trusted_speed.get(tsd.reportedActivityType)
    if speed_floor is None:
        # Neither a run nor a bus trip: not part of the trusted set.
        continue
    tsd.trustedEventIndices = [
        index
        for index, event in enumerate(tsd.events)
        # Written as ``not (<)`` to keep the original comparison semantics.
        if not (event.averageSpeed < speed_floor)
    ]
    trusted_tsds.add(tsd)

# The total covers every loaded trip, not only the trusted ones.
total_event_count = sum(len(tsd.events) for tsd in tsds.values())
# Reads the same attribute name that is assigned above (the original read the
# misspelled ``trustedEventIndicies`` and raised AttributeError).
trusted_event_count = sum(len(tsd.trustedEventIndices) for tsd in trusted_tsds)
print("Got {} trusted events ({} events total)".format(trusted_event_count, total_event_count))

In [None]:
import os
import pickle
# Serialise the trusted trips to disk with the highest available pickle
# protocol, then report the size of the resulting file.
pickle_filename = './trusted_tsds.pickle'
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(trusted_tsds, pickle_file, pickle.HIGHEST_PROTOCOL)

bytes_per_mb = float(2 ** 20)
pickle_size_mb = os.stat(pickle_filename).st_size / bytes_per_mb
print("pickle size: {:.4f} MB".format(pickle_size_mb))