In [1]:
import altair as alt
from functools import reduce
import gc
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pysqlcipher3 import dbapi2 as sqlcipher
#import seaborn as sns
import shapely

# Report the kernel's current working directory.
path = os.getcwd()
print('working dir: {}'.format(path))
# e.g. /opt/dssg-hot/notebooks/shirley

working dir: /opt/dssg-hot/notebooks/shirley


# Get and compute trip data

#### Read in hot-v3 db

In [2]:
# Open the encrypted hot-v3 database and unlock it.
# The key is read from the HOT_KEY environment variable (it must already be
# set; a missing variable raises KeyError) and passed in the x'...' form.
keynow = os.environ['HOT_KEY']
db = sqlcipher.connect('/opt/dssg-hot/data/shirleydata/sqldbs/hot-v3.db')
db.execute('pragma key="x\'{}\'"'.format(keynow))

<pysqlcipher3.dbapi2.Cursor at 0x7f9fb773bf80>

In [3]:
# Confirm the HOT_KEY unlock worked by reading a single row from `census`.
census_probe = db.execute('select * from census limit 1;')
census_probe.fetchall()

[(-9223314567028386566,
  1,
  'NEWCASTLE',
  'WA',
  98059,
  1,
  530330250051,
  33,
  25005,
  1031)]

In [None]:
# Load the joined trip table (Agg) into a DataFrame.
# tt_savings / reliab_savings are HOT minus GP, in hours; reliability is the
# 80th-minus-50th percentile travel time.
trip_columns = [
    'trip_id', 'toll', 'entry_time', 'exit_time', 'entry_plaza', 'exit_plaza',
    'is_hov', 'tag_id', 'acct', 'plate', 'id', 'plate_state', 'fips',
    'toll_group', 'entry_hour', 'peak_group', 'inc_group', 'freq_group',
    'frequency', 'dist_btwn_entry_exit_loop', 'dist_group', 'TT_GP', 'TT_HOV',
    'TT_saving as tt_savings', 'Reliability as reliab_savings',
]
trip_query = 'select ' + ', '.join(trip_columns) + ' from Agg;'
df = pd.read_sql_query(trip_query, db)

#### Compute groups to filter on

<i>Commercial vs. private via number of tag_ids per id</i>

In [None]:
# Flag commercial accounts by tag count: an id associated with more than
# 6 distinct tag_ids is treated as commercial.

# groupby excludes NA group keys automatically, so rows with a missing id
# never contribute to (or receive) the commercial flag.
id_tagid_cts = df.groupby('id')['tag_id'].nunique()
comm_ids = id_tagid_cts.loc[id_tagid_cts > 6].index.values
df['is_commercial_by_num_tags'] = df['id'].isin(comm_ids).astype(int)
# Alternative to a flag column: dfnc = df[~df['id'].isin(comm_ids)]

# Self-check: recount distinct tags among the rows flagged private; no id
# should still exceed the cutoff.
is_private_by_tags = df['is_commercial_by_num_tags'] == 0
comm_ids_test = df[is_private_by_tags].groupby('id')['tag_id'].nunique()
print('Test commercial vs. private classification; following # should be zero: '
      + str((comm_ids_test > 6).sum()))

# Usage: drop tag-count commercial trips with
#   dfnew = df[df['is_commercial_by_num_tags']==0]

<i>Commercial vs. private via number of trips per id</i>

In [None]:
# Flag commercial accounts by trip volume: an id with more than 10k trips
# is treated as commercial.

# value_counts gives the number of rows (= trips) recorded for each id.
uf_id_cts = df['id'].value_counts()
comm_by_num_trips_ids = uf_id_cts.loc[uf_id_cts > 10000].index.values
df['is_commercial_by_num_trips'] = df['id'].isin(comm_by_num_trips_ids).astype(int)

# Self-check: among rows flagged private, no id should still have >10k trips.
private_trip_cts = df.loc[df['is_commercial_by_num_trips'] == 0, 'id'].value_counts()
print('Test commerical vs. private classification by num of trips; following # should be zero: '
      + str((private_trip_cts > 10000).sum()))

# Usage: drop trip-count commercial trips with
#   dfnew = df[df['is_commercial_by_num_trips']==0]

<i>Use frequency</i>

In [None]:
# Flag one-time users: ids that appear on exactly one trip.
# Relies on uf_id_cts (trips per id) computed in the trips-per-id cell.
otu_ids = uf_id_cts.loc[uf_id_cts == 1].index.values
df['is_otu'] = df['id'].isin(otu_ids).astype(int)

# Self-check: among rows not flagged as one-time users, no id should have
# exactly one trip.
uf_ids_test = df.loc[df['is_otu'] == 0, 'id'].value_counts()
print('Test one-time use frequency classification; following # should be zero: '
      + str((uf_ids_test == 1).sum()))

# Usage: drop one-time users with
#   dfnew = df[df['is_otu']==0]

#### Filter out desired user groups (currently commercial users only)

In [None]:
# Keep only trips whose id is non-commercial under BOTH definitions (trip
# count and tag count), then discard the helper flag columns.
# drop() is chained rather than called with inplace=True: the boolean-mask
# selection is a slice of `df`, and mutating it in place raises
# SettingWithCopyWarning and has fragile view/copy semantics. The chained
# form always yields an independent frame and leaves `df` untouched, so the
# cell can be re-run safely.
dfnow = (
    df.loc[(df['is_commercial_by_num_trips'] == 0)
           & (df['is_commercial_by_num_tags'] == 0)]
    .drop(columns=['is_commercial_by_num_trips',
                   'is_commercial_by_num_tags',
                   'is_otu'])
)

# Join ACS info to cbg shapefile by fips

In [2]:
# Load the WA census block group polygons, keeping only the block-group id
# (renamed to `fips` to match the trip and ACS tables) and the geometry.
# The whole frame is reprojected with GeoDataFrame.to_crs rather than by
# overwriting the geometry column, so the frame's CRS metadata and its
# coordinates always agree. rename() is chained instead of inplace=True
# because the column selection is a slice of the file's frame, and mutating
# it in place triggers SettingWithCopyWarning.
cbgs = (
    gpd.read_file('/opt/dssg-hot/data/shapefiles/block_groups_WA/bg10.shp')[['GEOID10', 'geometry']]
    .rename(columns={'GEOID10': 'fips'})
    .to_crs(epsg=4326)
)
# GEOID10 is converted to a numeric dtype so it can be joined on `fips`.
cbgs['fips'] = pd.to_numeric(cbgs['fips'])

In [3]:
# Load the ACS 2016 block-group estimates and rename the key column to
# `fips` so it lines up with the block-group shapefile.
acs = (
    pd.read_csv('/opt/dssg-hot/data/acs/block_group_census_estimates_wide_original_bins_all_WA.csv')
    .rename(columns={'fips_code': 'fips'})
)

In [4]:
# Attach ACS attributes to the block-group polygons. Inner join on `fips`:
# block groups without a matching ACS row (and vice versa) are dropped.
cbgs_acs = cbgs.merge(acs, how='inner', on='fips')

# Load in shapefiles for drawing maps

#### Load in toll points shapefile

In [7]:
# Northbound I-405 toll points, plus the subset used as map landmarks:
# the first entry and the last exit.
tollptsnb = gpd.read_file('/opt/dssg-hot/data/shapefiles/toll_pts_405/northbound_405_toll_pts.shp')

nb_major_names = [
    'NB Entry 1 (entry plaza = NB1 aka 3)',
    'NB Exit 7 (exit plaza = NB10 aka 12)',
]
majtpsnb = tollptsnb[tollptsnb['Name'].isin(nb_major_names)]

In [8]:
# Southbound I-405 toll points, plus the subset used as map landmarks:
# the first entry and the last exit.
tollptssb = gpd.read_file('/opt/dssg-hot/data/shapefiles/toll_pts_405/southbound_405_toll_pts.shp')

sb_major_names = [
    'SB Entry 1 (entry plaza = SB1 aka 13)',
    'SB Exit 7 (exit plaza = SB10 aka 23)',
]
majtpssb = tollptssb[tollptssb['Name'].isin(sb_major_names)]

#### Load in city points shapefile

In [9]:
# Load city points and reproject the whole frame to WGS84 (EPSG:4326).
# GeoDataFrame.to_crs is used instead of overwriting the geometry column so
# the frame's CRS metadata stays in sync with its coordinates.
citypts = gpd.read_file('/opt/dssg-hot/data/shapefiles/city_points/city_points.shp').to_crs(epsg=4326)

# Cities to label on the maps. A hand-picked list is used rather than the
# shapefile's own flag (citypts['MajorCity']=='yes').
# 'Issaquah' was previously misspelled 'Issquah', which silently dropped it
# from the selection.
major_city_names = [
    'Seattle', 'Bellevue', 'Lynnwood', 'Woodinville', 'Bothell', 'Shoreline',
    'Edmonds', 'Redmond', 'Kirkland', 'Renton', 'Kent', 'Issaquah',
    'Everett', 'SeaTac',
]
majcps = citypts[citypts['NAME'].isin(major_city_names)]

# Quick visual check with labels:
# ax = majcps.plot()
# for x, y, label in zip(majcps.geometry.x, majcps.geometry.y, majcps.NAME):
#     ax.annotate(label, xy=(x, y), xytext=(3, 3), textcoords="offset points")

In [12]:
# Preview the first five city points (after reprojection to EPSG:4326).
citypts.head(n=5)

Unnamed: 0,OBJECTID,NAME,CountySeat,GNIS,LastUpdate,MajorCity,CountyFIPS,CityFIPS,geometry
0,1,Sumas,,2412000.0,2009-08-31,,73,5368330WA,POINT (-122.2649235592344 49.0000469267051)
1,2,Blaine,,2409860.0,,,73,5306505WA,POINT (-122.7516074291498 48.99389831633226)
2,3,Lynden,,2410899.0,2009-02-28,yes,73,5340805WA,POINT (-122.4481556075223 48.94426422731341)
3,4,Oroville,,2411338.0,2009-08-31,,47,5351970WA,POINT (-119.4352701819601 48.93911891696532)
4,5,Nooksack,,2411263.0,,,73,5349275WA,POINT (-122.3221118819945 48.9201504330441)


#### Load in roads shapefile

In [10]:
# Load the state-route lines and reproject the whole frame to WGS84
# (EPSG:4326). GeoDataFrame.to_crs keeps the frame's CRS metadata in sync
# with its coordinates, unlike overwriting only the geometry column.
rds500k = gpd.read_file('/opt/dssg-hot/data/shapefiles/roads500k/sr500k_20181231.shp').to_crs(epsg=4326)

# Routes drawn on the maps. The other codes are 3-character, zero-padded
# strings (e.g. '005'), so I-90 is presumably stored as '090'; the original
# '90' is kept as well in case the file uses the unpadded form.
# NOTE(review): confirm against rds500k['StateRoute'].unique().
major_routes = ['405', '005', '522', '520', '527', '090', '90']
majrds = rds500k[rds500k['StateRoute'].isin(major_routes)]

#### Load in water shapefile

In [11]:
# Load water bodies and pick out the lakes and bays drawn on the maps.
# NOTE(review): unlike citypts and rds500k, this layer is not reprojected
# here; confirm water.crs already matches EPSG:4326 before overlaying.
water = gpd.read_file('/opt/dssg-hot/data/shapefiles/water/water_bodies.shp')

major_water_names = [
    'Green Lake', 'Lake Union', 'Lake Meridian', 'Salmon Bay',
    'Lake Washington', 'Lake Sammamish', 'Lake Youngs', 'Lake Stevens',
]
majlks = water[water['NAME'].isin(major_water_names)]

In [None]:
# Load the second water layer and keep only the Puget Sound feature(s).
otherwater = gpd.read_file('/opt/dssg-hot/data/shapefiles/more_water/wtrbdy_area.shp')
is_puget_sound = otherwater['NAME'] == 'Puget Sound'
pugetsound = otherwater.loc[is_puget_sound]