In [None]:
import numpy as np
import pandas as pd
import geopandas as gp
import os
import subprocess
import shapely as shp
import pylab as py

from datetime import datetime
from itertools import compress
from fiona.crs import from_epsg

%pylab inline

Code was written on the PUI2016_Python2 Kernel


Read in citibike data

In [None]:
# Start from an empty frame; the loading cell below assigns the trip data to `bike`.
bike = pd.DataFrame()


In [None]:
# Change the range to load more months, e.g. range(1, 13) for all of 2014.
###WARNING. LOAD ALL CITIBIKE SETS TAKES A LONG TIME###
monthly_trips = []
for i in range(1, 2):
    # Zero-pad the month so single- and double-digit months both resolve to the
    # real file names (201401-..., 201410-...). The old else-branch produced
    # names such as "2014010-..." for months 10-12.
    getbike = pd.read_csv(os.getenv('PUIDATA') + "/" + ('2014%02d' % i + '-citibike-tripdata.csv'))
    monthly_trips.append(getbike)

# Concatenate once after the loop. The previous `getbike.append(getbike, ...)`
# stacked each month onto itself (duplicating every trip) and threw away the
# months read in earlier iterations.
if monthly_trips:
    bike = pd.concat(monthly_trips, ignore_index=True)
else:
    bike = pd.DataFrame()

#Check the data

In [None]:
# Show the first two loaded trips as a sanity check.
bike.head(2)

In [None]:
# Number of trip rows loaded.
len(bike)

Keep Only Subscribers

In [None]:
# Restrict the trips to rows whose usertype is 'Subscriber'.
bike2 = bike.loc[bike['usertype'].eq('Subscriber')]

In [None]:
# Show the first two subscriber trips.
bike2.head(2)

In [None]:
# Number of subscriber trips remaining after the filter.
len(bike2)

Keep only stations open in January 2014 or earlier

In [None]:
# Read the January 2014 trip file from the PUIDATA directory; it is used below
# to find which stations already existed in that month.
jan_csv = '20140' + str(1) + '-citibike-tripdata.csv'
jan_stations = pd.read_csv("/".join([os.getenv('PUIDATA'), jan_csv]))

Get the stations that existed in January by ID (assuming every station was used at least once)

In [None]:
# Distinct start-station ids that appear in the January file.
oldstations = pd.unique(jan_stations['start station id'])

In [None]:
# How many distinct start stations appear in the January file.
len(jan_stations['start station id'].unique())

Cut out stations 

In [None]:
# Keep only trips that start at a station seen in the January file.
# .copy() gives bike3 its own data, so the column assignments in the following
# cells do not trigger SettingWithCopyWarning on a filtered slice.
bike3 = bike2[bike2['start station id'].isin(oldstations)].copy()

In [None]:
# Number of distinct start stations left after the station filter.
len(bike3['start station id'].unique())

Convert to timestamp format

In [None]:
# Parse the starttime strings into datetimes. assign() returns a new frame, so
# nothing is written into a filtered slice (no SettingWithCopyWarning) and the
# cell can be re-run safely.
bike3 = bike3.assign(timestamp=pd.to_datetime(bike3["starttime"]))

Get the Day from the time

In [None]:
# Day of week of each trip start (Monday=0 ... Sunday=6). assign() returns a new
# frame instead of writing into a filtered slice.
bike3 = bike3.assign(dow=bike3['timestamp'].dt.dayofweek)

I just want the weekdays

In [None]:
# dow runs Monday=0 ... Sunday=6, so values up to 4 are Monday through Friday.
bike3 = bike3.loc[bike3['dow'].le(4)]

I have to set the index equal to the timestamp to separate times out.

In [None]:
# Use the parsed start time as the index; between_time() in the next step
# selects rows by the time of day of the index.
bike3.index = bike3.timestamp


Keep only the time I want

In [None]:
# Keep trips that start between 05:00 and 12:00. Both endpoints are included by
# default, so the include_start/include_end flags (no longer accepted by newer
# pandas releases) are simply omitted.
bikecommute = bike3.between_time('5:00:00', '12:00:00')

# Drop the timestamp index and go back to a plain 0..n-1 index.
bikecommute = bikecommute.reset_index(drop=True)

In [None]:
# Show the first two morning trips.
bikecommute.head(2)


Create the stations dataframe

In [None]:
# One row per trip with the start station's id and coordinates. .copy() makes
# `stations` independent of bikecommute so that modifying it in place later does
# not raise SettingWithCopyWarning.
stations = bikecommute[['start station id', 'start station latitude', 'start station longitude']].copy()

In [None]:
# Show the first rows of the station columns (still one row per trip here).
stations.head()

Drop any duplicates
https://pandas-docs.github.io/pandas-docs-travis/generated/pandas.Series.drop_duplicates.html

In [None]:
# Collapse repeated (id, latitude, longitude) rows. Rebinding the result avoids
# inplace=True on a frame that was sliced out of bikecommute, and keeps the cell
# safe to re-run.
stations = stations.drop_duplicates()

In [None]:
# Show the first seven stations after removing duplicates.
stations.head(7)

Set the DataFrame for Rides. Get count in the morning
https://pandas-docs.github.io/pandas-docs-travis/generated/pandas.DataFrame.count.html


In [None]:
# Count morning trips per start station: every row of bikecommute is one trip,
# so counting the non-null bikeid values per station gives the number of rides.
commutetrips = (
    bikecommute[['start station id', 'bikeid']]
    .rename(columns={'bikeid': 'rides'})
    .groupby(['start station id'], as_index=False)
    .count()
)


In [None]:
# Show the first per-station ride counts.
commutetrips.head()

Add the ride counts to the stations DataFrame

https://pandas-docs.github.io/pandas-docs-travis/generated/pandas.DataFrame.merge.html

In [None]:
# Attach the ride counts to the station rows, matching on the shared
# 'start station id' column; the outer join keeps ids found in either frame.
stations = pd.merge(stations, commutetrips, how='outer', on='start station id')

In [None]:
# Show the first five stations with their ride counts.
stations.head(5)

Rename the columns so that I can use them below

In [None]:
# Positional rename: relies on the columns being, in order,
# start station id, latitude, longitude, rides.
stations.columns = ['start_station', 'lat', 'lon', 'rides']

In [None]:
# Show one row to confirm the new column names.
stations.head(1)

Combine Lat and lon into a shapely point

From Dr. Kashuk's Lab

https://github.com/fedhere/PUI2016_fb55/blob/master/Lab9_SRK325/GeospatialAnalysis_CitiBike.ipynb

In [None]:
# combine lat and lon to one column
# Pair every station's longitude with its latitude as a (lon, lat) tuple.
# list(...) materialises the pairs: on Python 3 a bare zip is a one-shot iterator
# and cannot be stored as a column; on Python 2 this is unchanged.
stations['lonlat'] = list(zip(stations.lon, stations.lat))

In [None]:
# Adapted from Dr. Kashuk's lab.
# Build a shapely Point from each (lon, lat) tuple in the lonlat column.
stations['geometry'] = stations['lonlat'].map(lambda lonlat: shp.geometry.Point(lonlat))
stations.head()

Remove the unneeded columns to make the processing faster

In [None]:
# Keep only the station id, the ride count and the point geometry.
stations = stations.loc[:, ['start_station', 'rides', 'geometry']]

In [None]:
# Show the trimmed stations frame.
stations.head()

Loading Census Tract Shape File

From Dr. Kashuk's Lab

In [None]:
# Locate the nycb2010 shapefile under the PUIDATA directory and read it with geopandas.
shape_path = "/".join([os.getenv("PUIDATA"), "nycb2010_16c/nycb2010.shp"])
nyc_shape = gp.read_file(shape_path)


Set the coordinate system (Dr. Kashuk)

In [None]:

# Assigning .crs only relabels the coordinates (nothing is reprojected); the
# to_crs() call below then converts from that declared CRS to EPSG:2263.
# NOTE(review): this overrides whatever CRS was read from the shapefile -
# confirm the file really stores lon/lat degrees before trusting the result.
nyc_shape.crs = from_epsg(4326) # epsg=4326: lat/lon | 26918: NAD83/UTM zone 18N | epsg=2263 is US feet
nyc_shape = nyc_shape.to_crs(epsg=2263)


In [None]:
# Show the first two rows of the reprojected shapes.
nyc_shape.head(2)

Change to Manhattan Only

In [None]:
# Keep only the rows whose BoroName is 'Manhattan'. .copy() makes the subset
# independent of nyc_shape so the BCTCB2010 column can be rewritten below
# without SettingWithCopyWarning.
manhat_shape = nyc_shape[nyc_shape.BoroName == 'Manhattan'].copy()

In [None]:
# Show the first two Manhattan rows.
manhat_shape.head(2)

Sync with LODES data

In [None]:
# Prefix every BCTCB2010 code with '36061' so it can be compared with the LODES
# geocodes further down. Re-running this cell prepends the prefix a second time.
# NOTE(review): LODES block geocodes are presumably 15 digits; if BCTCB2010
# already begins with a borough digit the prefixed value would be one character
# too long and never match - TODO confirm against the data.
manhat_shape.BCTCB2010 = ('36061' + manhat_shape.BCTCB2010).astype(str)

In [None]:
# Show the first two rows with the prefixed block codes.
manhat_shape.head(2)

In [None]:
# Inspect the column's dtype through the Series attribute rather than the bare
# `dtype` name, which only exists because `%pylab inline` star-imports numpy.
manhat_shape['BCTCB2010'].dtype

In [None]:
# Keep only the block code, borough name and geometry columns. The frame must
# be indexed with the column list; assigning the bare nested list replaced the
# GeoDataFrame with a plain Python list and broke every later cell.
manhat_shape = manhat_shape[['BCTCB2010', 'BoroName', 'geometry']]

In [None]:
# Store the block codes as strings so they compare equal to the string LODES geocodes.
manhat_shape['BCTCB2010'] = manhat_shape['BCTCB2010'].astype(str)

Read In LODES Data

In [None]:
# Read the 2014 LODES origin-destination file from the PUIDATA directory.
lodes = pd.read_csv("/".join([os.getenv("PUIDATA"), "ny_od_main_JT00_2014.csv"]))

In [None]:
# Show the first LODES rows.
lodes.head()

In [None]:
# Inspect the column's dtype through the Series attribute rather than the bare
# `dtype` name, which only exists because `%pylab inline` star-imports numpy.
lodes['w_geocode'].dtype

In [None]:

# Convert the work geocode to strings so it can be matched against the
# string block codes in manhat_shape.
lodes['w_geocode'] = lodes['w_geocode'].astype(str)

In [None]:
# Convert the home geocode to strings as well, for the same comparison.
lodes['h_geocode'] = lodes['h_geocode'].astype(str)


In [None]:
# Keep only the work geocode, home geocode and the S000 column.
lodes = lodes.loc[:, ['w_geocode', 'h_geocode', 'S000']]

Keep LODES with only geocodes in Manhattan

In [None]:
# Keep only the rows whose home AND work geocodes both appear among the
# Manhattan block codes.
home_in_mn = lodes.h_geocode.isin(manhat_shape.BCTCB2010)
work_in_mn = lodes.w_geocode.isin(manhat_shape.BCTCB2010)
lodesMN = lodes[home_in_mn & work_in_mn]

# Renumber the surviving rows 0..n-1. The old `range(len(lodes))` used the length
# of the unfiltered frame and raised a length-mismatch error whenever any row
# had been filtered out.
lodesMN = lodesMN.reset_index(drop=True)
lodesMN.head()