In [4]:
import pandas as pd
import numpy as np
from keplergl import KeplerGl
from pyproj import CRS
from matplotlib import pyplot as plt
import os
import geopandas as gpd

  from pkg_resources import resource_string


In [7]:
# Load the ride-level table (trip fields plus AWND/PRCP/TAVG weather columns) from the local parquet export.
trips_parquet_path = r'C:\Data\Citibike_NY_2022\merged\df_weather_duration.parquet'
df = pd.read_parquet(trips_parquet_path)

In [8]:
# Preview the first five rides to check the columns loaded as expected
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,date,AWND,PRCP,TAVG,trip_duration
0,FB33E3D8F21E2941,electric_bike,2022-01-01 01:49:37.374,2022-01-01 01:57:50.346,Canal St & Rutgers St,Norfolk St & Broome St,40.714275,-73.9899,40.717227,-73.988021,casual,2022-01-01,28,193,116,8.2162
1,755337295F178067,electric_bike,2022-01-01 03:21:09.754,2022-01-01 03:49:33.047,Lewis Ave & Madison St,Columbia St & Degraw St,40.686312,-73.935775,40.68593,-74.002424,member,2022-01-01,28,193,116,28.388217
2,C62CA87E3A475ADD,classic_bike,2022-01-01 08:38:18.156,2022-01-01 08:47:54.213,Carlton Ave & Park Ave,Emerson Pl & Myrtle Ave,40.695807,-73.973556,40.693631,-73.962236,casual,2022-01-01,28,193,116,9.60095
3,CD7A2098AFCD5514,classic_bike,2022-01-01 10:33:58.529,2022-01-01 10:47:05.197,W 100 St & Broadway,W 67 St & Broadway,40.797372,-73.970412,40.774925,-73.982666,casual,2022-01-01,28,193,116,13.111133
4,3F9E0C51F49F78A3,electric_bike,2022-01-01 20:05:19.592,2022-01-01 20:12:00.661,W 18 St & 6 Ave,W 42 St & 6 Ave,40.739713,-73.994564,40.75492,-73.98455,member,2022-01-01,28,193,116,6.684483


In [9]:
# Size of the loaded frame as (rows, columns)
df.shape

(29767925, 16)

In [10]:
# Attach per-station ride totals to every ride row before aggregating to station pairs:
# trips_from = rides grouped by the row's start station, trips_to = rides grouped by its end station.
for station_col, total_col in (
    ('start_station_name', 'trips_from'),
    ('end_station_name', 'trips_to'),
):
    # transform('count') broadcasts each group's count back onto the original rows
    df[total_col] = df.groupby(station_col)[station_col].transform('count')

df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,date,AWND,PRCP,TAVG,trip_duration,trips_from,trips_to
0,FB33E3D8F21E2941,electric_bike,2022-01-01 01:49:37.374,2022-01-01 01:57:50.346,Canal St & Rutgers St,Norfolk St & Broome St,40.714275,-73.9899,40.717227,-73.988021,casual,2022-01-01,28,193,116,8.2162,64510,71018
1,755337295F178067,electric_bike,2022-01-01 03:21:09.754,2022-01-01 03:49:33.047,Lewis Ave & Madison St,Columbia St & Degraw St,40.686312,-73.935775,40.68593,-74.002424,member,2022-01-01,28,193,116,28.388217,19047,11641
2,C62CA87E3A475ADD,classic_bike,2022-01-01 08:38:18.156,2022-01-01 08:47:54.213,Carlton Ave & Park Ave,Emerson Pl & Myrtle Ave,40.695807,-73.973556,40.693631,-73.962236,casual,2022-01-01,28,193,116,9.60095,7862,28761
3,CD7A2098AFCD5514,classic_bike,2022-01-01 10:33:58.529,2022-01-01 10:47:05.197,W 100 St & Broadway,W 67 St & Broadway,40.797372,-73.970412,40.774925,-73.982666,casual,2022-01-01,28,193,116,13.111133,38595,50030
4,3F9E0C51F49F78A3,electric_bike,2022-01-01 20:05:19.592,2022-01-01 20:12:00.661,W 18 St & 6 Ave,W 42 St & 6 Ave,40.739713,-73.994564,40.75492,-73.98455,member,2022-01-01,28,193,116,6.684483,67638,67407


In [11]:
# Side-by-side look at each station name next to its ride total
station_total_cols = ['start_station_name', 'trips_from', 'end_station_name', 'trips_to']
df.loc[:, station_total_cols].head()

Unnamed: 0,start_station_name,trips_from,end_station_name,trips_to
0,Canal St & Rutgers St,64510,Norfolk St & Broome St,71018
1,Lewis Ave & Madison St,19047,Columbia St & Degraw St,11641
2,Carlton Ave & Park Ave,7862,Emerson Pl & Myrtle Ave,28761
3,W 100 St & Broadway,38595,W 67 St & Broadway,50030
4,W 18 St & 6 Ave,67638,W 42 St & 6 Ave,67407


In [12]:
# Largest values of the two per-station total columns
df.loc[:, ['trips_from', 'trips_to']].max(axis=0)

trips_from    128822
trips_to      130178
dtype: int64

In [13]:
# Aggregate rides into one row per (start station, end station) pair, keeping
# only the columns needed for the map.
# Coordinates recorded for the same station differ slightly between rides, so
# each pair takes the most common value of every coordinate column.
def most_common_value(values):
    """Return the first mode of ``values``, or None when there is no mode.

    The mode is computed once per call so each group/column pays for a single
    ``Series.mode`` evaluation.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


df_trips = (
    df.groupby(['start_station_name', 'end_station_name'])
      .agg({
          'ride_id': 'count',                # number of rides on this station pair
          'start_lat': most_common_value,
          'start_lng': most_common_value,
          'end_lat': most_common_value,
          'end_lng': most_common_value,
          # per-station totals were computed per start/end station, so within a
          # pair they are constant and 'first' simply carries the value over
          'trips_to': 'first',
          'trips_from': 'first'})
      .reset_index()
      .rename(columns={'ride_id': 'num_trips'})
      # sort_values already returns a new frame, so no extra copy is needed
      .sort_values(by=['num_trips'],
                   ascending=False)
)

In [14]:
# Show the 20 busiest station pairs (frame is already sorted by num_trips, descending)
df_trips.iloc[:20]

Unnamed: 0,start_station_name,end_station_name,num_trips,start_lat,start_lng,end_lat,end_lng,trips_to,trips_from
294967,Central Park S & 6 Ave,Central Park S & 6 Ave,12041,40.765909,-73.976342,40.765909,-73.976342,87226,86777
147753,7 Ave & Central Park South,7 Ave & Central Park South,8541,40.766741,-73.979069,40.766741,-73.979069,74814,75212
782275,Roosevelt Island Tramway,Roosevelt Island Tramway,8213,40.757284,-73.9536,40.757284,-73.9536,20433,19902
548176,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287,40.764397,-73.973715,40.764397,-73.973715,80811,80785
800488,Soissons Landing,Soissons Landing,7275,40.692317,-74.014866,40.692317,-74.014866,15143,14937
896925,W 21 St & 6 Ave,9 Ave & W 22 St,6345,40.74174,-73.994156,40.745497,-74.001971,84105,128822
119316,5 Ave & E 72 St,5 Ave & E 72 St,6037,40.772828,-73.966853,40.772828,-73.966853,69631,69102
6539,1 Ave & E 62 St,1 Ave & E 68 St,5826,40.761227,-73.96094,40.765005,-73.958185,105121,68106
1013378,Yankee Ferry Terminal,Yankee Ferry Terminal,5759,40.687066,-74.016756,40.687066,-74.016756,12867,12840
255081,Broadway & W 58 St,Broadway & W 58 St,5509,40.766953,-73.981693,40.766953,-73.981693,110312,114040


One noticeable thing is that the top 5 routes all start and end at the same station.

In [15]:
# check number of trips
# Aggregation must conserve rides: the per-pair counts have to add up to the
# number of rows in the ride-level frame. Assert it explicitly instead of
# relying on a visual comparison of two printed numbers.
total_trips = df_trips['num_trips'].sum()
assert total_trips == len(df), (
    f"aggregation lost rides: {total_trips} aggregated vs {len(df)} original"
)
print(total_trips)

29767925


The total equals the 29,767,925 rows of the pre-aggregation DataFrame, so no trips were lost during aggregation.

## Plotting with KeplerGl

In [16]:
# Inspect the column dtypes of the aggregated frame before passing it to the map
df_trips.dtypes

start_station_name     object
end_station_name       object
num_trips               int64
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
trips_to                int64
trips_from              int64
dtype: object

In [17]:
# Show the working directory that the relative "../" paths are resolved against
current_dir = os.getcwd()
print(current_dir)

c:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Python_visualisation\CitiBike_NY\notebooks


In [18]:
# Read the subway line geometries from the GeoJSON one level above the working directory
subway_lines_path = "../subway_lines.geojson"
subway_lines = gpd.read_file(subway_lines_path)

In [19]:
# Read the subway station GeoJSON one level above the working directory
subway_stations_path = "../subway_stations.geojson"
subway_stations = gpd.read_file(subway_stations_path)

In [26]:
# Read the NTA polygons that already carry the merged population and income columns,
# then list the column dtypes to confirm which fields came through as numeric
ntas_path = "C:/Data/Citibike_NY_2022/merged/nta_pop_inc.geojson"
ntas = gpd.read_file(ntas_path)
ntas.dtypes

shape_area            object
ntaname               object
cdtaname              object
shape_leng            object
boroname              object
ntatype               object
nta2020               object
borocode              object
countyfips            object
ntaabbrev             object
cdta2020              object
GeoID                 object
median_hh_income     float64
population           float64
area_km2             float64
pop_density          float64
geometry            geometry
dtype: object

In [27]:
# Build the map widget with the station-pair trips as its initial dataset
m = KeplerGl(height=700, data={"data_1": df_trips})

# Add the context datasets in a fixed order: subway lines, subway stations, NTA polygons
for layer_name, layer_frame in (
    ("Subway Lines", subway_lines),
    ("Subway Stations", subway_stations),
    ("Income and Population by NTA", ntas),
):
    m.add_data(layer_frame, name=layer_name)

# Last expression renders the widget in the notebook
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':                         start_station_name                   end_station_name  \
2949…

In [28]:
# save settings
# Keep the map's current configuration in a variable so it can be written to
# disk and passed to the HTML export
config = m.config

In [29]:
import json

# Serialise the captured map configuration to a JSON file in the working directory
config_json = json.dumps(config)
with open("config_layers.json", "w") as config_file:
    config_file.write(config_json)

In [30]:
# Export the map, with the captured configuration, as an HTML file that stays editable
html_export_path = '../visualisations/trips_layers_pop.html'
m.save_to_html(file_name=html_export_path, read_only=False, config=config)

Map saved to ../visualisations/trips_layers_pop.html!
