# Script that creates a visualisation of the top 1000 Citibike routes in 2022 and saves the config and visualisation for the dashboard

In [1]:
import json
import os

import numpy as np
import pandas as pd
from keplergl import KeplerGl
from matplotlib import pyplot as plt
from pyproj import CRS

  from pkg_resources import resource_string


In [2]:
# Location of the merged trip/weather parquet file. Defaults to the original local path;
# set the CITIBIKE_TRIPS_PARQUET environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    'CITIBIKE_TRIPS_PARQUET',
    r'C:\Data\Citibike_NY_2022\merged\df_weather_duration.parquet',
)
df = pd.read_parquet(DATA_PATH)

In [3]:
# Preview the first rows of the loaded data to check the available columns
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,date,AWND,PRCP,TAVG,start_lat,start_lng,end_lat,end_lng,trip_duration
0,FB33E3D8F21E2941,electric_bike,2022-01-01 01:49:37.374,2022-01-01 01:57:50.346,Canal St & Rutgers St,5303.08,Norfolk St & Broome St,5374.01,casual,2022-01-01,28,193,116,40.714275,-73.9899,40.717227,-73.988021,8.2162
1,755337295F178067,electric_bike,2022-01-01 03:21:09.754,2022-01-01 03:49:33.047,Lewis Ave & Madison St,4425.02,Columbia St & Degraw St,4422.04,member,2022-01-01,28,193,116,40.686312,-73.935775,40.68593,-74.002424,28.388217
2,C62CA87E3A475ADD,classic_bike,2022-01-01 08:38:18.156,2022-01-01 08:47:54.213,Carlton Ave & Park Ave,4732.04,Emerson Pl & Myrtle Ave,4683.02,casual,2022-01-01,28,193,116,40.695807,-73.973556,40.693631,-73.962236,9.60095
3,CD7A2098AFCD5514,classic_bike,2022-01-01 10:33:58.529,2022-01-01 10:47:05.197,W 100 St & Broadway,7580.01,W 67 St & Broadway,7116.04,casual,2022-01-01,28,193,116,40.797372,-73.970412,40.774925,-73.982666,13.111133
4,3F9E0C51F49F78A3,electric_bike,2022-01-01 20:05:19.592,2022-01-01 20:12:00.661,W 18 St & 6 Ave,6064.08,W 42 St & 6 Ave,6517.08,member,2022-01-01,28,193,116,40.739713,-73.994564,40.75492,-73.98455,6.684483


In [4]:
# Aggregate the rides into one row per route (start station -> end station) with the number
# of trips, keeping only the variables needed for the map. Coordinates are reduced to their
# most common value per route in case they differ between rides.
def _most_common(values):
    """Return the most frequent value of ``values``, or None if it has no (non-null) values.

    ``Series.mode()`` is evaluated once per call; on ties the first mode it returns is used.
    """
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# Note: groupby leaves out rows whose start or end station name is missing.
df_trips = (
    df.groupby(['start_station_name', 'end_station_name'])
      .agg(
          num_trips=('ride_id', 'count'),
          start_lat=('start_lat', _most_common),
          start_lng=('start_lng', _most_common),
          end_lat=('end_lat', _most_common),
          end_lng=('end_lng', _most_common),
      )
      .reset_index()
      # busiest routes first; sort_values already returns a new frame, so no extra copy is needed
      .sort_values(by=['num_trips'], ascending=False)
)

In [5]:
# Show the 20 routes with the most trips (df_trips is sorted by num_trips, descending)
df_trips.head(20)

Unnamed: 0,start_station_name,end_station_name,num_trips,start_lat,start_lng,end_lat,end_lng
292588,Central Park S & 6 Ave,Central Park S & 6 Ave,12041,40.765909,-73.976342,40.765909,-73.976342
147007,7 Ave & Central Park South,7 Ave & Central Park South,8541,40.766741,-73.979069,40.766741,-73.979069
777093,Roosevelt Island Tramway,Roosevelt Island Tramway,8213,40.757284,-73.9536,40.757284,-73.9536
544121,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287,40.764397,-73.973715,40.764397,-73.973715
795202,Soissons Landing,Soissons Landing,7275,40.692317,-74.014866,40.692317,-74.014866
891295,W 21 St & 6 Ave,9 Ave & W 22 St,6345,40.74174,-73.994156,40.745497,-74.001971
118879,5 Ave & E 72 St,5 Ave & E 72 St,6037,40.772828,-73.966853,40.772828,-73.966853
6514,1 Ave & E 62 St,1 Ave & E 68 St,5826,40.761227,-73.96094,40.765005,-73.958185
1006565,Yankee Ferry Terminal,Yankee Ferry Terminal,5759,40.687066,-74.016756,40.687066,-74.016756
253043,Broadway & W 58 St,Broadway & W 58 St,5509,40.766953,-73.981693,40.766953,-73.981693


14 of the 20 top routes start and end at the same station.

In [6]:
# (rows, columns) of the aggregated frame; each row is one start/end station pair
df_trips.shape

(1006566, 7)

In [7]:
# Sanity check: the trips counted per route should add up to the number of rows in the raw data
aggregated_trip_count = df_trips['num_trips'].sum()
raw_row_count = len(df)
print(aggregated_trip_count)
print(raw_row_count)

29767925
29767925


Aggregation worked properly: the trip counts per route sum to the number of rows in the original data.

In [8]:
# What share of all trips started and ended at the same station?

# All trips across every route
total_trips = df_trips['num_trips'].sum()

# Trips on routes whose start and end station are identical
is_round_trip = df_trips['start_station_name'] == df_trips['end_station_name']
round_trips = df_trips.loc[is_round_trip, 'num_trips'].sum()

# Expressed as a percentage of all trips
round_trip_percent = (round_trips / total_trips) * 100

print(f"Percentage of round trips: {round_trip_percent:.2f}%")

Percentage of round trips: 3.28%


In [9]:
# Restrict the data to visualise to the 1000 routes with the most trips
top_1000 = (
    df_trips
    .sort_values(by='num_trips', ascending=False)
    .iloc[:1000]
    .copy()
)

In [10]:
# Create the kepler.gl map with the top routes as its single dataset ("data_1") and display it
m = KeplerGl(
    height=700,
    data={"data_1": top_1000},
)
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':                         start_station_name                   end_station_name  \
2925…

In [11]:
# Keep the map's settings (m.config) so they can be written to disk and passed to the HTML export.
# NOTE(review): this presumably reflects the widget's state when the cell runs, including styling
# done by hand in the map above — confirm, as a fresh "Run All" would then capture a different config.
config = m.config

In [12]:
# Write the captured map configuration to a JSON file so the dashboard can reuse it.
# (`json` is imported together with the other modules in the import cell at the top.)
with open("config_routes.json", "w", encoding="utf-8") as outfile:
    json.dump(config, outfile)

In [13]:
# Export the map, with the captured config applied, as an HTML file for the dashboard.
html_path = '../visualisations/routes.html'

# Create the output folder if it is missing, so the export does not depend on it
# having been created by hand beforehand.
os.makedirs(os.path.dirname(html_path), exist_ok=True)

m.save_to_html(file_name=html_path,
               read_only=False,
               config=config)

Map saved to ../visualisations/routes.html!
