In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Load the merged trip-level dataset (trips with weather and duration) from parquet
df = pd.read_parquet(
    path=r'C:\Data\Citibike_NY_2022\merged\df_weather_duration.parquet'
)

# Creating a station-level summary DataFrame to plot stations as points

In [3]:
# Outbound trips (departures): one row per start station with its trip count
trips_from = (
    df.groupby('start_station_name')
    .size()
    .reset_index(name='trips_from')
)

# Inbound trips (arrivals): one row per end station with its trip count
trips_to = (
    df.groupby('end_station_name')
    .size()
    .reset_index(name='trips_to')
)

# Station-level summary. The outer join keeps stations that appear only as a
# start or only as an end; the side without a match is left as NaN.
station_summary = trips_from.merge(
    trips_to,
    how='outer',
    left_on='start_station_name',
    right_on='end_station_name',
)

In [4]:
# Preview the first rows of the outer-merged departures/arrivals table
station_summary.head()

Unnamed: 0,start_station_name,trips_from,end_station_name,trips_to
0,1 Ave & E 110 St,21478.0,1 Ave & E 110 St,21621
1,1 Ave & E 16 St,67235.0,1 Ave & E 16 St,66392
2,1 Ave & E 18 St,70731.0,1 Ave & E 18 St,70483
3,1 Ave & E 30 St,45340.0,1 Ave & E 30 St,45993
4,1 Ave & E 39 St,52287.0,1 Ave & E 39 St,52693


In [5]:
# Check the (rows, columns) dimensions of the merged table
station_summary.shape

(1818, 4)

In [6]:
# Single station-name column: use the start-station name, falling back to the
# end-station name on rows where the start name is missing
station_summary = station_summary.assign(
    station_name=lambda s: s['start_station_name'].combine_first(s['end_station_name'])
)

In [7]:
# Keep only the unified name and the two count columns, with the name first
station_summary = station_summary.loc[:, ['station_name', 'trips_from', 'trips_to']]

In [8]:
# Re-check dimensions after the column selection
station_summary.shape

(1818, 3)

In [9]:
# A station with only inbound or only outbound trips has NaN in the other
# count after the outer merge; treat that as zero trips and store the counts
# as integers
station_summary = (
    station_summary
    .fillna({'trips_from': 0, 'trips_to': 0})
    .astype({'trips_from': int, 'trips_to': int})
)

In [10]:
# Collect (station_name, lat, lng) observations from both ends of every trip.
# Each mapping lists the trip-level columns to take and the common names they
# are renamed to, so start and end rows can be stacked into one frame.
start_to_common = {'start_station_name': 'station_name', 'start_lat': 'lat', 'start_lng': 'lng'}
end_to_common = {'end_station_name': 'station_name', 'end_lat': 'lat', 'end_lng': 'lng'}

station_coords = pd.concat(
    [df[list(mapping)].rename(columns=mapping) for mapping in (start_to_common, end_to_common)]
)

In [11]:
# Getting the most common coords per station
    # I already made uniform for start and end in main cleaning notebook, this step makes sure coords are uniform across start and end
def _most_common(values):
    """Return the most frequent value of a Series.

    Falls back to the first entry when ``mode()`` is empty (``Series.mode``
    drops missing values by default, so this happens when every entry is NaN).
    """
    modes = values.mode()  # computed once per group/column instead of twice
    return modes.iloc[0] if not modes.empty else values.iloc[0]


# NOTE(review): lat and lng are reduced independently, so the resulting pair is
# the most common lat together with the most common lng — confirm that this is
# acceptable if a station ever has several distinct coordinate pairs.
station_coords_clean = (
    station_coords
    .groupby('station_name')[['lat', 'lng']]
    .agg(_most_common)
    .reset_index()
)

In [12]:
# Merging coordinates onto the summary stats; the left join keeps every
# station row of the summary even if no coordinates are found for it
station_summary = pd.merge(
    station_summary,
    station_coords_clean,
    how='left',
    on='station_name',
)

In [13]:
# Preview the summary with the lat/lng columns attached
station_summary.head()

Unnamed: 0,station_name,trips_from,trips_to,lat,lng
0,1 Ave & E 110 St,21478,21621,40.792327,-73.9383
1,1 Ave & E 16 St,67235,66392,40.732219,-73.981656
2,1 Ave & E 18 St,70731,70483,40.733812,-73.980544
3,1 Ave & E 30 St,45340,45993,40.741444,-73.975361
4,1 Ave & E 39 St,52287,52693,40.74714,-73.97113


In [14]:
# Re-check dimensions after merging in the coordinates
station_summary.shape

(1818, 5)

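Measuring how strongly each station acts as a net sender or receiver of bikes: the difference between outbound and inbound trips, expressed as a percentage of outbound trips (positive = more departures than arrivals, negative = more arrivals than departures)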

In [15]:
# No-return percentage: (departures - arrivals) as a % of departures.
# Zero departures are mapped to NaN in the denominator, so those stations get
# a missing value instead of a division by zero.
departures_nonzero = station_summary['trips_from'].replace(0, np.nan)
net_outflow = station_summary['trips_from'] - station_summary['trips_to']
station_summary['no_return_pc'] = net_outflow * 100 / departures_nonzero

In [16]:
# Round the percentage to one decimal place (other columns are untouched),
# then preview the result
station_summary = station_summary.round({'no_return_pc': 1})
station_summary.head()

Unnamed: 0,station_name,trips_from,trips_to,lat,lng,no_return_pc
0,1 Ave & E 110 St,21478,21621,40.792327,-73.9383,-0.7
1,1 Ave & E 16 St,67235,66392,40.732219,-73.981656,1.3
2,1 Ave & E 18 St,70731,70483,40.733812,-73.980544,0.4
3,1 Ave & E 30 St,45340,45993,40.741444,-73.975361,-1.4
4,1 Ave & E 39 St,52287,52693,40.74714,-73.97113,-0.8


In [17]:
# Summary statistics for the numeric columns, including the new no_return_pc
station_summary.describe()

Unnamed: 0,trips_from,trips_to,lat,lng,no_return_pc
count,1818.0,1818.0,1818.0,1818.0,1739.0
mean,16373.99615,16373.99615,40.748353,-73.956375,0.017021
std,19655.406382,19677.423804,0.059887,0.039713,6.635691
min,0.0,1.0,40.633385,-74.086701,-113.8
25%,2535.25,2497.75,40.701065,-73.987741,-1.4
50%,8156.0,8149.0,40.742777,-73.953245,-0.3
75%,24343.75,24507.0,40.794882,-73.922651,1.1
max,128822.0,130178.0,40.88226,-73.88145,50.5


Creating columns that express trips as average daily figures (yearly totals divided by 365) for easier interpretation

In [18]:
# Convert the trip totals into average trips per day (total / 365, rounded to
# one decimal), then drop the totals so only the daily figures remain
station_summary = (
    station_summary
    .assign(
        daily_deps=lambda s: s['trips_from'].div(365).round(1),
        daily_arrs=lambda s: s['trips_to'].div(365).round(1),
    )
    .drop(columns=['trips_from', 'trips_to'])
)

In [19]:
# Summary statistics again, now with daily_deps/daily_arrs in place of the totals
station_summary.describe()

Unnamed: 0,lat,lng,no_return_pc,daily_deps,daily_arrs
count,1818.0,1818.0,1739.0,1818.0,1818.0
mean,40.748353,-73.956375,0.017021,44.859461,44.860286
std,0.059887,0.039713,6.635691,53.850202,53.911984
min,40.633385,-74.086701,-113.8,0.0,0.0
25%,40.701065,-73.987741,-1.4,6.925,6.825
50%,40.742777,-73.953245,-0.3,22.35,22.3
75%,40.794882,-73.922651,1.1,66.7,67.15
max,40.88226,-73.88145,50.5,352.9,356.7


In [20]:
# Preview the final table before exporting it
station_summary.head()

Unnamed: 0,station_name,lat,lng,no_return_pc,daily_deps,daily_arrs
0,1 Ave & E 110 St,40.792327,-73.9383,-0.7,58.8,59.2
1,1 Ave & E 16 St,40.732219,-73.981656,1.3,184.2,181.9
2,1 Ave & E 18 St,40.733812,-73.980544,0.4,193.8,193.1
3,1 Ave & E 30 St,40.741444,-73.975361,-1.4,124.2,126.0
4,1 Ave & E 39 St,40.74714,-73.97113,-0.8,143.3,144.4


In [21]:
# Export the station summary as CSV, without writing the row index
station_summary.to_csv(
    path_or_buf=r'C:\Data\Citibike_NY_2022\merged\station_summary.csv',
    index=False,
)