In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

  from pkg_resources import resource_string


In [2]:
# Load the ride-level dataset (rides joined with daily weather and trip duration)
trips_parquet_path = r'C:\Data\Citibike_NY_2022\merged\df_weather_duration.parquet'
df = pd.read_parquet(trips_parquet_path)

In [3]:
# Preview the first five rides to check the columns that were loaded
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,date,AWND,PRCP,TAVG,start_lat,start_lng,end_lat,end_lng,trip_duration
0,FB33E3D8F21E2941,electric_bike,2022-01-01 01:49:37.374,2022-01-01 01:57:50.346,Canal St & Rutgers St,5303.08,Norfolk St & Broome St,5374.01,casual,2022-01-01,28,193,116,40.714275,-73.9899,40.717227,-73.988021,8.2162
1,755337295F178067,electric_bike,2022-01-01 03:21:09.754,2022-01-01 03:49:33.047,Lewis Ave & Madison St,4425.02,Columbia St & Degraw St,4422.04,member,2022-01-01,28,193,116,40.686312,-73.935775,40.68593,-74.002424,28.388217
2,C62CA87E3A475ADD,classic_bike,2022-01-01 08:38:18.156,2022-01-01 08:47:54.213,Carlton Ave & Park Ave,4732.04,Emerson Pl & Myrtle Ave,4683.02,casual,2022-01-01,28,193,116,40.695807,-73.973556,40.693631,-73.962236,9.60095
3,CD7A2098AFCD5514,classic_bike,2022-01-01 10:33:58.529,2022-01-01 10:47:05.197,W 100 St & Broadway,7580.01,W 67 St & Broadway,7116.04,casual,2022-01-01,28,193,116,40.797372,-73.970412,40.774925,-73.982666,13.111133
4,3F9E0C51F49F78A3,electric_bike,2022-01-01 20:05:19.592,2022-01-01 20:12:00.661,W 18 St & 6 Ave,6064.08,W 42 St & 6 Ave,6517.08,member,2022-01-01,28,193,116,40.739713,-73.994564,40.75492,-73.98455,6.684483


In [6]:
# Top 20 start stations by number of trips
# 1) count the rides that start at each station
trips_per_station = (
    df.groupby('start_station_name')
    .size()
    .reset_index(name='trip_count')
)
# 2) rank the stations from busiest to quietest
stations_ranked = trips_per_station.sort_values(by='trip_count', ascending=False)
# 3) keep the 20 busiest as an independent frame
top_20 = stations_ranked.head(20).copy()

In [7]:
# Preview the five busiest start stations
top_20.head(n=5)

Unnamed: 0,start_station_name,trip_count
1587,W 21 St & 6 Ave,128477
1718,West St & Chambers St,122218
495,Broadway & W 58 St,112673
286,6 Ave & W 33 St,105664
8,1 Ave & E 68 St,104370


In [10]:
# Bar chart: one bar per top-20 start station, bar height = number of trips started
station_bars = go.Bar(
    x=top_20['start_station_name'],
    y=top_20['trip_count'],
)
fig = go.Figure(data=station_bars)
fig.show()

In [11]:
# Title, axis labels and a fixed canvas size for the station bar chart.
# update_layout returns the figure, so the styled chart is this cell's output.
station_chart_layout = dict(
    title='Top 20 most popular bike stations in New York City (2022)',
    xaxis_title='Start stations',
    yaxis_title='Total trips started',
    width=900,
    height=600,
)
fig.update_layout(**station_chart_layout)

In [3]:
# Collapse the ride-level frame to one row per date so the plots below do not
# have to handle millions of rows.
# Each date keeps its number of rides plus the first recorded value of each of the
# three weather columns, renamed to readable names.
daily_aggregations = {
    'trip_count': pd.NamedAgg(column='date', aggfunc='size'),
    'temperature': pd.NamedAgg(column='TAVG', aggfunc='first'),
    'precipitation': pd.NamedAgg(column='PRCP', aggfunc='first'),
    'wind': pd.NamedAgg(column='AWND', aggfunc='first'),
}
df_weather = df.groupby('date').agg(**daily_aggregations).reset_index().copy()

In [4]:
# Parse 'date' into pandas datetimes (the .dt accessor is used on it further down)
parsed_dates = pd.to_datetime(df_weather['date'])
df_weather['date'] = parsed_dates

In [None]:
# Careful: the SQL export already stores temperature in degrees, so the conversion below must stay disabled
# df_weather['temperature'] = df_weather['temperature']/10 # changing temperature to °C


In [6]:
# Preview the first five daily rows (date, trip count and the three weather columns)
df_weather.head(n=5)

Unnamed: 0,date,trip_count,temperature,precipitation,wind
0,2022-01-01,20198,11.6,193,28
1,2022-01-02,42203,11.4,10,43
2,2022-01-03,32453,1.4,0,64
3,2022-01-04,35869,-2.7,0,39
4,2022-01-05,33689,3.2,61,34


In [30]:
# Dual-axis line plot: daily temperature (left axis) against daily trip count (right axis)

# Build the two line traces up front
temperature_line = go.Scatter(
    x=df_weather['date'],
    y=df_weather['temperature'],
    name='daily temperature',
)
rides_line = go.Scatter(
    x=df_weather['date'],
    y=df_weather['trip_count'],
    name='daily bike rides',
)

# A single subplot that carries a secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(temperature_line, secondary_y=False)
fig.add_trace(rides_line, secondary_y=True)

# Figure title
fig.update_layout(
    title_text="Line Plot of Daily Citibike Trips and Temperature - New York 2022"
)

# Axis titles: shared x-axis first, then one title per y-axis
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Temperature (Daily Average °C)", secondary_y=False)
fig.update_yaxes(title_text="Number of Citibike Trips", secondary_y=True)

fig.show()

In [9]:
###### Dual axis line plot with precipitation and total trips

# NOTE(review): the precipitation values in df_weather (e.g. 193, 10, 61) look like
# NOAA's raw unit of tenths of a millimetre - confirm against the SQL export.
# They are scaled to millimetres here, for plotting only, so that the axis title
# "Precipitation (mm)" matches the plotted numbers. df_weather itself is not modified,
# which keeps this cell safe to re-run.
precipitation_mm = df_weather['precipitation'] / 10

# Create figure with secondary y-axis
fig_3 = make_subplots(specs=[[{"secondary_y": True}]])

# Primary axis: daily precipitation
fig_3.add_trace(
    go.Scatter(x=df_weather['date'],
               y=precipitation_mm,
               name='daily precipitation',
               line=dict(color='blue')),
    secondary_y=False
)
# Reverse the precipitation axis so larger values are drawn lower on the chart
fig_3.update_yaxes(autorange='reversed', secondary_y=False)

# Secondary axis: daily number of rides
fig_3.add_trace(
    go.Scatter(x=df_weather['date'],
               y=df_weather['trip_count'],
               name='daily bike rides',
               line=dict(color='red')),
    secondary_y=True
)

# Add figure title
fig_3.update_layout(
    title_text="Line Plot of Daily Citibike Trips and Precipitation - New York 2022"
)

# Set x-axis title
fig_3.update_xaxes(title_text="Date")

# Set y-axes titles
fig_3.update_yaxes(title_text="Precipitation (mm)", secondary_y=False)
fig_3.update_yaxes(title_text="Number of Citibike Trips", secondary_y=True)

fig_3.show()
# to get it into streamlit st.plotly_chart(fig_3, use_container_width=True)


In [14]:
# Daily trip counts tagged with the weekday name of each date
df_dow = df_weather[['date', 'trip_count']].assign(
    day_of_week=lambda daily: daily['date'].dt.day_name()
)
df_dow.head()

Unnamed: 0,date,trip_count,day_of_week
0,2022-01-01,20198,Saturday
1,2022-01-02,42203,Sunday
2,2022-01-03,32453,Monday
3,2022-01-04,35869,Tuesday
4,2022-01-05,33689,Wednesday


In [15]:
# Explicit Monday-to-Sunday order for the weekday axis (it was starting with Saturday)
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_dtype = pd.CategoricalDtype(categories=dow_order, ordered=True)

# Mean number of daily trips per weekday, sorted into calendar order for plotting
avg_trips = (
    df_dow.groupby('day_of_week')['trip_count']
    .mean()
    .reset_index()
    .astype({'day_of_week': weekday_dtype})
    .sort_values('day_of_week')
)

In [19]:
# Bar chart of the average number of daily trips for each weekday
weekday_bars = go.Bar(
    x=avg_trips['day_of_week'],
    y=avg_trips['trip_count'],
)
fig_dow = go.Figure(data=weekday_bars)

# Figure title, with the y-axis pinned to the 0 - 100,000 range
fig_dow.update_layout(
    title_text="Bar Chart of Average Daily Trips by Day of the Week",
    yaxis=dict(range=[0, 100000]),
)
fig_dow.show()


In [4]:
# Keep only trips shorter than 100 minutes; anything longer is treated as an outlier.
# Note: this rebinds `df`, so cells above that read `df` see the filtered data if re-run.
is_under_100_min = df['trip_duration'] < 100
df = df[is_under_100_min].copy()

In [5]:
# Fixed-seed random sample of 100,000 trips, used for the box plot
df_sample = df.sample(n=100_000, random_state=1)

In [6]:
# Colour used for each membership type
custom_colors = dict(member='blue', casual='orange')

# Human-readable names for the plotted columns
box_labels = {
    "rideable_type": "Bike Type",
    "trip_duration": "Trip Duration (mins)",
    "member_casual": "Membership Type",
}

# Box plot of trip duration per bike type, split by membership type
fig = px.box(
    data_frame=df_sample,
    x='rideable_type',
    y='trip_duration',
    color='member_casual',
    color_discrete_map=custom_colors,
    points='outliers',
    hover_data=[],
    labels=box_labels,
    title='Trip Duration by Ride Type and Membership',
)

# Switch hover tooltips off on every trace
fig.update_traces(hoverinfo='skip', hovertemplate=None)
fig.show()

In [9]:
# Load the per-station summary table
station_summary_path = r'C:\Data\Citibike_NY_2022\merged\station_summary.csv'
station_summary = pd.read_csv(station_summary_path)

In [10]:
# Preview the first five stations of the summary table
station_summary.head(n=5)

Unnamed: 0,station_name,lat,lng,no_return_pc,daily_deps,daily_arrs
0,1 Ave & E 110 St,40.792327,-73.9383,-0.665798,58.8,59.2
1,1 Ave & E 16 St,40.732219,-73.981656,1.253811,184.2,181.9
2,1 Ave & E 18 St,40.733812,-73.980544,0.350624,193.8,193.1
3,1 Ave & E 30 St,40.741444,-73.975361,-1.440229,124.2,126.0
4,1 Ave & E 39 St,40.74714,-73.97113,-0.776484,143.3,144.4


In [13]:
# Share of stations (in %) whose no_return_pc lies between -10 and 10, bounds included
within_ten_pc = station_summary['no_return_pc'].between(-10, 10)
percentage = within_ten_pc.mean() * 100
print(percentage)

88.998899889989


In [18]:
# Histogram of no_return_pc across stations, x-axis limited to -40 .. 40
axis_title_gap = 25  # standoff between each axis and its title

fig = px.histogram(
    station_summary,
    x='no_return_pc',
    range_x=[-40, 40],
    title='Difference in Bike Returns vs Departures',
)

# Axis titles
fig.update_xaxes(title_text="Net-Loss of bikes in %", title_standoff=axis_title_gap)
fig.update_yaxes(title_text="Station Count", title_standoff=axis_title_gap)

fig.show()