In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

  from pkg_resources import resource_string


In [2]:
# Load the merged trips + weather dataset from parquet.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
trips_parquet_path = r'C:\Data\Citibike_NY_2022\merged\df_weather_duration.parquet'
df = pd.read_parquet(trips_parquet_path)

In [6]:
# Top 20 start stations by number of trips.
# Step 1: one row per start station with the number of rows (trips) it has.
station_counts = df.groupby('start_station_name').size().reset_index(name='trip_count')

# Step 2: order stations from most to fewest trips.
ranked_stations = station_counts.sort_values(by='trip_count', ascending=False)

# Step 3: keep the 20 busiest stations as an independent frame.
top_20 = ranked_stations.head(20).copy()

In [7]:
# Preview the five busiest stations (rows are already sorted by trip_count, descending)
top_20.head(5)

Unnamed: 0,start_station_name,trip_count
1587,W 21 St & 6 Ave,128477
1718,West St & Chambers St,122218
495,Broadway & W 58 St,112673
286,6 Ave & W 33 St,105664
8,1 Ave & E 68 St,104370


In [10]:
# Bar chart of the top 20 stations: one bar per start station,
# bar height = number of trips started there.
station_bars = go.Bar(
    x=top_20['start_station_name'],
    y=top_20['trip_count'],
)
fig = go.Figure(station_bars)
fig.show()

In [11]:
# Add the title, axis labels and a fixed size to the bar chart created above.
# The update_layout call is kept as the last expression of the cell so its
# return value is what the notebook displays.
bar_layout_options = dict(
    title='Top 20 most popular bike stations in New York City (2022)',
    xaxis_title='Start stations',
    yaxis_title='Total trips started',
    width=900,
    height=600,
)
fig.update_layout(**bar_layout_options)

In [26]:
# Collapse the trip-level frame to one row per day so later plots work on
# daily rows instead of millions of individual trips.
# Output column -> (source column, aggregation):
#   trip_count counts the rows in each day's group;
#   the three weather columns take the first value found in each day's group
#   (presumably weather is constant within a day - confirm against the merge step).
daily_aggregations = {
    'trip_count': ('date', 'size'),
    'temperature': ('TAVG', 'first'),
    'precipitation': ('PRCP', 'first'),
    'wind': ('AWND', 'first'),
}

df_weather = df.groupby('date').agg(**daily_aggregations).reset_index().copy()

In [27]:
# Convert the 'date' column to pandas datetime and write it back to the same column
parsed_dates = pd.to_datetime(df_weather['date'])
df_weather['date'] = parsed_dates

In [28]:
# Rescale temperature to °C by dividing by 10
# (presumably TAVG is stored in tenths of a degree - confirm against the data source).
# NOTE(review): this overwrites the column with a value derived from itself, so
# running this cell more than once divides by 10 each time.
df_weather['temperature'] = df_weather['temperature'].div(10)


In [29]:
# Preview the first five rows of the daily frame
df_weather.head(5)

Unnamed: 0,date,trip_count,temperature,precipitation,wind
0,2022-01-01,20050,11.6,193,28
1,2022-01-02,41927,11.4,10,43
2,2022-01-03,32378,1.4,0,64
3,2022-01-04,35802,-2.7,0,39
4,2022-01-05,33606,3.2,61,34


In [30]:
# Dual-axis line chart: daily temperature on the primary y-axis and
# daily trip count on the secondary y-axis, both plotted against date.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# One line per series, sharing the same x values (dates)
temperature_line = go.Scatter(
    x=df_weather['date'],
    y=df_weather['temperature'],
    name = 'daily temperature',
)
rides_line = go.Scatter(
    x=df_weather['date'],
    y=df_weather['trip_count'],
    name='daily bike rides',
)

# Temperature goes on the primary axis, bike rides on the secondary axis
fig.add_trace(temperature_line, secondary_y=False)
fig.add_trace(rides_line, secondary_y=True)

# Figure title
fig.update_layout(
    title_text="Line Plot of Daily Citibike Trips and Temperature - New York 2022"
)

# Axis titles: shared x-axis, then one title per y-axis
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Temperature (Daily Average °C)", secondary_y=False)
fig.update_yaxes(title_text="Number of Citibike Trips", secondary_y=True)

fig.show()