# 2.6: Creating Dashboards with Python

In [8]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [12]:
# Load the trip data from the CSV next to the notebook; the first CSV column
# becomes the DataFrame index. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk (avoids mixed-type columns).
df = pd.read_csv('nycbike.csv', index_col = 0, low_memory=False)

In [13]:
# Inspect the dtypes pandas inferred while reading the CSV
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
PRCP                  float64
TAVG                  float64
tripduration_min      float64
dtype: object

In [14]:
# Target dtype for every column, keyed by column name.
# Text columns with repeated values become categoricals, the two timestamps
# become datetime64, and the weather / duration measures are stored as float32.
dtype_fixes = dict(
    ride_id="string",                # unique ride identifier
    rideable_type="category",        # small set of bike types
    started_at="datetime64[ns]",
    ended_at="datetime64[ns]",
    start_station_name="category",
    start_station_id="category",
    end_station_name="category",
    end_station_id="category",
    start_lat="float64",             # coordinates stay at full precision
    start_lng="float64",
    end_lat="float64",
    end_lng="float64",
    member_casual="category",
    PRCP="float32",                  # weather measurements
    TAVG="float32",
    tripduration_min="float32",      # derived trip length in minutes
)

In [15]:
# Cast every column listed in dtype_fixes to its target dtype in one call
df = df.astype(dtype_fixes)

In [16]:
# Confirm the casts were applied
df.dtypes

ride_id               string[python]
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name          category
start_station_id            category
end_station_name            category
end_station_id              category
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual               category
PRCP                         float32
TAVG                         float32
tripduration_min             float32
dtype: object

In [17]:
# Add a date-only column (start timestamp truncated to midnight).
# dt.normalize() keeps the column as datetime64[ns]. The previous dt.date call
# created one Python date object per row (object dtype), which is slow and
# memory-hungry on a frame this size and had to be converted back to datetime
# before the month could be extracted.
df['date'] = df['started_at'].dt.normalize()

In [18]:
# (rows, columns) after adding the date column
df.shape

(29697182, 17)

In [19]:
# Preview the first five rows
df.head()

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,PRCP,TAVG,tripduration_min,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-21,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802116,-73.96818,40.80404,-73.94592,member,0.0,21.0,8.801184,2022-01-21
2022-01-10,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673744,-73.98565,40.688488,-73.99116,member,0.0,35.0,10.821,2022-01-10
2022-01-26,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745167,-73.98683,member,0.0,28.0,13.86885,2022-01-26
2022-01-03,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783962,-73.94717,40.745167,-73.98683,member,0.0,35.0,35.037132,2022-01-03
2022-01-22,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745167,-73.98683,member,0.0,21.0,20.573851,2022-01-22


In [20]:
# List the column labels currently in the frame
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'PRCP', 'TAVG', 'tripduration_min', 'date'],
      dtype='object')

In [21]:
# Create a month column

# Make sure 'date' is a real datetime column, then pull out the calendar month
# and store it as a plain integer in a single chained step.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['month'] = df['date'].dt.month.astype('int')

In [22]:
# Create the season column

# Month -> season lookup, applied with a vectorised Series.map instead of a
# Python-level loop over every row of the frame.
# NOTE(review): the boundaries are kept exactly as they were (Dec-Apr = winter,
# May = spring, Jun-Sep = summer, Oct-Nov = fall). April counted as winter and
# a one-month spring look unusual — confirm this is intended.
season_by_month = {
    12: "winter", 1: "winter", 2: "winter", 3: "winter", 4: "winter",
    5: "spring",
    6: "summer", 7: "summer", 8: "summer", 9: "summer",
    10: "fall", 11: "fall",
}
df['season'] = df['month'].map(season_by_month)

In [23]:
# Check that 'month' and 'season' were added
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'PRCP', 'TAVG', 'tripduration_min', 'date', 'month',
       'season'],
      dtype='object')

In [39]:
import plotly.io as pio
# Choose the renderer Plotly uses when fig.show() is called in this notebook
pio.renderers.default = "notebook"   # or "notebook_connected"
# Alternative for JupyterLab:
# pio.renderers.default = "jupyterlab"


## 3. Use Plotly to produce a bar chart of the most popular stations in New York. Consider the chart layout and use what you’ve learned to customize its design.

In [47]:
## Groupby: trips per start station, then the 20 busiest

# Constant helper column: summing it within a group gives that group's trip count.
df['value'] = 1

# observed=True restricts the result to station categories that actually occur.
station_groups = df.groupby('start_station_name', as_index=False, observed=True)
df_groupby_bar = station_groups.agg({'value': 'sum'})

top20 = df_groupby_bar.nlargest(n=20, columns='value')

In [48]:
# Plain bar chart: one bar per station, bar height = number of trips.
station_bars = go.Bar(x=top20['start_station_name'], y=top20['value'])
fig_popularstation = go.Figure(data=station_bars)
fig_popularstation.show()

In [49]:
# Same bars, shaded by trip count on the 'Blues' colour scale.
shaded_bars = go.Bar(
    x=top20['start_station_name'],
    y=top20['value'],
    marker=dict(color=top20['value'], colorscale='Blues'),
)
fig_popularstation1 = go.Figure(data=shaded_bars)
fig_popularstation1.show()

In [51]:
## Bar chart

# Title, axis labels and a fixed canvas size for the shaded chart.
bar_layout = dict(
    title='Top 20 most popular bike stations in NYC',
    xaxis_title='Start stations',
    yaxis_title='Sum of trips',
    width=900,
    height=600,
)
# update_layout returns the figure, so leaving it as the last expression
# makes the cell display the updated chart.
fig_popularstation1.update_layout(**bar_layout)

## 4. Create a dual-axis line chart for the aggregated bike trips and temperatures in Plotly.

In [66]:
# Aggregate daily trip counts and average temperature.
# The df.head() output shows that the frame's index (the first CSV column) is
# named "date", and a "date" column is added as well. The string key
# groupby("date") is then ambiguous between index level and column, and pandas
# raises ValueError. Passing the column itself as the grouper removes the
# ambiguity and gives the same result when the index has a different name.
daily_trips = (
    df.groupby(df["date"])
      .agg(trip_count=("ride_id", "count"),   # rides started on that day
           TAVG=("TAVG", "mean"))             # mean of the per-trip TAVG values
      .reset_index()
)

In [69]:
# Single-axis line chart of the daily trip counts on their own.
fig_dailytrips = px.line(
    data_frame=daily_trips,
    x="date",
    y="trip_count",
    title="Daily Trips Over Time",
)
fig_dailytrips.show()


In [70]:

# Dual-axis chart: trip counts on the primary y-axis, average temperature on
# the secondary y-axis, both plotted against the date.
fig_dailytrips_temp = make_subplots(specs=[[{"secondary_y": True}]])

# (source column, legend label, draw on the secondary axis?) in draw order
line_specs = [
    ("trip_count", "Trips", False),
    ("TAVG", "Avg Temp", True),
]
for column, legend_label, on_secondary in line_specs:
    fig_dailytrips_temp.add_trace(
        go.Scatter(
            x=daily_trips["date"],
            y=daily_trips[column],
            mode="lines",
            name=legend_label,
        ),
        secondary_y=on_secondary,
    )

# Layout: chart title plus a label for each y-axis
fig_dailytrips_temp.update_layout(
    title="Daily Trips vs Temperature",
    yaxis={"title": "Trips"},
    yaxis2={"title": "Avg Temp (°F)", "overlaying": "y", "side": "right"},
)

fig_dailytrips_temp.show()


In [71]:
# Persist the daily aggregate (without the index) as Parquet.
# NOTE(review): absolute, user-specific Windows path — this cell only works on
# a machine that has this folder; consider a path relative to the project.
daily_trips.to_parquet(r"C:\Users\valev\CityBike\daily_trips.parquet", index=False)

## 5. Create a “.py” file for your Streamlit application.

In [55]:
import gc  # interface to Python's garbage collector
# Force a full collection pass; the returned number is the count of
# unreachable objects the collector found.
gc.collect()

3569

In [58]:
# Save the top 20 stations as a CSV file (index column omitted).
# NOTE(review): absolute, user-specific Windows path — not portable.

top20.to_csv(r"C:\Users\valev\CityBike\top20stations.csv", index=False)

In [65]:
# Same top-20 table written as Parquet (index column omitted)
top20.to_parquet(r"C:\Users\valev\CityBike\top20stations.parquet", index=False)

In [59]:
# List the columns available before selecting a subset for export
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'PRCP', 'TAVG', 'tripduration_min', 'date', 'month',
       'season', 'value'],
      dtype='object')

In [76]:
# Build a narrower frame that holds only the columns listed below, then write
# it to Parquet without the index.
columns_to_keep = [
    'ride_id',
    'started_at',
    'ended_at',
    'tripduration_min',
    'start_station_name',
    'end_station_name',
    'member_casual',
    'rideable_type',
    'date',
    'month',
    'season',
    'TAVG',
]
df_1 = df[columns_to_keep]

df_1.to_parquet(r"C:\Users\valev\CityBike\reduced_data_to_plot.parquet", index=False)

In [77]:
# Draw a random 8% sample of the rows; the fixed random_state makes the
# sample reproducible between runs.
small = df_1.sample(frac=0.08, random_state=32)  # ~8% of rows


In [78]:
# Row counts: sample vs. full reduced frame
len(small), len(df_1)

(2375775, 29697182)

In [79]:
# Write the 8% sample to Parquet (index omitted)
small.to_parquet(r"C:\Users\valev\CityBike\bike_trips_small.parquet", index=False)

In [80]:
# NOTE(review): this writes df_1 to the same file that the cell creating df_1
# already wrote; unless df_1 changed in between, the second write is redundant.
df_1.to_parquet(r"C:\Users\valev\CityBike\reduced_data_to_plot.parquet", index=False)

# To read later (load one of the two files):
    import pandas as pd
    df = pd.read_parquet(r"C:\Users\valev\CityBike\bike_trips_small.parquet")      # 8% random sample
    df = pd.read_parquet(r"C:\Users\valev\CityBike\reduced_data_to_plot.parquet")  # all rows, reduced columns