In [1]:
import pandas as pd

rides = pd.read_parquet("../data/transformed/rides_2022_01.parquet")

In [2]:
rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68


In [3]:
rides["pickup_hour"] = rides['pickup_datetime'].dt.floor('h')

In [4]:
rides.head()

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2022-01-01 00:35:40,142,2022-01-01
1,2022-01-01 00:33:43,236,2022-01-01
2,2022-01-01 00:53:21,166,2022-01-01
3,2022-01-01 00:25:21,114,2022-01-01
4,2022-01-01 00:36:48,68,2022-01-01


In [5]:
app_rides = rides.groupby(["pickup_hour", "pickup_location_id"]).size().reset_index()
app_rides.rename(columns = {0: 'rides'}, inplace = True)

In [6]:
app_rides.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01,4,11
1,2022-01-01,7,6
2,2022-01-01,10,1
3,2022-01-01,12,2
4,2022-01-01,13,12


In [7]:
unique_days = app_rides["pickup_hour"].dt.floor('D').unique()

# Generate all hours of the day for each unique date
full_range = pd.concat([
    pd.DataFrame({"pickup_hour": pd.date_range(start=day, end=day + pd.Timedelta(days=1), freq="H")[:-1]})
    for day in unique_days
], ignore_index=True)

  pd.DataFrame({"pickup_hour": pd.date_range(start=day, end=day + pd.Timedelta(days=1), freq="H")[:-1]})


In [8]:
full_range

Unnamed: 0,pickup_hour
0,2022-01-01 00:00:00
1,2022-01-01 01:00:00
2,2022-01-01 02:00:00
3,2022-01-01 03:00:00
4,2022-01-01 04:00:00
...,...
739,2022-01-31 19:00:00
740,2022-01-31 20:00:00
741,2022-01-31 21:00:00
742,2022-01-31 22:00:00


In [9]:
from tqdm import tqdm

def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:

    location_ids = agg_rides["pickup_location_id"].unique()
    unique_days = agg_rides["pickup_hour"].dt.floor('D').unique()

# Generate all hours of the day for each unique date
    full_range = pd.concat([
        pd.DataFrame({"pickup_hour": pd.date_range(start=day, end=day + pd.Timedelta(days=1), freq="H")[:-1]})
        for day in unique_days
    ], ignore_index=True)
    
    output = pd.DataFrame()

    for location_id in tqdm(location_ids):
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location_id, ['pickup_hour', 'rides']]

        agg_rides_i.set_index("pickup_hour", inplace=True)
        agg_rides_i.index = pd.to_datetime(agg_rides_i.index)

        agg_rides_i = agg_rides_i.reindex(full_range['pickup_hour'], method="ffill")

        agg_rides_i["pickup_location_id"] = location_id
        output = pd.concat([output, agg_rides_i])

    output = output.reset_index().rename(columns={"index": "pickup_hour"})
    output['rides'] = output['rides'].fillna(0)

    return output

In [10]:
agg_rides_all_slots = add_missing_slots(app_rides)

  pd.DataFrame({"pickup_hour": pd.date_range(start=day, end=day + pd.Timedelta(days=1), freq="H")[:-1]})
  0%|          | 0/257 [00:00<?, ?it/s]

100%|██████████| 257/257 [00:00<00:00, 351.57it/s]


In [11]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
        rides: pd.DataFrame,
        locations: Optional[List[int]] = None
):
    rides_to_plot = rides[rides.pickup_location_id.isin(locations)] if locations else rides

    fig = px.line(
        rides_to_plot,
        x="pickup_hour",
        y="rides",
        color="pickup_location_id"
    )

    # fig.show(
    #     rides_to_plot,
    #     x="pickup_hour",
    #     y="rides",
    #     color="pickup_location_id",
    #     template = 'none'
    # )

    fig.show()

In [13]:
plot_rides(agg_rides_all_slots,locations = [100])