In [3]:
import pandas as pd

# Load the validated January 2022 rides from the parquet file into a DataFrame.
rides = pd.read_parquet(path="../data/transformed/validated_rides_2022_01.parquet")

# Preview the first ten rows; each row of the DataFrame represents a single ride.
rides.head(n=10)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
5,2022-01-01 00:40:15,138
6,2022-01-01 00:20:50,233
7,2022-01-01 00:13:04,238
8,2022-01-01 00:30:02,166
9,2022-01-01 00:48:52,236


In [6]:
# Add a `pickup_hour` column holding each pickup timestamp truncated to the
# start of its hour, because the time series is modelled at hourly frequency.
# `.dt` exposes the datetime accessors of the `pickup_datetime` column and
# `floor` rounds down to the given frequency. The lowercase "h" alias is used
# because the uppercase "H" alias is deprecated in recent pandas releases.
rides["pickup_hour"] = rides["pickup_datetime"].dt.floor("h")
rides

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2022-01-01 00:35:40,142,2022-01-01 00:00:00
1,2022-01-01 00:33:43,236,2022-01-01 00:00:00
2,2022-01-01 00:53:21,166,2022-01-01 00:00:00
3,2022-01-01 00:25:21,114,2022-01-01 00:00:00
4,2022-01-01 00:36:48,68,2022-01-01 00:00:00
...,...,...,...
2463926,2022-01-31 23:36:53,90,2022-01-31 23:00:00
2463927,2022-01-31 23:44:22,107,2022-01-31 23:00:00
2463928,2022-01-31 23:39:00,113,2022-01-31 23:00:00
2463929,2022-01-31 23:36:42,148,2022-01-31 23:00:00


In [13]:
# Count the rides for every (pickup_hour, pickup_location_id) combination.
#   groupby(...)      -> puts rows sharing the same hour and location in one group
#   size()            -> number of rows (rides) per group, as a Series indexed
#                        by the two grouping columns
#   reset_index(name) -> turns that Series back into a DataFrame and names the
#                        count column "num_of_rides"
app_rides = (
    rides
    .groupby(["pickup_hour", "pickup_location_id"])
    .size()
    .reset_index(name="num_of_rides")
)
app_rides

Unnamed: 0,pickup_hour,pickup_location_id,num_of_rides
0,2022-01-01 00:00:00,4,11
1,2022-01-01 00:00:00,7,6
2,2022-01-01 00:00:00,10,1
3,2022-01-01 00:00:00,12,2
4,2022-01-01 00:00:00,13,12
...,...,...,...
66863,2022-01-31 23:00:00,261,4
66864,2022-01-31 23:00:00,262,8
66865,2022-01-31 23:00:00,263,26
66866,2022-01-31 23:00:00,264,24


In [15]:
# (location, hour) combinations with no rides do not appear in the table above, so we add them back with a ride count of zero
import tqdm  # adds loading-bar for for-loops

def add_missing_slots(
    agg_rides: pd.DataFrame,
    rides_col: str = "num_of_rides",
) -> pd.DataFrame:
    """Add zero-count rows for every (location, hour) slot that has no rides.

    Parameters
    ----------
    agg_rides : pd.DataFrame
        Aggregated rides with the columns ``pickup_hour``,
        ``pickup_location_id`` and ``rides_col``; one row per
        (hour, location) pair that had at least one ride.
    rides_col : str, default "num_of_rides"
        Name of the column holding the ride count.

    Returns
    -------
    pd.DataFrame
        Columns ``pickup_hour``, ``rides_col`` and ``pickup_location_id``.
        For each location id (in order of first appearance in ``agg_rides``)
        there is one row per hour between the smallest and largest
        ``pickup_hour``, with 0 in ``rides_col`` for slots absent from the input.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``agg_rides`` contains duplicate (location, hour) pairs.
    """
    output_columns = ["pickup_hour", rides_col, "pickup_location_id"]

    if agg_rides.empty:
        # min()/max() of an empty column are NaT, which date_range rejects.
        return pd.DataFrame(columns=output_columns)

    # Make sure the hours are real datetimes so they match the generated range.
    pickup_hours = pd.to_datetime(agg_rides["pickup_hour"])
    location_ids = agg_rides["pickup_location_id"].unique()

    # Complete range of hourly slots from the first to the last observed hour.
    full_range = pd.date_range(pickup_hours.min(), pickup_hours.max(), freq="h")

    # Every (location, hour) combination that must exist in the result,
    # ordered location by location.
    full_index = pd.MultiIndex.from_product(
        [location_ids, full_range],
        names=["pickup_location_id", "pickup_hour"],
    )

    # A single reindex fills all missing slots with 0; this replaces the
    # per-location loop that concatenated one frame at a time.
    counts = (
        agg_rides.assign(pickup_hour=pickup_hours)
        .set_index(["pickup_location_id", "pickup_hour"])[rides_col]
        .reindex(full_index, fill_value=0)
    )

    return counts.reset_index()[output_columns]


