In [5]:
import pandas as pd

ts_data = pd.read_parquet("../data/transformed/ts_data_2022_01.parquet")  # read in parquet-file specified in path returns data-frame
ts_data

Unnamed: 0,pickup_hour,num_of_rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
191203,2022-01-31 19:00:00,0,176
191204,2022-01-31 20:00:00,0,176
191205,2022-01-31 21:00:00,0,176
191206,2022-01-31 22:00:00,0,176


In [6]:
# trying transformation for 1 location-43
# ts_data.pickup_location_id == 43: creates a boolean mask where each row is checked to see if its pcikup-location-id column is 43
# loc[]: is used to select rows based on the boolean mask, : means all columns for the rows where the pickup-location-id is 43 are selected
# reset_index(): resets the index of the df, when a df is filtered, the original row indices are retained, this method reassignss the row indices to start from 0
# drop=ture: means the old-index is not added as a column in the new df.
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43, :].reset_index(drop=True)  # the filtered and reindexed df is stored in var
ts_data_one_location.head(15)

Unnamed: 0,pickup_hour,num_of_rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [7]:
# given data-frame, number of features for model, and stpe-size
def get_cutoff_indices(data, n_features, step_size): 
    stop_position = len(data)-1  # stop-pos is last-index-row

    subseq_first_idx = 0            # starting index init at 0th index
    subseq_mid_idx = n_features     # mid-index is the number of features-index-row
    subseq_last_idx = n_features+1  # last-index is one after that which is target
    indicies = []                   # stores triplets of indices for each exmaple (first, mid, last) where first to mid is the features, mid to last is the targets

    # while the last-index has not reached end of df
    while subseq_last_idx <= stop_position:
        # add triplet of indices (a,b,c), a to b is features, b to c is target
        indicies.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))

        subseq_first_idx += step_size  # update all indices by step-size, to move to and collect next example
        subseq_mid_idx += step_size
        subseq_last_idx += step_size

    return indicies


In [8]:
# sample to test get-cutoff-function
n_features = 24
step_size = 1

indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)

indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [9]:
# implement slicing using these indicies
import numpy as np

n_examples = len(indices)   # number of examples is number of triplets in indicies-arr
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32) # creating x-train-data create numpy-arr of shape where each row is an example and each col is a feature, empty initlize array
y = np.ndarray(shape=(n_examples), dtype=np.float32)             # creating y-train-label where shape is 1D-arr, where each element is label for 1 example
pickup_hours = []

# iterate every triplet-index-group
for i, idx in enumerate(indices):
    # set training-x-data's ith row equal to, slice dataframe from first-indx in group to mid-index in cur-group in the num-of-rides col and get its values/features for that example
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["num_of_rides"].values
    # set train-y-data's ith element equal to, slice dataframe from mid-indx to last-indx in cur-group in the num-of-rides col and get teh target-value, for the above features
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["num_of_rides"].values
    # get the row at position idx[1], select pickup-hour-col and add it to list
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]["pickup_hour"])

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]["num_of_rides"].values


In [10]:
print(f"{x.shape}")  # shape of training-data
print(f"{x=}")
print(f"{pickup_hours[:5]=}")

(719, 24)
x=array([[ 97.,  60.,  22., ...,  16.,  18.,   6.],
       [ 60.,  22.,   8., ...,  18.,   6.,   3.],
       [ 22.,   8.,   6., ...,   6.,   3.,   1.],
       ...,
       [ 28.,  16.,  13., ..., 102.,  66.,  61.],
       [ 16.,  13.,   8., ...,  66.,  61.,  73.],
       [ 13.,   8.,   1., ...,  61.,  73.,  33.]], dtype=float32)
pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


In [11]:
# convert numpy-data-x into data-frame
# columns argument specidies the column names for the dataframe, iterate the feature-indicies, create a columns for each the number of rides of that hour 
features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
features_one_location
# each row is an example, each col is a feature

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,97.0,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,...,70.0,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0
1,60.0,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,...,94.0,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0
2,22.0,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,...,87.0,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0
3,8.0,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,...,73.0,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0
4,6.0,5.0,3.0,10.0,7.0,19.0,24.0,39.0,35.0,77.0,...,34.0,32.0,22.0,16.0,18.0,6.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,52.0,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,...,78.0,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0
715,36.0,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,...,74.0,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0
716,28.0,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,...,66.0,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0
717,16.0,13.0,8.0,1.0,1.0,2.0,1.0,1.0,4.0,9.0,...,91.0,117.0,100.0,106.0,147.0,121.0,102.0,66.0,61.0,73.0


In [12]:
# convert numpy-data-y-arr into data-frame
# columns argument specifies the column names for the dataframe, which is just 1 the number of rides in the next hour, because its 1D-list
targets_one_location = pd.DataFrame(y, columns=[f"target_rides_next_hour"])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
714,66.0
715,61.0
716,73.0
717,33.0


In [13]:
from tqdm import tqdm
# NOTE: try to understand previous objects above to understand this

# MY CODE DOESN'T WORK FOR SOME REASON, BUT IT HAS THE COMMENTS
def transform_ts_data_into_features_and_target(ts_data, input_seq_len, step_size):

    location_ids = ts_data["pickup_location_id"].unique()  # get the unqiue locaiton-ids in the columns
    features = pd.DataFrame()  # create empty df
    targets = pd.DataFrame()   # create empty df

    for location_id in tqdm(location_ids):
        # for all the rows in ts-data-df that have pickup-loc-id equal to cur-location-id select its pickup-hour adn num-of-rides columns
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id, ["pickup_hour", "num_of_rides"]]

        # get list of indicies-triplet for cur-location passing ts-dataframe for cur-location and input-size 
        indices = get_cutoff_indices(ts_data_one_location, input_seq_len, step_size) 

        n_examples = len(indices)  # get number of examples
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)  # initlize empty array of size (examples, input-nodes) for train-x-data
        y = np.ndarray(shape=(n_examples), dtype=np.float32)                 # initlize empty array 1D of size examples for train-y-data
        pickup_hours = []

        # iterate through all triplet-indices for cur location which we can get all of the examples (features and target) per triplet group
        for i, idx in enumerate(indices):
            # set ith row in x-dataframe equal to the cur-ts-loc-data sliced from start-indx to mid-indx of cur-idx-triplet, get the num-of-rides column, gets features
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]["num_of_rides"].values
            # set ith row of y-dataframe equal to cur-ts-loc-data sliced from mid-indx to last-indx of cur-idx-triplet, get the num-of-rides column gets targets
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
            # get the row at position idx[1] of cur-loc-dataframe, select pickup-hour-col and add it to pickup-horus list, this is done for every example-indicies-group for every location 
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # convert x-numpy-arr to data-frame specifying the columns-names of df, iterating through all feature-input-node-indiceis, and creating column in dataframe for each
        features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))])
        # set the pickup-hour-col for cur-location-feature-dataframe equal to the pickup-hours-list
        features_one_location["pickup_hour"] = pickup_hours
        # set the pickup_location_id-col of cur-location-feature-dataframe equal to the cur-location-id
        features_one_location["pickup_location_id"] = location_id

        # convert y-numpy-arr to data-frame specifying the columns-name of df which is just one
        targets_one_location = pd.DataFrame(y, columns=[f"target_rides_next_hour"])

        # concatenate/add the features for cur-location to features-total-df, same for targets
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])

    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    # return features-dataframe and targets-column
    return features, targets["target_rides_next_hour"]


In [14]:
features, targets = transform_ts_data_into_features_and_target(ts_data, input_seq_len=24*7+1, step_size=24)

print(f"features: {features.shape=}")  # (examples, features)
print(f"targets: {targets.shape=}")     # (examples)

  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['num_of_rides'].values
  y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['

features: features.shape=(6168, 171)
targets: targets.shape=(6168,)



