## Import Necessary Libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

from dask.distributed import Client, LocalCluster
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import SGDRegressor

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## Loading data

In [7]:
df = pd.read_csv('../../3_Data/processed/2025_hourly_all_clean.csv')
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_demand,taxi_demand
0,2025-01-01 00:00:00,9132,7344
1,2025-01-01 01:00:00,8996,8468
2,2025-01-01 02:00:00,7364,7257
3,2025-01-01 03:00:00,4904,4915
4,2025-01-01 04:00:00,3015,2918


In [8]:
df.tail()

Unnamed: 0,tpep_pickup_datetime,passenger_demand,taxi_demand
6566,2025-09-30 19:00:00,9595,9779
6567,2025-09-30 20:00:00,8882,9539
6568,2025-09-30 21:00:00,9048,9965
6569,2025-09-30 22:00:00,7026,8001
6570,2025-09-30 23:00:00,3984,4587


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6571 entries, 0 to 6570
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tpep_pickup_datetime  6571 non-null   object
 1   passenger_demand      6571 non-null   int64 
 2   taxi_demand           6571 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 154.1+ KB


In [10]:
df.describe()

Unnamed: 0,passenger_demand,taxi_demand
count,6571.0,6571.0
mean,5407.537513,5449.31152
std,3190.897504,3055.996053
min,0.0,0.0
25%,2208.5,2574.5
50%,6037.0,6016.0
75%,7957.0,7582.0
max,12923.0,14039.0


In [15]:
df.loc[0, 'tpep_pickup_datetime'].weekofyear

1

In [16]:
df.loc[0, 'tpep_pickup_datetime'].dayofyear

1

In [18]:
# let's set "tpep_pickup_datetime" as index
data = df.set_index('tpep_pickup_datetime').copy()
data

Unnamed: 0_level_0,passenger_demand,taxi_demand
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-01-01 00:00:00,9132,7344
2025-01-01 01:00:00,8996,8468
2025-01-01 02:00:00,7364,7257
2025-01-01 03:00:00,4904,4915
2025-01-01 04:00:00,3015,2918
...,...,...
2025-09-30 19:00:00,9595,9779
2025-09-30 20:00:00,8882,9539
2025-09-30 21:00:00,9048,9965
2025-09-30 22:00:00,7026,8001


In [19]:
data.index.month

Index([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       ...
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9],
      dtype='int32', name='tpep_pickup_datetime', length=6571)

In [20]:
def add_timed_features(df: pd.DataFrame) -> pd.DataFrame:
    df['hour'] = df.index.hour
    df['day'] = df.index.day
    df['month'] = df.index.month
    df['dayofweek'] = df.index.dayofweek
    return df

In [21]:
add_timed_features(data)

Unnamed: 0_level_0,passenger_demand,taxi_demand,hour,day,month,dayofweek
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-01 00:00:00,9132,7344,0,1,1,2
2025-01-01 01:00:00,8996,8468,1,1,1,2
2025-01-01 02:00:00,7364,7257,2,1,1,2
2025-01-01 03:00:00,4904,4915,3,1,1,2
2025-01-01 04:00:00,3015,2918,4,1,1,2
...,...,...,...,...,...,...
2025-09-30 19:00:00,9595,9779,19,30,9,1
2025-09-30 20:00:00,8882,9539,20,30,9,1
2025-09-30 21:00:00,9048,9965,21,30,9,1
2025-09-30 22:00:00,7026,8001,22,30,9,1
