In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from utils.data_utils import get_full_df

In [17]:
SEED = 117
TOTAL_DAYS = 400
START_POI = 0
END_POI = 400
TRAIN_RATIO = 0.7
VALID_RATIO = 0.2
TEST_RATIO = 1 - (TRAIN_RATIO + VALID_RATIO)
DATASET = 'Houston'
START_DATE = '2018-12-31'
WINDOW_SIZE = 24
HORIZON = 6

csv_path = f'/home/users/arash/datasets/safegraph/weekly_patterns_2019-01-07_2020-06-08_{DATASET}.csv'
poi_info_csv_path = '/home/users/arash/datasets/safegraph/core_poi_info_2019-01-07_2020-06-08.csv'


In [3]:
df = get_full_df(csv_path_weekly=csv_path, 
                         poi_info_csv_path=poi_info_csv_path, 
                         start_row=START_POI, end_row=END_POI, 
                         total_days=TOTAL_DAYS,
                         city=DATASET)

core_poi-part2.csv
core_poi-part5.csv
core_poi-part4.csv
core_poi-part3.csv
core_poi-part1.csv


In [4]:
data = pd.DataFrame(df["visits_by_each_hour"].to_list()).T

In [5]:
date_format = r'%Y-%m-%d'
start_time = datetime.strptime(START_DATE, date_format)
end_time = start_time + timedelta(days=TOTAL_DAYS)
time_span = pd.date_range(start_time, end_time, freq='1H', closed='left').to_numpy()
time_span

array(['2018-12-31T00:00:00.000000000', '2018-12-31T01:00:00.000000000',
       '2018-12-31T02:00:00.000000000', ...,
       '2020-02-03T21:00:00.000000000', '2020-02-03T22:00:00.000000000',
       '2020-02-03T23:00:00.000000000'], dtype='datetime64[ns]')

In [12]:
data = data.set_index(time_span)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2018-12-31 00:00:00,83,21,6,15,36,2,27,32,7,6,...,1,1,1,0,0,0,0,0,1,0
2018-12-31 01:00:00,49,23,5,18,9,1,5,6,6,9,...,0,0,1,0,0,0,0,0,0,0
2018-12-31 02:00:00,41,21,12,16,11,2,4,7,4,5,...,0,0,1,0,0,0,1,0,0,0
2018-12-31 03:00:00,60,14,4,13,20,3,10,24,3,9,...,1,0,0,0,2,0,0,0,0,0
2018-12-31 04:00:00,187,52,7,47,71,5,58,64,7,31,...,0,0,6,0,0,0,0,0,0,2


In [15]:
forecasts = data.shift(7, freq="1D")
forecasts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2019-01-07 00:00:00,83,21,6,15,36,2,27,32,7,6,...,1,1,1,0,0,0,0,0,1,0
2019-01-07 01:00:00,49,23,5,18,9,1,5,6,6,9,...,0,0,1,0,0,0,0,0,0,0
2019-01-07 02:00:00,41,21,12,16,11,2,4,7,4,5,...,0,0,1,0,0,0,1,0,0,0
2019-01-07 03:00:00,60,14,4,13,20,3,10,24,3,9,...,1,0,0,0,2,0,0,0,0,0
2019-01-07 04:00:00,187,52,7,47,71,5,58,64,7,31,...,0,0,6,0,0,0,0,0,0,2


In [18]:
train_days = int(TRAIN_RATIO * TOTAL_DAYS)
valid_days = int(TOTAL_DAYS * VALID_RATIO)
test_days = TOTAL_DAYS - train_days - valid_days
valid_dates = time_span[train_days*24 + WINDOW_SIZE:(train_days + valid_days)*24 - HORIZON + 1]
valid_dates

array(['2019-10-08T00:00:00.000000000', '2019-10-08T01:00:00.000000000',
       '2019-10-08T02:00:00.000000000', ...,
       '2019-12-25T16:00:00.000000000', '2019-12-25T17:00:00.000000000',
       '2019-12-25T18:00:00.000000000'], dtype='datetime64[ns]')

In [22]:
forecasts.loc[valid_dates]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2019-10-08 00:00:00,69,14,10,6,23,0,16,22,4,7,...,0,0,0,2,1,0,0,0,1,0
2019-10-08 01:00:00,25,3,9,3,8,2,6,5,10,6,...,0,0,1,0,0,0,1,0,0,0
2019-10-08 02:00:00,24,3,5,3,12,0,5,5,2,4,...,0,0,1,0,0,0,0,0,0,0
2019-10-08 03:00:00,78,35,7,33,13,0,8,17,5,11,...,0,0,0,0,0,0,0,0,1,0
2019-10-08 04:00:00,121,40,9,37,71,5,58,20,7,27,...,1,1,4,0,0,0,2,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-25 14:00:00,376,151,34,144,130,21,92,52,29,73,...,0,2,0,1,0,0,0,0,0,0
2019-12-25 15:00:00,375,174,52,169,136,27,103,53,25,64,...,0,6,0,0,0,0,0,0,1,0
2019-12-25 16:00:00,459,198,27,190,130,16,99,55,15,104,...,0,0,0,0,0,0,0,0,2,0
2019-12-25 17:00:00,292,118,22,110,70,17,54,24,10,82,...,0,0,0,0,2,0,0,0,0,0


In [29]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2018-12-31 00:00:00,83,21,6,15,36,2,27,32,7,6,...,1,1,1,0,0,0,0,0,1,0
2018-12-31 01:00:00,49,23,5,18,9,1,5,6,6,9,...,0,0,1,0,0,0,0,0,0,0
2018-12-31 02:00:00,41,21,12,16,11,2,4,7,4,5,...,0,0,1,0,0,0,1,0,0,0
2018-12-31 03:00:00,60,14,4,13,20,3,10,24,3,9,...,1,0,0,0,2,0,0,0,0,0
2018-12-31 04:00:00,187,52,7,47,71,5,58,64,7,31,...,0,0,6,0,0,0,0,0,0,2


In [32]:
historical_avg_df = data.shift(7, freq='1D')
historical_avg_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2019-01-07 00:00:00,83,21,6,15,36,2,27,32,7,6,...,1,1,1,0,0,0,0,0,1,0
2019-01-07 01:00:00,49,23,5,18,9,1,5,6,6,9,...,0,0,1,0,0,0,0,0,0,0
2019-01-07 02:00:00,41,21,12,16,11,2,4,7,4,5,...,0,0,1,0,0,0,1,0,0,0
2019-01-07 03:00:00,60,14,4,13,20,3,10,24,3,9,...,1,0,0,0,2,0,0,0,0,0
2019-01-07 04:00:00,187,52,7,47,71,5,58,64,7,31,...,0,0,6,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-02-10 19:00:00,306,156,82,153,131,86,89,26,32,28,...,5,1,13,5,4,0,5,5,9,1
2020-02-10 20:00:00,287,133,57,124,109,35,74,27,15,18,...,0,7,6,1,2,0,1,2,1,0
2020-02-10 21:00:00,220,87,34,84,86,10,59,40,17,20,...,2,0,1,3,1,0,3,4,2,0
2020-02-10 22:00:00,152,42,9,38,31,12,11,49,7,7,...,2,0,3,0,0,0,0,1,0,0


In [36]:
valid_dates

array(['2019-10-08T00:00:00.000000000', '2019-10-08T01:00:00.000000000',
       '2019-10-08T02:00:00.000000000', ...,
       '2019-12-25T16:00:00.000000000', '2019-12-25T17:00:00.000000000',
       '2019-12-25T18:00:00.000000000'], dtype='datetime64[ns]')

In [50]:
new_df = data.shift(7, freq='1D')
new_df.loc[valid_dates[:1]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
2019-10-08,69,14,10,6,23,0,16,22,4,7,...,0,0,0,2,1,0,0,0,1,0


In [85]:
num_weeks = 3
for timestamp in valid_dates:
    for week in range(1, num_weeks):
        new_df.loc[timestamp] += new_df.loc[timestamp - np.timedelta64(7*week, 'D')]
print((new_df/num_weeks).round().astype('int'))

                     0    1    2    3    4    5    6    7    8    9    ...  \
2019-01-07 00:00:00   28    7    2    5   12    1    9   11    2    2  ...   
2019-01-07 01:00:00   16    8    2    6    3    0    2    2    2    3  ...   
2019-01-07 02:00:00   14    7    4    5    4    1    1    2    1    2  ...   
2019-01-07 03:00:00   20    5    1    4    7    1    3    8    1    3  ...   
2019-01-07 04:00:00   62   17    2   16   24    2   19   21    2   10  ...   
...                  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
2020-02-10 19:00:00  102   52   27   51   44   29   30    9   11    9  ...   
2020-02-10 20:00:00   96   44   19   41   36   12   25    9    5    6  ...   
2020-02-10 21:00:00   73   29   11   28   29    3   20   13    6    7  ...   
2020-02-10 22:00:00   51   14    3   13   10    4    4   16    2    2  ...   
2020-02-10 23:00:00   30   11    5    9   15    1   10    7    2    2  ...   

                     390  391  392  393  394  395  396  397  39

In [48]:
timestamp = valid_dates[0]
print(timestamp)
data.loc[timestamp - np.timedelta64(7, 'D')]

2019-10-08T00:00:00.000000000


0      69
1      14
2      10
3       6
4      23
       ..
395     0
396     0
397     0
398     1
399     0
Name: 2019-10-01 00:00:00, Length: 400, dtype: int64